# Data Wrangling 

In [161]:
# import libraries 
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 

In [174]:
kashti = sns.load_dataset("titanic")
ks1 = kashti 
# # ks2 = kashti 

In [163]:
kashti.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [164]:
# simple operations (Math operations)
(kashti["age"]+12).head(10) 

0    34.0
1    50.0
2    38.0
3    47.0
4    47.0
5     NaN
6    66.0
7    14.0
8    39.0
9    26.0
Name: age, dtype: float64

In [165]:
(kashti["age"]*2).head(3) 

0    44.0
1    76.0
2    52.0
Name: age, dtype: float64

# Dealing with missing values 

- In a dat set missing values are either ? or N/A or NAN , or 0 or a blank cell.
- Jab kabhi data na ho kisi aik row main kisi b aik parameter ka 

> Steps:

 1- Koshsih kren dobara data collect kar len agar kahin ghalti hai.

2- missing value wala variable (coloumn) hi nikal den agr data per effect nahi hta ya simple row or data entry remove kr den .

3- Replace the missing values :

    - How?
        1. Average value of entire variable or similar data poit 
        2. frequency or MODE replacement 
        3. Replace based on other functions (Data sampler knows that)
        4. ML algorithm can also be used 
        5. Leave it like that 
    
    - Why?
        1. Its better beacuse no data is lost 
        2. Less accurate

In [166]:
# where exactly missing value are ?
kashti.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [167]:
# use a drop.na method 
print (kashti.shape)

(891, 15)


In [None]:
# removing missing value column 
# ks_clean= ks.drop (["deck"], axis= 1)
# ks_clean.head()

In [168]:
kashti.dropna(subset=["deck"], axis=0 , inplace = True)
kashti.shape

(203, 15)

In [169]:
kashti.isnull().sum()

survived        0
pclass          0
sex             0
age            19
sibsp           0
parch           0
fare            0
embarked        2
class           0
who             0
adult_male      0
deck            0
embark_town     2
alive           0
alone           0
dtype: int64

In [170]:
# removing na from whole dataframe
kashti = kashti.dropna()
kashti.isnull().sum()


survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [171]:
kashti.shape

(182, 15)

In [175]:
ks1.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

# Replacing missing values with the average of that column

In [176]:
# finding an average (mean)
mean = ks1["age"].mean()
mean

29.69911764705882

In [177]:
# replacing Nan with mean of the data (udating as well)
ks1["age"] = ks1["age"].replace(np.nan , mean)

In [178]:
ks1["age"].head(10)

0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
6    54.000000
7     2.000000
8    27.000000
9    14.000000
Name: age, dtype: float64

In [179]:
ks1.isnull().sum()

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

## Assignment 

In [180]:
mode = ks1["deck"].mode()[0]
mode

'C'

In [181]:
ks1['deck'].fillna(mode , inplace=True)
ks1['deck'].head(12)

0     C
1     C
2     C
3     C
4     C
5     C
6     E
7     C
8     C
9     C
10    G
11    C
Name: deck, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

In [182]:
ks1.dropna(subset=["embarked"], axis=0 , inplace = True)

In [183]:
ks1.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

some others 

```
cols = ["workclass", "native-country"]
df[cols]=df[cols].fillna(df.mode().iloc[0])
```
or we can use a mapping
```
ks1['deck'].map({"A" : np.nan})
```

# Data Formatting
- Data ko aik common standard per lana 
- Ensure data is consistent and understandable 
    - Easy to gather 
    - Easy to work with 
      - Faisalabad (FSD)
      - Lahore (LHR)
      - Islamabad (ISB)
      - karachi (KCH)
      - Peshawar (PEW)
      - converting g to kg or similar uni for all 
      - one standard unit in each column
      - ft != cm
  

In [184]:
# know the data type and convert it into the know one 
kashti.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [186]:
# use this method to convert datatype from one to another format 
kashti["survived"]= kashti["survived"].astype("float64")
kashti.dtypes

survived        float64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [190]:
# here we will convert age into days instead of years 
ks1["age"] = ks1["age"]*365
ks1["age"].head(8)

0     8030.000000
1    13870.000000
2     9490.000000
3    12775.000000
4    12775.000000
5    10840.177941
6    19710.000000
7      730.000000
Name: age, dtype: float64

In [191]:
# always rename afterwards 
ks1.rename(columns={"age": "age in days"}, inplace=True)
ks1.head()

Unnamed: 0,survived,pclass,sex,age in days,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0.0,3,male,8030.0,1,0,7.25,S,Third,man,True,C,Southampton,no,False
1,1.0,1,female,13870.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1.0,3,female,9490.0,0,0,7.925,S,Third,woman,False,C,Southampton,yes,True
3,1.0,1,female,12775.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0.0,3,male,12775.0,0,0,8.05,S,Third,man,True,C,Southampton,no,True


In [192]:
ks1["age in days"]= ks1["age in days"].astype("int64")
ks1.head()

Unnamed: 0,survived,pclass,sex,age in days,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0.0,3,male,8030,1,0,7.25,S,Third,man,True,C,Southampton,no,False
1,1.0,1,female,13870,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1.0,3,female,9490,0,0,7.925,S,Third,woman,False,C,Southampton,yes,True
3,1.0,1,female,12775,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0.0,3,male,12775,0,0,8.05,S,Third,man,True,C,Southampton,no,True


## Data Normailization 
- Uniform the data 
- Making sure they have same impact 
- AIk machli samundar me or aik jar main 
- Also for computational reasons 

In [193]:
kashti.head()

Unnamed: 0,survived,pclass,sex,age in days,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0.0,3,male,8030,1,0,7.25,S,Third,man,True,C,Southampton,no,False
1,1.0,1,female,13870,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1.0,3,female,9490,0,0,7.925,S,Third,woman,False,C,Southampton,yes,True
3,1.0,1,female,12775,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0.0,3,male,12775,0,0,8.05,S,Third,man,True,C,Southampton,no,True


In [195]:
ks4= kashti[["age in days","fare"]]
ks4.head()

Unnamed: 0,age in days,fare
0,8030,7.25
1,13870,71.2833
2,9490,7.925
3,12775,53.1
4,12775,8.05


- The above data is really in wide range and we need to normalize and hard to compare 
- Normalization change the values to the range of 0-1  (now both variable has similar influence on our models)

## Method of Normalization 

1. Simple feature scaling 
    - x(new)= x(old)/x(max )
2. Min-Max Method 
3. Z-score (standard score) -3 to +3 
4. Log transformation 

In [196]:
# simple feature scalling 
ks4["fare"]=  ks4["fare"]/ks4["fare"].max()
ks4["age in days"]=  ks4["age in days"]/ks4["age in days"].max()
ks4.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks4["fare"]=  ks4["fare"]/ks4["fare"].max()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks4["age in days"]=  ks4["age in days"]/ks4["age in days"].max()


Unnamed: 0,age in days,fare
0,0.275,0.014151
1,0.475,0.139136
2,0.325,0.015469
3,0.4375,0.103644
4,0.4375,0.015713


In [197]:
# Min- Max method 
# x.new = (x.old - x.min) / (x.max- x.min)
ks4["fare"] = (ks4["fare"]-ks4["fare"].min())/ (ks4["fare"].max()-ks4["fare"].min())
ks4.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks4["fare"] = (ks4["fare"]-ks4["fare"].min())/ (ks4["fare"].max()-ks4["fare"].min())


Unnamed: 0,age in days,fare
0,0.275,0.014151
1,0.475,0.139136
2,0.325,0.015469
3,0.4375,0.103644
4,0.4375,0.015713


In [198]:
# z-score method 
# x.new = (x.old -x.mean) / x.std
ks4["fare"]  =(ks4["fare"]- ks4["fare"].mean()) / ks4["fare"].std()
ks4.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ks4["fare"]  =(ks4["fare"]- ks4["fare"].mean()) / ks4["fare"].std()


Unnamed: 0,age in days,fare
0,0.275,-0.499958
1,0.475,0.788503
2,0.325,-0.486376
3,0.4375,0.422623
4,0.4375,-0.483861


In [200]:
ks =sns.load_dataset("titanic")
ks["fare"].head() 

0     7.2500
1    71.2833
2     7.9250
3    53.1000
4     8.0500
Name: fare, dtype: float64

In [201]:
# log transfromation
# x.new = np.log(x.old) 
ks["fare"] = np.log(ks["fare"])
ks["fare"].head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


0    1.981001
1    4.266662
2    2.070022
3    3.972177
4    2.085672
Name: fare, dtype: float64

## Binning 
- Grouping of values into smaller number of vvalues (bins)
- Convert numeric into categories ( jawan, achay , booray ) or 1- 16, 17-40 etc
- to have a better understanding of groups 
    - low vs mid vs high price 

In [252]:
kashti.head()

Unnamed: 0,survived,pclass,sex,age in days,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,Binned
0,0.0,3,male,bachay,1,0,7.25,S,Third,man,True,C,Southampton,no,False,bachay
1,1.0,1,female,jawan,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,jawan
2,1.0,3,female,bachay,0,0,7.925,S,Third,woman,False,C,Southampton,yes,True,bachay
3,1.0,1,female,jawan,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,jawan
4,0.0,3,male,jawan,0,0,8.05,S,Third,man,True,C,Southampton,no,True,jawan


In [221]:
kashti["age in days"].shape

(889,)

In [256]:
# Creating bins 
bins = np.linspace(min(ks1["age in days"]), max(ks1["age in days"]), 4)
bins

array([  153.        ,  9835.33333333, 19517.66666667, 29200.        ])

In [257]:
age_groups = ["bachay", "jawan", "Boorhay"]
ks1["age in days"] = pd.cut(ks1["age in days"], bins, labels= age_groups ,include_lowest=True )
ks1["age in days"]


0      bachay
1       jawan
2      bachay
3       jawan
4       jawan
        ...  
886     jawan
887    bachay
888     jawan
889    bachay
890     jawan
Name: age in days, Length: 889, dtype: category
Categories (3, object): ['bachay' < 'jawan' < 'Boorhay']

In [248]:
kashti.head()

Unnamed: 0,survived,pclass,sex,age in days,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,Binned
0,0.0,3,male,bachay,1,0,7.25,S,Third,man,True,C,Southampton,no,False,bachay
1,1.0,1,female,jawan,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,jawan
2,1.0,3,female,bachay,0,0,7.925,S,Third,woman,False,C,Southampton,yes,True,bachay
3,1.0,1,female,jawan,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,jawan
4,0.0,3,male,jawan,0,0,8.05,S,Third,man,True,C,Southampton,no,True,jawan


### Converting catagories into dummies
- easy to use for computation 
- male, Femal (0,1)

In [215]:
ks1.head()

Unnamed: 0,survived,pclass,age in days,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,female,male
0,0.0,3,8030,1,0,7.25,S,Third,man,True,C,Southampton,no,False,0,1
1,1.0,1,13870,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,1,0
2,1.0,3,9490,0,0,7.925,S,Third,woman,False,C,Southampton,yes,True,1,0
3,1.0,1,12775,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,1,0
4,0.0,3,12775,0,0,8.05,S,Third,man,True,C,Southampton,no,True,0,1


In [216]:
ks5 = sns.load_dataset("titanic")

In [217]:
data = pd.get_dummies(ks5["sex"])

In [218]:
# Drop column 
ks5 = ks5.drop('sex',axis = 1)
# Join 
ks5 = ks5.join(data)
ks5.head() 

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,female,male
0,0,3,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,0,1
1,1,1,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,1,0
2,1,3,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,1,0
3,1,1,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,1,0
4,0,3,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0,1
