# Data Wrangling

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [32]:


ship = sns.load_dataset('titanic')
ship1 = ship
ship2 = ship

In [33]:
ship.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [34]:
# simple operation  to add some value in column
(ship["age"]+10).head(5)

0    32.0
1    48.0
2    36.0
3    45.0
4    45.0
Name: age, dtype: float64

# Dealing With missing values
- remove missing values
- replace with mean or mode etc
- try to fill missing values for good results.

In [35]:
# how to check missing values

ship.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [36]:
ship.dropna(subset=['deck'],axis=0,inplace=True)
ship

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0,1,male,33.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [37]:
ship = ship.dropna().isnull().sum()

In [38]:
ship.isnull().sum()
ship

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [39]:
ship1.isnull().sum()

survived        0
pclass          0
sex             0
age            19
sibsp           0
parch           0
fare            0
embarked        2
class           0
who             0
adult_male      0
deck            0
embark_town     2
alive           0
alone           0
dtype: int64

In [40]:
# to fill missing value of age we are using mean dunction

mean=ship1['age'].mean()
mean

35.77945652173913

In [41]:
ship1['age']=ship1['age'].replace(np.nan,mean)

In [42]:
ship1.isnull().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       2
class          0
who            0
adult_male     0
deck           0
embark_town    2
alive          0
alone          0
dtype: int64

# Data Formatting
- Ensure data is consistent and Understandable

In [43]:
ship1.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [44]:
# Covert data type into another
ship1['survived']=ship1.survived.astype("float64")
ship1.dtypes

survived        float64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [45]:
# to covert age into days of person
ship1['age']=ship1['age']*365
ship1

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1.0,1,female,13870.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1.0,1,female,12775.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
6,0.0,1,male,19710.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1.0,3,female,1460.0,1,1,16.7000,S,Third,child,False,G,Southampton,yes,False
11,1.0,1,female,21170.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1.0,1,female,17155.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False
872,0.0,1,male,12045.0,0,0,5.0000,S,First,man,True,B,Southampton,no,True
879,1.0,1,female,20440.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
887,1.0,1,female,6935.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [46]:
# change the name of column

ship1.rename(columns={"age":"age in days"},inplace=True)
ship1.head()

Unnamed: 0,survived,pclass,sex,age in days,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1.0,1,female,13870.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1.0,1,female,12775.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0.0,1,male,19710.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1.0,3,female,1460.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
11,1.0,1,female,21170.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


# Data Normalization

- uniform the Data
- they have same impact
- Make a good comparison

In [47]:
ship4 = ship1[['age in days','fare']]
ship4

Unnamed: 0,age in days,fare
1,13870.0,71.2833
3,12775.0,53.1000
6,19710.0,51.8625
10,1460.0,16.7000
11,21170.0,26.5500
...,...,...
871,17155.0,52.5542
872,12045.0,5.0000
879,20440.0,83.1583
887,6935.0,30.0000


# Method of Normalization

1. Simple feature scaling
    - x(new)=x(old)/x(max)
2. Min-Max method
3. Z-score(standard score) (-3 t0 +3)
4. Log transformation)

In [48]:
# simple scaling
ship4['fare'] = ship4['fare']/ship4['fare'].max()
ship4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ship4['fare'] = ship4['fare']/ship4['fare'].max()


Unnamed: 0,age in days,fare
1,13870.0,0.139136
3,12775.0,0.103644
6,19710.0,0.101229
10,1460.0,0.032596
11,21170.0,0.051822
...,...,...
871,17155.0,0.102579
872,12045.0,0.009759
879,20440.0,0.162314
887,6935.0,0.058556


In [49]:
# min-max method

ship4['fare'] = (ship4['fare']-ship4['fare'].min()) / (ship4['fare'].max()-ship4['fare'].min())
ship4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ship4['fare'] = (ship4['fare']-ship4['fare'].min()) / (ship4['fare'].max()-ship4['fare'].min())


Unnamed: 0,age in days,fare
1,13870.0,0.139136
3,12775.0,0.103644
6,19710.0,0.101229
10,1460.0,0.032596
11,21170.0,0.051822
...,...,...
871,17155.0,0.102579
872,12045.0,0.009759
879,20440.0,0.162314
887,6935.0,0.058556


In [50]:
# z-score
ship4['fare'] = (ship4['fare']-ship4['fare'].mean()) / ship4['fare'].std()
ship4.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ship4['fare'] = (ship4['fare']-ship4['fare'].mean()) / ship4['fare'].std()


Unnamed: 0,age in days,fare
1,13870.0,-0.067879
3,12775.0,-0.311883
6,19710.0,-0.328489
10,1460.0,-0.800339
11,21170.0,-0.668161


In [51]:
ship4['fare'] = np.log(ship4['fare'])
ship4.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ship4['fare'] = np.log(ship4['fare'])


Unnamed: 0,age in days,fare
1,13870.0,
3,12775.0,
6,19710.0,
10,1460.0,
11,21170.0,


# Binning
- make categories like age is cahnged into elder younger and childrens

# converting categories into dummy values
- male female to 0 and 1


In [52]:
pd.get_dummies(ship1['sex'])

Unnamed: 0,female,male
1,1,0
3,1,0
6,0,1
10,1,0
11,1,0
...,...,...
871,1,0
872,0,1
879,1,0
887,1,0
