In [98]:
import pandas as pd
import numpy as np
import seaborn as sns

In [99]:
kashti = sns.load_dataset('titanic')

In [100]:
ks1 = kashti.copy()
ks2 = kashti.copy()

In [101]:
kashti.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### Simple Operations (Math operator)

In [102]:
# Simply add 1 in the whole series of age
(kashti['age']+1).head()

0    23.0
1    39.0
2    27.0
3    36.0
4    36.0
Name: age, dtype: float64

# Dealing with missing values
- in a data set missing values are either ? or N/A or NaN, or 0 or a blank cell.

> Steps:
1. Try to recollect the data if possible to remove the error or missing values
2. If the column with missing values is not important in data, remove the whole column
3. Replace the missing values:
   1. How to replace the missing values?
      - Average value of entire variable or similar data point
      - Frequency or MODE replacement
      - Replace based on other functions (Data sampler knows that)
      - ML algorithm can also be used to figure out the missing values
      - Leave it as it is

   2. Why to replace the missing values?
      - It's better to have less lost data and more valueable data
      - Data with missing values is less accurate.


In [103]:
# Where exactly missing values are in our DataFrame?
# DF.isnull().sum()
kashti.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [104]:
# Dropping missing values

# check the shape of data before removing missing values
print(kashti.shape)

# drop rows in deck column with missing values; axis=0 means drop rows
kashti.dropna(subset=['deck'], axis=0, inplace=True)
print(kashti.shape)

(891, 15)
(203, 15)


In [105]:
# dropping a whole column
ks1.copy().head().drop(columns=['deck']).columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
       'alone'],
      dtype='object')

In [106]:
kashti.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
11,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True


In [107]:
kashti.isnull().sum()

survived        0
pclass          0
sex             0
age            19
sibsp           0
parch           0
fare            0
embarked        2
class           0
who             0
adult_male      0
deck            0
embark_town     2
alive           0
alone           0
dtype: int64

In [108]:
# dropping all na values
# caution: this may dramatically reduce the data size if called with no arguments
# as it will remove all the rows containing any null value in any column of the data set
# DF.dropna()

kashti.dropna(inplace=True)
kashti.isnull().sum()


survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

In [109]:
# see, dropna can reduced data from 891 rows to 182 rows in this particular data set
kashti.shape

(182, 15)

In [110]:
mean_age = ks1['age'].mean()
mean_age

29.69911764705882

In [111]:
# ks2.copy()['age'].replace(np.nan, mean_age)
ks2.loc[ks2['age'] == np.nan]  # this returns zero records, then how the replace command is working?

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone


In [135]:
ks1['age'].replace(np.nan, mean_age)  # this actually replaces the nan but the above shows zero records, how is it possible?
ks1['age'].isnull().sum()

0

In [112]:
# replacing values of age column with average value of the same column
ks1['age'] = ks1['age'].replace(np.nan, mean_age)

In [113]:
# See age null values has been replaced with mean value, so null values are zero now
ks1.isnull().sum()


survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [114]:
# replacing with mean value, saves us from dropping 177 records of data
ks1.shape

(891, 15)

In [115]:
deck_mode = ks1['deck'].mode().values[0]
deck_mode

'C'

In [147]:
# since deck value is a string value, so we can't compute its mean
# we will replace it with mode
# ks1['deck'] = ks2['deck']
# ks1['deck'].isnull()
# ks2.loc[ks2['age'].isnull()]
# ks1.loc[ks1['deck'].isnull()]
# ks1['deck'].value_counts()
# replace is not working in 'deck' column
ks1['deck'].replace(to_replace=np.nan, value=deck_mode)
# ks1['deck'][0] = 'C'
# ks1['deck'][0]
# ks1['deck']

0      NaN
1        C
2      NaN
3        C
4      NaN
      ... 
886    NaN
887      B
888    NaN
889      C
890    NaN
Name: deck, Length: 891, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

In [117]:
print(ks1.isnull().sum())
print(ks1.shape)

survived         0
pclass           0
sex              0
age              0
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64
(891, 15)


as shown in the above output, now we have less missing values with more data i.e. 891 rows
lets see embarked and embark town now

In [118]:
# ks1.loc[:, ['embarked', 'embark_town']]
ks1.loc[ks1['embarked'].isnull()].loc[:, ['embarked', 'embark_town']]

Unnamed: 0,embarked,embark_town
61,,
829,,


In [119]:
# embarked and embark_town are also string, so lets replace them with mode as well
ks1.embark_town

0      Southampton
1        Cherbourg
2      Southampton
3      Southampton
4      Southampton
          ...     
886    Southampton
887    Southampton
888    Southampton
889      Cherbourg
890     Queenstown
Name: embark_town, Length: 891, dtype: object

In [120]:
# ks1.embarked.mode().values[0]
ks1.embarked = ks1.embarked.mode().values[0]