In [106]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [107]:
# import data
df = sns.load_dataset("titanic")

In [108]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


# How to know if the dataset has missing values?

In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [110]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

# What are the ways to deal with missing data?

# 1. Deleting the column with missing data

The deck column has 688 null values out of the total 891 datapoints. So more than half of the values are null and hence we can simply choose to delete the column.

In [111]:
df = df.drop(['deck'],axis=1)
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
embark_town      2
alive            0
alone            0
dtype: int64

# 2. Deleting the row with missing data

In this method we are deleting rows which has at least one NULL value. This is not the best practice because of the fact that data is information.

In [112]:
# Deletes the rows which has atleast one null value

updated_df = df.dropna(axis=0)
updated_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     712 non-null    int64   
 1   pclass       712 non-null    int64   
 2   sex          712 non-null    object  
 3   age          712 non-null    float64 
 4   sibsp        712 non-null    int64   
 5   parch        712 non-null    int64   
 6   fare         712 non-null    float64 
 7   embarked     712 non-null    object  
 8   class        712 non-null    category
 9   who          712 non-null    object  
 10  adult_male   712 non-null    bool    
 11  embark_town  712 non-null    object  
 12  alive        712 non-null    object  
 13  alone        712 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(5)
memory usage: 69.0+ KB


# 3. Imputing missing values with mean/median

Columns in the dataset which are having numeric continuous values can be replaced with the mean, median, or mode of remaining values in the column. This method can prevent the loss of data compared to the earlier method

In [113]:
df['age'] = df['age'].fillna(df['age'].mean())

In [114]:
df['age'].isnull().sum()

np.int64(0)

# 3.1 Imputing missing values with mean/median of group

In [115]:
df2 = sns.load_dataset('titanic')

In [116]:
#Mean
df2['age'] = df2['age'].fillna(df.groupby('class')['age'].transform('mean'))
#Median
#df['age'] = df['age'].fillna(df.groupby('class')['age'].transform('median'))

  df2['age'] = df2['age'].fillna(df.groupby('class')['age'].transform('mean'))


In [117]:
df2['age'].isnull().sum()

np.int64(0)

# 4. Imputation method for categorical columns

When missing values is from categorical columns (string or numerical) then the missing values can be replaced with the most frequent category.

In [118]:
df3 = sns.load_dataset('titanic')
df3['deck'].value_counts()

deck
C    59
B    47
D    33
E    32
A    15
F    13
G     4
Name: count, dtype: int64

In [119]:
#creating a new category 'H' as number of missing values is very big
df3['deck'] = df3['deck'].cat.add_categories(['H'])

In [120]:
df3['deck'].value_counts()

deck
C    59
B    47
D    33
E    32
A    15
F    13
G     4
H     0
Name: count, dtype: int64

In [121]:
df3['deck'] = df3['deck'].fillna('H')

In [122]:
df3['deck'].value_counts()

deck
H    688
C     59
B     47
D     33
E     32
A     15
F     13
G      4
Name: count, dtype: int64

# 5. Forward Fill and Backward Fill

Forward fill (ffill) and backward fill (bfill) are methods used to fill missing values by carrying forward the last observed non-missing value (for ffill) or by carrying backward the next observed non-missing value (for bfill). These methods are particularly useful for time-series data.

In [123]:
# Create a sample DataFrame
data = {
    'A': [1, 2, np.nan, 4, 5, np.nan, 7],
    'B': ['a', 'b', 'c', np.nan, 'e', 'f', 'g'],
    'C': [np.nan, 1, 2, 3, np.nan, 5, 6]
}

df = pd.DataFrame(data)

In [124]:
print("Original DataFrame:")
print(df)

Original DataFrame:
     A    B    C
0  1.0    a  NaN
1  2.0    b  1.0
2  NaN    c  2.0
3  4.0  NaN  3.0
4  5.0    e  NaN
5  NaN    f  5.0
6  7.0    g  6.0


In [125]:
# Forward fill missing values in column 'A'
df['A'] = df['A'].ffill()

In [126]:
print("\nDataFrame after forward filling missing values in column 'A':")
print(df)


DataFrame after forward filling missing values in column 'A':
     A    B    C
0  1.0    a  NaN
1  2.0    b  1.0
2  2.0    c  2.0
3  4.0  NaN  3.0
4  5.0    e  NaN
5  5.0    f  5.0
6  7.0    g  6.0


In [128]:
# Backward fill missing values in column 'C'
df['C'] = df['C'].bfill()

In [129]:
print("\nDataFrame after backward filling missing values in column 'C':")
print(df)


DataFrame after backward filling missing values in column 'C':
     A    B    C
0  1.0    a  1.0
1  2.0    b  1.0
2  2.0    c  2.0
3  4.0  NaN  3.0
4  5.0    e  5.0
5  5.0    f  5.0
6  7.0    g  6.0
