# 1. Handling Missing Data in Titanic Dataset

Importing the libraries

In [2]:
import pandas as pd

Loading the dataset

In [3]:
# Read the Titanic passenger records from the project-relative CSV into a DataFrame.
titanic = pd.read_csv(filepath_or_buffer='./datasets/titanic.csv')

Display the first few rows of the dataset

In [12]:
# Preview the top five rows (head's default count) as the cell's displayed result.
titanic.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


Identifying missing values

In [5]:
# Tally null entries per column: isna() flags each missing cell, sum() counts them column-wise.
missing_values = titanic.isna().sum()
print("Missing values in each column:")
print(missing_values)

Missing values in each column:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


Strategy 1: Mean/Median Imputation for numerical columns (the mean is used for `Age` here)

In [6]:
# Fill missing ages with the column mean (Series.mean skips NaN by default).
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: that chained in-place form
# emits a FutureWarning on pandas 2.x and no longer updates the DataFrame once
# Copy-on-Write is enforced, which would silently leave the NaNs in place.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())

Strategy 2: Mode Imputation for categorical columns

In [7]:
# Fill missing embarkation ports with the most frequent value.
# mode() returns a Series (ties are possible), so [0] picks its first entry.
# Assign the result back rather than using a chained fillna(..., inplace=True),
# which warns on pandas 2.x and has no effect on the DataFrame under
# Copy-on-Write.
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])

Strategy 3: Dropping rows/columns

In [8]:
# Drop the 'Cabin' column entirely (too many missing values to impute).
# Reassign instead of mutating in place so the frame's lineage stays explicit.
titanic = titanic.drop(columns='Cabin')

# Drop any rows that still have a missing value in one of the remaining columns.
titanic = titanic.dropna()

Display the dataset after handling missing values

In [9]:
# Show the first rows of the cleaned frame as plain text, preceded by a blank line.
print("\nDataset after handling missing values:")
print(titanic.head())


Dataset after handling missing values:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Embarked  
0      0         A/5 21171   7.2500        S  
1      0          PC 17599  71.2833        C  
2      0  STON/O2. 3101282   7.9250        S  
3      0            113803  53.1000        S  
4      0            373450   8.0500        S 

In [10]:
# Re-count nulls per column so the effect of the cleanup steps can be checked.
print("\nMissing values after handling:")
print(titanic.isna().sum())


Missing values after handling:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64
