In [1]:
import pandas as pd
import math

In [2]:
# Load the raw passenger table from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer='titanic.csv')

In [3]:
# Show the column labels of the loaded frame as a plain list.
df.columns.tolist()

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

In [4]:
# Keep only the columns used downstream; PassengerId, Ticket, Fare and Cabin
# are discarded here.
columns_to_keep = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked', 'Survived']
df = df[columns_to_keep]

In [5]:
# Mean Age per passenger class, as a {Pclass: mean_age} dictionary.
mapping_dict = df['Age'].groupby(df['Pclass']).mean().to_dict()

In [6]:
# Inspect the per-class mean ages computed above.
mapping_dict

{1: 38.233440860215055, 2: 29.87763005780347, 3: 25.14061971830986}

In [7]:
# Sanity check: look up the mean age for first class (Pclass == 1).
mapping_dict.get(1)

38.233440860215055

In [8]:
def fill_age(row, age_by_class=None):
    """Return the row's age, substituting a per-class value when it is missing.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['Age']`` and ``row['Pclass']`` (for example a
        DataFrame row passed in by ``df.apply(fill_age, axis=1)``).
    age_by_class : dict, optional
        Maps a ``Pclass`` value to the replacement age. When omitted, the
        notebook-level ``mapping_dict`` is used.

    Returns
    -------
    ``row['Age']`` when it is present; otherwise the value looked up for
    ``row['Pclass']`` (``None`` if that class has no entry in the mapping).
    """
    if age_by_class is None:
        # Resolved at call time so the function follows the current mapping_dict.
        age_by_class = mapping_dict
    age = row['Age']
    # pd.isna also recognises None / pd.NA, which math.isnan rejects with a
    # TypeError.
    if pd.isna(age):
        return age_by_class.get(row['Pclass'])
    return age

In [9]:
# Vectorised imputation: look up each passenger's class mean and use it only
# where Age is missing. This yields the same values as applying fill_age row
# by row, without one Python-level call per row.
df['Age'] = df['Age'].fillna(df['Pclass'].map(mapping_dict))

In [10]:
# Check column dtypes and non-null counts after the Age imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Name      891 non-null    object 
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Embarked  889 non-null    object 
 7   Survived  891 non-null    int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 55.8+ KB


In [11]:
# Name is not used as a feature. Rebind instead of mutating in place:
# inplace=True offers no benefit and hides the state change from readers.
df = df.drop(columns='Name')

In [12]:
# Display the working frame at this point in the notebook.
df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked,Survived
0,3,male,22.00000,1,0,S,0
1,1,female,38.00000,1,0,C,1
2,3,female,26.00000,0,0,S,1
3,1,female,35.00000,1,0,S,1
4,3,male,35.00000,0,0,S,0
...,...,...,...,...,...,...,...
886,2,male,27.00000,0,0,S,0
887,1,female,19.00000,0,0,S,1
888,3,female,25.14062,1,2,S,0
889,1,male,26.00000,0,0,C,1


In [14]:
# Preview the indicator columns produced for Sex. dtype=int keeps the 0/1
# display on every pandas version (pandas >= 2.0 defaults to booleans).
pd.get_dummies(df['Sex'], dtype=int)

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1
...,...,...
886,0,1
887,1,0
888,1,0
889,0,1


In [16]:
# Encode Sex as a single indicator: drop_first removes the first category, so
# only the remaining column is kept. Despite its name, the result is a
# DataFrame. dtype=int pins 0/1 output regardless of pandas version
# (pandas >= 2.0 would otherwise produce True/False).
gender_series = pd.get_dummies(df['Sex'], drop_first=True, dtype=int)

In [17]:
# Embarked contains missing values. get_dummies would encode those rows as all
# zeros, which after drop_first is indistinguishable from the dropped first
# category, so impute the most frequent port before encoding.
# NOTE(review): mode imputation is a modelling choice -- confirm it is wanted.
embarked_filled = df['Embarked'].fillna(df['Embarked'].mode()[0])
# dtype=int pins 0/1 output regardless of pandas version.
embarked_series = pd.get_dummies(embarked_filled, drop_first=True, dtype=int)

In [19]:
# Append the encoded indicator columns alongside the existing columns.
frames_to_join = [df, gender_series, embarked_series]
df = pd.concat(frames_to_join, axis='columns')

In [21]:
# The original categorical columns are no longer needed once their indicator
# columns have been added. Rebind rather than mutate with inplace=True.
df = df.drop(columns=['Sex', 'Embarked'])

In [28]:
# Fix the output column order, with Survived as the last column.
final_columns = ['Pclass', 'Age', 'SibSp', 'Parch', 'male', 'Q', 'S', 'Survived']
final = df[final_columns]

In [29]:
# Display the frame that will be exported.
final

Unnamed: 0,Pclass,Age,SibSp,Parch,male,Q,S,Survived
0,3,22.00000,1,0,1,0,1,0
1,1,38.00000,1,0,0,0,0,1
2,3,26.00000,0,0,0,0,1,1
3,1,35.00000,1,0,0,0,1,1
4,3,35.00000,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...
886,2,27.00000,0,0,1,0,1,0
887,1,19.00000,0,0,0,0,1,1
888,3,25.14062,1,2,0,0,1,0
889,1,26.00000,0,0,1,0,0,1


In [30]:
# Write the cleaned table to disk without the row index.
final.to_csv(path_or_buf='cleaned_data.csv', index=False)