In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import missingno as msno
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,LabelEncoder  

import os

In [4]:
df = pd.read_csv('./dataset/preprocessed_titanic.csv')
df.head(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0.0,3.0,male,22.0,1.0,0.0,7.25,S,Third,man,True,C,Southampton,no,False
1,1.0,1.0,female,38.0,1.0,0.0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1.0,3.0,female,26.0,0.0,0.0,7.925,S,Third,woman,False,C,Southampton,yes,True
3,1.0,1.0,female,35.0,1.0,0.0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0.0,3.0,male,35.0,0.0,0.0,8.05,S,Third,man,True,C,Southampton,no,True


In [None]:
# making family size feature
df['family_size'] = df['sibsp'] + df['parch'] +1
# is alone feature
df['is_alone'] = (df['family_size'] == 1).astype(int)

# binning age and fare
df['age_group'] =pd.cut(df['age'], bins=[0, 12, 18, 60, 100], labels=['Child', 'Teen', 'Adult', 'Senior'])
df['fare_group'] = pd.qcut(df['fare'], q=3, labels=['Low Fare', 'Medium Fare', 'High Fare'])
df.head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,family_size,is_alone,age_group,fare_group
0,0.0,3.0,male,22.0,1.0,0.0,7.25,S,Third,man,True,C,Southampton,no,False,1.0,0,Adult,Low Fare
1,1.0,1.0,female,38.0,1.0,0.0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,1.0,0,Adult,High Fare
2,1.0,3.0,female,26.0,0.0,0.0,7.925,S,Third,woman,False,C,Southampton,yes,True,0.0,1,Adult,Low Fare
3,1.0,1.0,female,35.0,1.0,0.0,53.1,S,First,woman,False,C,Southampton,yes,False,1.0,0,Adult,High Fare
4,0.0,3.0,male,35.0,0.0,0.0,8.05,S,Third,man,True,C,Southampton,no,True,0.0,1,Adult,Low Fare
5,0.0,3.0,male,29.699118,0.0,0.0,8.4583,Q,Third,man,True,C,Queenstown,no,True,0.0,1,Adult,Low Fare
6,0.0,1.0,male,54.0,0.0,0.0,51.8625,S,First,man,True,E,Southampton,no,True,0.0,1,Adult,High Fare
7,0.0,3.0,male,2.5,3.0,1.0,21.075,S,Third,child,False,C,Southampton,no,False,4.0,0,Child,Medium Fare
8,1.0,3.0,female,27.0,0.0,2.0,11.1333,S,Third,woman,False,C,Southampton,yes,False,2.0,0,Adult,Medium Fare
9,1.0,2.0,female,14.0,1.0,0.0,30.0708,C,Second,child,False,C,Cherbourg,yes,False,1.0,0,Teen,High Fare


In [7]:
# Encoding
# One-hot encoding for nominal features
df = pd.get_dummies(df, columns=['sex', 'embarked', 'who', 'embark_town'], drop_first=True)

# Label encoding for ordinal features
le = LabelEncoder()
df['age_group'] = le.fit_transform(df['age_group'])
df['fare_group'] = le.fit_transform(df['fare_group'])

# Display the transformed DataFrame
df.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,class,adult_male,deck,alive,...,is_alone,age_group,fare_group,sex_male,embarked_Q,embarked_S,who_man,who_woman,embark_town_Queenstown,embark_town_Southampton
0,0.0,3.0,22.0,1.0,0.0,7.25,Third,True,C,no,...,0,0,1,True,False,True,True,False,False,True
1,1.0,1.0,38.0,1.0,0.0,71.2833,First,False,C,yes,...,0,0,0,False,False,False,False,True,False,False
2,1.0,3.0,26.0,0.0,0.0,7.925,Third,False,C,yes,...,1,0,1,False,False,True,False,True,False,True
3,1.0,1.0,35.0,1.0,0.0,53.1,First,False,C,yes,...,0,0,0,False,False,True,False,True,False,True
4,0.0,3.0,35.0,0.0,0.0,8.05,Third,True,C,no,...,1,0,1,True,False,True,True,False,False,True
