In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

In [5]:
# Load the preprocessed Titanic CSV (path is relative to the notebook's working
# directory) and preview the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='./dataset/preprocessed_titanic.csv')
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0.0,3.0,male,22.0,1.0,0.0,7.25,S,Third,man,True,C,Southampton,no,False
1,1.0,1.0,female,38.0,1.0,0.0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1.0,3.0,female,26.0,0.0,0.0,7.925,S,Third,woman,False,C,Southampton,yes,True
3,1.0,1.0,female,35.0,1.0,0.0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0.0,3.0,male,35.0,0.0,0.0,8.05,S,Third,man,True,C,Southampton,no,True


In [6]:
# Drop columns that are not used as model inputs.
# - 'deck', 'embark_town', 'class', 'who', 'adult_male', 'alive': removed as
#   before. In the preview several of these restate columns that are kept
#   (e.g. 'class' vs 'pclass', 'alive' vs 'survived', 'embark_town' vs
#   'embarked') -- NOTE(review): confirm on the full data.
# - 'Unnamed: 0': the preview shows it counting 0, 1, 2, ... i.e. a row index
#   that was written into the CSV. It carries no information and must not
#   reach the feature matrix.
unused_columns = [
    'deck', 'embark_town', 'class', 'who', 'adult_male', 'alive',
    'Unnamed: 0',
]

# errors='ignore' keeps this cell re-runnable: `df` is overwritten with the
# result, so on a second run the columns are already gone and a strict drop
# would raise KeyError.
df = df.drop(columns=unused_columns, errors='ignore')

In [8]:
# Feature engineering.
#
# Missing values are imputed FIRST so that every derived feature is computed
# from complete columns. Deriving `age_group` / `fare_per_person` before the
# fill (as this cell previously did) leaves them NaN on exactly the rows whose
# `age` / `fare` were missing, even though those source values get filled.
#
# `assign` evaluates its keyword arguments in order, so each lambda sees the
# columns produced by the entries above it.
df = df.assign(
    # --- imputation: median for numeric columns, most frequent for embarked
    age=lambda d: d['age'].fillna(d['age'].median()),
    fare=lambda d: d['fare'].fillna(d['fare'].median()),
    embarked=lambda d: d['embarked'].fillna(d['embarked'].mode()[0]),
    # --- derived features
    # sibsp + parch + 1 (the +1 presumably counts the passenger themself)
    family_size=lambda d: d['sibsp'] + d['parch'] + 1,
    # 1 when the family size is exactly one, else 0
    is_alone=lambda d: (d['family_size'] == 1).astype(int),
    # Right-closed age bands: (0, 12], (12, 18], (18, 60], (60, 100]
    age_group=lambda d: pd.cut(
        d['age'],
        bins=[0, 12, 18, 60, 100],
        labels=['Child', 'Teen', 'Adult', 'Senior'],
    ),
    # Fare split evenly across the family members counted above
    fare_per_person=lambda d: d['fare'] / d['family_size'],
)

In [15]:
# Column groups used by the preprocessing cells below.

# Continuous columns: median-imputed and standardised.
numerical_features = [
    'age',
    'fare',
    'family_size',
    'fare_per_person',
]

# Discrete columns: most-frequent-imputed and one-hot encoded.
categorical_features = [
    'embarked',
    'sex',
    'age_group',
]

# Flag columns that are kept without any transformation.
binary_features = [
    'alone',
    'is_alone',
]

In [16]:
# Numerical columns: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' makes categories that were not seen during
# fit encode as all zeros instead of raising at transform time.
# The step name 'onehot' is looked up again when the feature names are built.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)
# Encode `sex` as a binary flag: male -> 1, female -> 0.
#
# Values that are not keys of the mapping (including values that are already
# 0/1 and missing values) are returned unchanged. A plain `.map(dict)` turns
# every unmapped value into NaN, so re-running this cell would wipe the whole
# column; this form makes a re-run a no-op.
#
# NOTE(review): `sex` is also listed in `categorical_features`, so the encoded
# 0/1 column is one-hot encoded again downstream (yielding `sex_0` / `sex_1`).
# Decide whether it should be a passthrough binary feature or a categorical.
sex_codes = {'male': 1, 'female': 0}
df['sex'] = df['sex'].map(lambda value: sex_codes.get(value, value))

In [20]:
# Combine the per-group transformers into one preprocessing step.
#
# Output column order is: numerical features, one-hot encoded categorical
# features, then the binary flags unchanged.
#
# The binary flags are routed through an explicit 'passthrough' entry and every
# other column is dropped. With remainder='passthrough' all unlisted columns of
# `df` (for example the target `survived`, `pclass`, `sibsp`, `parch`) would be
# appended to the output as well, which leaks the target into the feature
# matrix and makes the output wider than
# numerical + one-hot + binary feature names.
# To keep another column, add it to one of the three feature lists.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
        ('bin', 'passthrough', binary_features),
    ],
    remainder='drop',
)

In [None]:
# Fit the preprocessor on the full frame and transform it in one pass.
processed_data = preprocessor.fit_transform(df)

# ColumnTransformer can return a SciPy sparse matrix when the one-hot block
# makes the stacked result sparse enough; densify so it can back a DataFrame.
if hasattr(processed_data, 'toarray'):
    processed_data = processed_data.toarray()

# Names of the one-hot encoded columns (kept for inspection / later cells).
onehot_feature_names = (
    preprocessor.named_transformers_['cat']
    .named_steps['onehot']
    .get_feature_names_out(categorical_features)
)

# Take the column names from the fitted transformer itself so they always
# match the width and order of `processed_data`, whatever columns the
# transformer passes through. A hand-built list goes out of sync as soon as
# the transformer emits a column the list does not mention, and
# pd.DataFrame then raises ValueError.
# Needs a scikit-learn release in which every pipeline step implements
# get_feature_names_out.
# By default the names carry a '<transformer>__' prefix (e.g. 'num__age');
# strip it to keep the plain column names.
transformer_prefixes = tuple(f'{name}__' for name, _, _ in preprocessor.transformers_)
processed_columns = [
    name.split('__', 1)[1] if name.startswith(transformer_prefixes) else name
    for name in preprocessor.get_feature_names_out()
]

# Build the DataFrame on the original row index so rows stay aligned with `df`.
# Passthrough columns of mixed dtype can leave the stacked array object-typed;
# infer_objects() restores proper numeric/bool dtypes where possible.
processed_df = pd.DataFrame(
    processed_data,
    columns=processed_columns,
    index=df.index,
).infer_objects()

In [None]:
# Preview the first rows of the processed features rather than rendering the
# whole frame (same convention as the `df.head(5)` preview after loading).
processed_df.head()