In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler,StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
import seaborn as sns

from joblib import dump, load

In [50]:
df = pd.read_csv("../dataset/preprocessed_titanic.csv")
df.head(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0.0,3.0,male,22.0,1.0,0.0,7.25,S,Third,man,True,C,Southampton,no,False
1,1.0,1.0,female,38.0,1.0,0.0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1.0,3.0,female,26.0,0.0,0.0,7.925,S,Third,woman,False,C,Southampton,yes,True
3,1.0,1.0,female,35.0,1.0,0.0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0.0,3.0,male,35.0,0.0,0.0,8.05,S,Third,man,True,C,Southampton,no,True


In [51]:
# Family size feature
df['family_size'] = df['sibsp'] + df['parch'] + 1

# Is alone feature
df['is_alone'] = (df['family_size'] == 1).astype(int)

# Binning age
df['age_group'] = pd.cut(df['age'], bins=[0, 12, 18, 60, 100], labels=['Child', 'Teen', 'Adult', 'Senior'])

# Fare per person
df['fare_per_person'] = df['fare'] / df['family_size']

df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,family_size,is_alone,age_group,fare_per_person
0,0.0,3.0,male,22.0,1.0,0.0,7.25,S,Third,man,True,C,Southampton,no,False,2.0,0,Adult,3.625
1,1.0,1.0,female,38.0,1.0,0.0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,2.0,0,Adult,35.64165
2,1.0,3.0,female,26.0,0.0,0.0,7.925,S,Third,woman,False,C,Southampton,yes,True,1.0,1,Adult,7.925


In [52]:
# #? Define column groups based on their types.
numerical_cols = ['age', 'fare', 'family_size', 'sibsp', 'parch', 'fare_per_person']
categorical_cols = ['embarked', 'pclass', 'class', 'deck', 'age_group']
binary_cols = ['sex','is_alone']

In [53]:
def numerical_pipeline():
    """Create a pipeline for numerical features with imputation and scaling."""
    return Pipeline(
        [
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", MinMaxScaler(feature_range=(0, 1))),
        ]
    )


def categorical_pipeline():
    """Create a pipeline for categorical features with imputation and encoding."""
    return Pipeline(
        [
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )


def build_preprocessor(numerical_cols, categorical_cols, binary_cols):
    """Combine all transformations into a ColumnTransformer."""
    return ColumnTransformer(
        transformers=[
            ("num", numerical_pipeline(), numerical_cols),
            ("cat", categorical_pipeline(), categorical_cols),
            ("bin", "passthrough", binary_cols),
        ],
        remainder="drop",  # Drop any columns not specified
    )

In [54]:
def preprocess_data(data,numerical_cols, categorical_cols, binary_cols):
    """Execute preprocessing on the dataset and return a transformed DataFrame."""
    preprocessor = build_preprocessor(numerical_cols, categorical_cols, binary_cols)
    transformed_data = preprocessor.fit_transform(data)

    # Get column names for transformed data
    num_cols_transformed = numerical_cols
    cat_cols_transformed = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols)
    binary_cols_transformed = binary_cols
    
    # Create a DataFrame with transformed data
    transformed_df = pd.DataFrame(transformed_data, columns=np.concatenate([num_cols_transformed, cat_cols_transformed, binary_cols_transformed]))
    return transformed_df

In [55]:
transformed_df = preprocess_data(df,numerical_cols,categorical_cols,binary_cols)
sex = {'male': 1,'female': 0}
transformed_df['sex'] = [sex[item] for item in transformed_df['sex']]

transformed_df.head()

Unnamed: 0,age,fare,family_size,sibsp,parch,fare_per_person,embarked_C,embarked_Q,embarked_S,pclass_1.0,...,deck_C,deck_D,deck_E,deck_F,deck_G,age_group_Adult,age_group_Child,age_group_Teen,sex,is_alone
0,0.375,0.014151,0.1,0.125,0.0,0.007076,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0
1,0.682692,0.139136,0.1,0.125,0.0,0.069568,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0
2,0.451923,0.015469,0.0,0.0,0.0,0.015469,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1
3,0.625,0.103644,0.1,0.125,0.0,0.051822,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0
4,0.625,0.015713,0.0,0.0,0.0,0.015713,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1


In [56]:
def verify_transformations(original_data, transformed_data):
    """
    Print verification of the transformations applied to the data.
    """
    print("Verification of Transformations:")
    print("-" * 50)
    
    # Verify numerical scaling
    print("Numerical Features (should be scaled, mean ≈ 0, std ≈ 1):")
    numerical_features = transformed_data[numerical_cols]
    print(numerical_features.describe())
    print("\n")
    
    # Verify one-hot encoding
    categorical_features = [col for col in transformed_data.columns 
                          if any(cat_col in col for cat_col in categorical_cols)]
    print("One-hot Encoded Features (should be binary 0/1):")
    print(transformed_data[categorical_features].head())
    print("\n")
    
    # Verify binary encoding
    binary_features = binary_cols
    print("Binary Features (should be 0/1):")
    print(transformed_data[binary_features].head())
    
X = df.drop(['survived'],axis=1)
verify_transformations(X, transformed_df)

Verification of Transformations:
--------------------------------------------------
Numerical Features (should be scaled, mean ≈ 0, std ≈ 1):
              age        fare  family_size  sibsp  parch  fare_per_person
count   891.00000  891.000000        891.0  891.0  891.0       891.000000
unique   66.00000  248.000000          9.0    7.0    7.0       289.000000
top       0.52306    0.015713          0.0    0.0    0.0         0.025374
freq    177.00000   43.000000        537.0  608.0  678.0        59.000000


One-hot Encoded Features (should be binary 0/1):
  embarked_C embarked_Q embarked_S pclass_1.0 pclass_2.0 pclass_3.0  \
0        0.0        0.0        1.0        0.0        0.0        1.0   
1        1.0        0.0        0.0        1.0        0.0        0.0   
2        0.0        0.0        1.0        0.0        0.0        1.0   
3        0.0        0.0        1.0        1.0        0.0        0.0   
4        0.0        0.0        1.0        0.0        0.0        1.0   

  class_Fi

In [57]:
# Save the pipeline and feature names
dump(transformed_df, 'titanic_preprocessor.joblib')
dump(transformed_df.columns, 'feature_names.joblib')
print("\nPipeline and feature names saved as 'titanic_preprocessor.joblib' and 'feature_names.joblib'")



Pipeline and feature names saved as 'titanic_preprocessor.joblib' and 'feature_names.joblib'


In [58]:
transformed_df['survived'] = df['survived']
transformed_df.to_csv('../dataset/full_preprocessed_titanic.csv',index=False)

In [59]:
transformed_df.isnull().sum()

age                0
fare               0
family_size        0
sibsp              0
parch              0
fare_per_person    0
embarked_C         0
embarked_Q         0
embarked_S         0
pclass_1.0         0
pclass_2.0         0
pclass_3.0         0
class_First        0
class_Second       0
class_Third        0
deck_A             0
deck_B             0
deck_C             0
deck_D             0
deck_E             0
deck_F             0
deck_G             0
age_group_Adult    0
age_group_Child    0
age_group_Teen     0
sex                0
is_alone           0
survived           0
dtype: int64