In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import numpy as np
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [22]:
df = pd.read_csv("../data/train.csv")
df.drop('Name', axis=1, inplace=True)
df.drop('PassengerId', axis=1, inplace=True)
df.drop('Ticket', axis=1, inplace=True)
df.drop('Embarked', axis=1, inplace=True)


print(df.info())
print(df.describe(include='all').T)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     204 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB
None
          count unique      top freq       mean        std   min     25%  \
Survived  891.0    NaN      NaN  NaN   0.383838   0.486592   0.0     0.0   
Pclass    891.0    NaN      NaN  NaN   2.308642   0.836071   1.0     2.0   
Sex         891      2     male  577        NaN        NaN   NaN     NaN   
Age       714.0    NaN      NaN  NaN  29.699118  14.526497  0.42  20.125   
SibSp     891.0    NaN      NaN  NaN   0.523008   1.102743

In [23]:
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
#df['Embarked'] = df['Embarked'].map({'C': 1, 'Q': 2, 'S': 3})

In [24]:
# Define features and target
features = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Age', 'Cabin']
X = df[features]
y = df['Survived']

In [25]:

def add_cabin_info_columns(df2):
    df_copy = df2.copy()  # Create a copy to avoid modifying the original DataFrame

    def extract_cabin_info(cabin):
        if pd.isna(cabin) or cabin == '':
            return 0, 'N'  # No cabin info
        cabins = cabin.split()
        count = len(cabins)
        first_cabin = cabins[0]  # First cabin assignment
        match = re.match(r'([A-G])\d+', first_cabin)  # Only A-G allowed
        if match:
            letter = match.groups()[0]
            return count, letter
        return count, 'N'

    # Apply transformation and expand into new columns
    df_copy[['cabin_count', 'cabin_letter']] = df_copy['Cabin'].apply(
        lambda x: pd.Series(extract_cabin_info(x))
    )

    # Drop original 'Cabin' column
    df_copy.drop('Cabin', axis=1, inplace=True)

    # Encode `cabin_letter` as numerical values
    cabin_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'N': 7}
    df_copy['cabin_letter'] = df_copy['cabin_letter'].map(cabin_mapping).astype(int)

    return df_copy

In [26]:
# Create cross-validation object
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Store model results
model_results = {}

# **Model 1: Drop Rows with Missing Age & drops column "Cabin"**
X1 = X.drop(columns=['Cabin'])
X1 = X1.dropna()
y1 = y[X1.index]  # Align target variable
rf1 = RandomForestClassifier(random_state=42)
scores1 = cross_val_score(rf1, X1, y1, cv=cv, scoring='accuracy')
model_results['Ignore Age & drop Cabin'] = scores1.mean()

# **Model 2: Drop Cabin, Impute Age**
X2 = X.drop(columns=['Cabin'])
X2['Age'] = X2.groupby(['Pclass', 'Sex'])['Age'].transform('median')  # Corrected Imputation
y2 = y
rf2 = RandomForestClassifier(random_state=42)
scores2 = cross_val_score(rf2, X2, y2, cv=cv, scoring='accuracy')
model_results['Drop Cabin, Impute Age'] = scores2.mean()

# **Model 3: Drop Age missing values, "Impute" Cabin**
X3 = X.copy()
X3 = add_cabin_info_columns(X3)
X3 = X3.dropna()
y3 = y.loc[X3.index] 
rf3 = RandomForestClassifier(random_state=42)
scores3 = cross_val_score(rf3, X3, y3, cv=cv, scoring='accuracy')
model_results['Ignore Age missing, impute Cabin'] = scores3.mean()

# **Model 4: Impute Age & Cabin**
X4 = X.copy()
X4['Age'] = X4.groupby(['Pclass', 'Sex'])['Age'].transform('median')  # Corrected Imputation
X4 = add_cabin_info_columns(X4)
y4 = y
rf4 = RandomForestClassifier(random_state=42)
scores4 = cross_val_score(rf4, X4, y4, cv=cv, scoring='accuracy')
model_results['Impute Age & Cabin'] = scores4.mean()

# **Model 5: Drop Age & Cabin**
X5 = X.drop(columns=['Age', 'Cabin'])
y5 = y
rf5 = RandomForestClassifier(random_state=42)
scores5 = cross_val_score(rf5, X5, y5, cv=cv, scoring='accuracy')
model_results['Drop Age & Cabin'] = scores5.mean()

# **Model 6: Drop Age & Impute Cabin**
X6= X.drop(columns=['Age'])
X6 = add_cabin_info_columns(X6)
y6 = y
rf6 = RandomForestClassifier(random_state=42)
scores6 = cross_val_score(rf6, X6, y6, cv=cv, scoring='accuracy')
model_results['Drop Age & Impute Cabin'] = scores6.mean()

# **Print Results**
print("Cross-Validation Results:")
for model, score in model_results.items():
    print(f"{model}: {score:.4f}")


Cross-Validation Results:
Ignore Age & drop Cabin: 0.8096
Drop Cabin, Impute Age: 0.7991
Ignore Age missing, impute Cabin: 0.7983
Impute Age & Cabin: 0.8047
Drop Age & Cabin: 0.7991
Drop Age & Impute Cabin: 0.8013
