In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

In [None]:
# Load the training and test sets; the paths are relative to the notebook's working directory.
data = pd.read_csv('data/train.csv')
data_test = pd.read_csv('data/test.csv')

In [None]:
# Summary statistics for the training data.
data.describe()

In [None]:
def plot_corr_heatmap(_data, cmap="coolwarm"):
    """Draw a heatmap of the pairwise correlations between numeric columns.

    Parameters
    ----------
    _data : pandas.DataFrame
        Frame to analyse; non-numeric columns are left out of the correlation.
    cmap : str, default "coolwarm"
        Colormap forwarded to ``seaborn.heatmap``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    correlations = _data.select_dtypes(include=["number"]).corr()
    # center=0 puts the colormap's midpoint at zero correlation.
    sns.heatmap(correlations, center=0, cmap=cmap, linewidth=.5)
    plt.show()

In [None]:
# Create an 'is_female' column.
# Missing Age: is there a relationship with Sex, Parch, SibSp, and Pclass/Fare that could be used to estimate it?
# Remove PassengerId, Name, and Ticket, since they do not seem to be good sources of information.
# Take a look at the Cabin info.

In [None]:
# Correlation heatmap of the numeric columns in the raw training data.
plot_corr_heatmap(data)

# Pclass and ticket fare have a noticeable correlation; there are also minor correlations among siblings/spouse (SibSp), parents/children (Parch) and Age.

In [None]:
# Column dtypes and non-null counts for the training data.
data.info()

# Age has some null values; we'll estimate them as an average over passengers with similar class, siblings/spouse and parent/children values.
# The raw Cabin value is not used directly; it is dropped after deck-letter and cabin-count features are derived from it (see cabin_info_imputer).
# Embarked is one-hot encoded, with a separate indicator column for missing values (see embarked_encoder).

In [None]:
# Impute missing ages with a KNN imputer
def age_imputer(data):
    """Fill missing ``Age`` values in place using k-nearest-neighbours imputation.

    A ``KNNImputer`` with 7 neighbours is fitted on the ``Age``, ``Pclass``,
    ``SibSp`` and ``Parch`` columns of ``data``, and the imputed values are
    written back into those same four columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the four columns above. It is modified in place.

    Returns
    -------
    None
    """
    knn_imputer = KNNImputer(n_neighbors=7)
    columns_to_impute = ["Age", "Pclass", "SibSp", "Parch"]

    # The imputer output carries no index, so the frame's own index is attached
    # explicitly. The assignment below aligns on index labels; with a default
    # RangeIndex, a frame that was filtered or re-indexed would receive NaN.
    data_imputed = pd.DataFrame(
        knn_imputer.fit_transform(data[columns_to_impute]),
        columns=columns_to_impute,
        index=data.index,
    )

    data[columns_to_impute] = data_imputed

In [None]:
def is_female_imputer(data):
    """Add an integer ``IsFemale`` column derived from ``Sex``, in place.

    ``'male'`` becomes 0, ``'female'`` becomes 1, and every other value
    (including a missing one) becomes -1.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Sex`` column. It is modified in place.

    Returns
    -------
    None
    """
    sex_codes = {'male': 0, 'female': 1}
    data['IsFemale'] = data['Sex'].apply(lambda sex: sex_codes.get(sex, -1))

In [None]:
# Hand-rolled one-hot encoding of Embarked (a one-hot encoder would do the same job).
def embarked_encoder(data):
    """Add 0/1 indicator columns for the port of embarkation, in place.

    Columns ``C``, ``S`` and ``Q`` flag the matching ``Embarked`` value;
    column ``N`` flags every other value, including a missing one.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``Embarked`` column. It is modified in place.

    Returns
    -------
    None
    """
    known_ports = ['C', 'S', 'Q']
    embarked = data['Embarked']

    for port in known_ports:
        # port is bound as a default argument so each lambda keeps its own value.
        data[port] = embarked.apply(lambda value, port=port: 1 if value == port else 0)

    data['N'] = embarked.apply(lambda value: 0 if value in known_ports else 1)

In [None]:
def feature_dropper(data):
    """Drop the listed raw columns from ``data`` in place.

    All seven columns must be present (``cabin_type`` is created by
    ``cabin_info_imputer``); otherwise pandas raises ``KeyError``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to trim. It is modified in place.

    Returns
    -------
    None
    """
    columns_to_remove = [
        "PassengerId",
        "Name",
        "Sex",
        "Ticket",
        "Cabin",
        "Embarked",
        "cabin_type",
    ]
    data.drop(columns=columns_to_remove, inplace=True)

In [None]:
# Hand-rolled one-hot encoding of the cabin deck letter (a one-hot encoder would do the same job).
def cabin_info_imputer(data):
    """Derive deck-letter and cabin-count features from ``Cabin``, in place.

    Adds the following columns to ``data``:

    * ``cabin_type`` - first character of ``Cabin``, or ``"U"`` when missing.
    * ``CTA`` ... ``CTG`` and ``CTT`` - 0/1 indicators for each known deck letter.
    * ``CTN`` - 1 when ``cabin_type`` is none of the known letters.
    * ``cabin_number`` - number of space-separated entries in ``Cabin``
      (0 when ``Cabin`` is missing).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Cabin`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for convenient chaining.
    """
    known_decks = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']

    data["cabin_type"] = data["Cabin"].str[0].fillna("U")
    deck_letters = data["cabin_type"]

    for deck in known_decks:
        # deck is bound as a default argument so each lambda keeps its own letter.
        data['CT' + deck] = deck_letters.apply(lambda code, deck=deck: 1 if code[0] == deck else 0)

    data['CTN'] = deck_letters.apply(lambda code: 0 if code in known_decks else 1)

    # A missing Cabin stays NaN after the split (not a list), so it counts as 0 cabins.
    cabin_lists = data["Cabin"].str.split(" ")
    data["cabin_number"] = cabin_lists.apply(lambda parts: len(parts) if isinstance(parts, list) else 0)

    return data

In [None]:
# Is the cabin deck letter related to survival? Count passengers per (Survived, cabin_type) pair.
data_cabin = cabin_info_imputer(data.copy())
data_cabin.pivot_table(index="Survived", columns="cabin_type", values='Pclass', aggfunc='count')

In [None]:
# Is the number of cabins on a ticket related to survival? Count passengers per (Survived, cabin_number) pair.
data_cabin.pivot_table(index="Survived", columns="cabin_number", values='Pclass', aggfunc='count')

In [None]:
def feature_transform(data):
    """Replace ``Fare`` with ``log(1 + Fare)``, in place.

    ``np.log1p`` is used instead of ``np.log(Fare + 1)``: it computes the same
    quantity without losing precision when ``Fare`` is very small.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric ``Fare`` column. It is modified in place.

    Returns
    -------
    None
    """
    data['Fare'] = np.log1p(data['Fare'])

In [None]:
def imputer(data):
    """Run the full feature-engineering pipeline on a copy of ``data``.

    The steps are applied in this order, each mutating the working copy:
    age imputation, ``IsFemale`` encoding, cabin features, ``Embarked``
    encoding, dropping of raw columns, and the ``Fare`` log transform.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw passenger frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.
    """
    data_imputer = data.copy()

    # Order matters: feature_dropper removes columns that the earlier steps read.
    steps = (
        age_imputer,
        is_female_imputer,
        cabin_info_imputer,
        embarked_encoder,
        feature_dropper,
        feature_transform,
    )
    for step in steps:
        step(data_imputer)

    return data_imputer

In [None]:
# Untransformed copy of the training data, used by the exploratory cells below.
data_final = data.copy()

In [None]:
# Column dtypes and non-null counts.
data_final.info()

In [None]:
# Correlation heatmap of the numeric columns.
plot_corr_heatmap(data_final)

In [None]:
# Histogram of every numeric column, laid out on a two-column grid.
data_numeric_columns = data_final.select_dtypes(include=['number']).columns

plot_cols = 2
plot_rows = (len(data_numeric_columns) + plot_cols - 1) // plot_cols  # ceiling division
fig, axes = plt.subplots(plot_rows, plot_cols, figsize=(15, 5 * plot_rows))

# Flattening in row-major order gives one axes per column, whether the grid
# came back as a 1-D (single row) or 2-D array.
flat_axes = axes.ravel()

for ax, column in zip(flat_axes, data_numeric_columns):
    data_final[column].hist(ax=ax, bins=20, color="skyblue", edgecolor="white")
    ax.set_title(column)

# Remove the unused axes left over when the column count does not fill the grid.
for unused_ax in flat_axes[len(data_numeric_columns):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Age looks roughly normally distributed on its own.
# Fare may benefit from a log transform (see the feature_transform function).

In [None]:
# Apply the feature-engineering pipeline to the training data.
# Note: this rebinds data_final, which until now held the raw copy.
data_final = imputer(data)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
def preprocess(data, scaler=None, fit=True):
    """Split off the target and standardise the numeric feature columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Engineered feature frame. If it has a numeric ``Survived`` column,
        that column is removed from the features and returned as the target.
        The frame itself is not modified.
    scaler : object, optional
        Scaler exposing ``fit_transform`` and ``transform``. Defaults to a new
        ``StandardScaler``.
    fit : bool, default True
        When True, the scaler is fitted on ``data`` before transforming (the
        original behaviour). Pass ``fit=False`` together with a scaler already
        fitted on the training frame so that held-out data is put on the
        training scale instead of being standardised with its own statistics.

    Returns
    -------
    X_data : pandas.DataFrame
        Features with the numeric columns scaled.
    y_data : numpy.ndarray or None
        ``Survived`` values, or None when ``data`` has no numeric ``Survived``.

    Raises
    ------
    ValueError
        If ``fit=False`` is requested without supplying a scaler.
    """
    if scaler is None:
        if not fit:
            raise ValueError("fit=False requires an already-fitted scaler")
        scaler = StandardScaler()

    X_data = data.copy()
    y_data = None

    if "Survived" in data.select_dtypes(include=['number']).columns:
        y_data = data["Survived"].to_numpy()
        X_data = data.drop(["Survived"], axis=1)

    X_data_numeric_columns = X_data.select_dtypes(include=['number']).columns
    numeric_features = X_data[X_data_numeric_columns]
    if fit:
        scaled = scaler.fit_transform(numeric_features)
    else:
        scaled = scaler.transform(numeric_features)
    X_data[X_data_numeric_columns] = scaled

    return X_data, y_data

In [None]:
# Standardise the training features and split off the Survived target.
X_data, y_data = preprocess(data_final)

In [None]:
# Inspect the scaled feature matrix.
X_data

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# A fixed seed makes the forest, and therefore the search result and the final
# predictions, reproducible across kernel restarts.
rfc = RandomForestClassifier(random_state=42)

# Hyper-parameter grid: 5 x 4 x 4 = 80 combinations, each evaluated with 4-fold CV.
param_grid = [{
    "n_estimators": [10, 100, 200, 500, 1000],
    "max_depth": [None, 5, 10, 15],
    "min_samples_split": [2, 3, 4, 5]
}]

grid_search = GridSearchCV(rfc, param_grid, cv=4, scoring="accuracy", return_train_score=True)
grid_search.fit(X_data, y_data)

In [None]:
# Best estimator found by the grid search (refit is left at its default, so it is retrained on the full training set).
final_rfc = grid_search.best_estimator_
final_rfc

In [None]:
# Apply the same feature-engineering pipeline to the test data.
data_test_final = imputer(data_test)

In [None]:
# Check for values that are still missing after the pipeline.
data_test_final.info()

In [None]:
# Forward-fill whatever is still missing after the pipeline (for example Fare,
# which imputer() does not fill). DataFrame.ffill() replaces the deprecated
# fillna(method="ffill") spelling and gives the same result.
data_test_final = data_test_final.ffill()

In [None]:
# The target slot is discarded; preprocess() returns None there when the frame has no numeric Survived column.
# NOTE(review): preprocess() fits a new StandardScaler on this frame, so the test features are
# standardised with their own statistics rather than the training ones.
X_data_test, _ = preprocess(data_test_final)

In [None]:
# Predict survival for the test passengers with the tuned forest.
predictions = final_rfc.predict(X_data_test)

In [None]:
# Build the submission frame (PassengerId from the raw test data plus the predicted label) and write it to CSV.
final_df = pd.DataFrame({
    "PassengerId": data_test["PassengerId"],
    "Survived": predictions,
})
final_df.to_csv("data/predictions_w_cabin.csv", index=False)

In [None]:
# Inspect the submission frame.
final_df