In [45]:
import numpy as np
import pandas as pd
import matplotlib
import os
from plotly.matplotlylib.mplexporter.utils import PATH_DICT

from sklearn.preprocessing import OneHotEncoder


from sklearn.ensemble import RandomForestClassifier
import sklearn
import pickle


In [22]:
def process_df(df: pd.DataFrame) -> pd.DataFrame:
    """Turn raw passenger rows into a numeric feature table indexed by PassengerId.

    The input frame is never modified; all work happens on a copy, so the
    function can be called repeatedly on the same frame.

    Steps:
      * drop ``Name`` and ``Ticket`` and, when present, the ``Unnamed: 0``
        column (a row index written out by an earlier ``to_csv``)
      * split ``Cabin`` into ``Deck`` (first run of letters) and
        ``Deck_Number`` (first run of digits, converted to a number)
      * map ``Sex`` to 0 for "male" and 1 for "female" (any other value
        becomes NaN)
      * one-hot encode ``Embarked`` and ``Deck``, with missing values treated
        as their own "N/A" category
      * move ``PassengerId`` into the index

    Args:
        df: Raw frame containing at least the columns ``PassengerId``,
            ``Name``, ``Ticket``, ``Cabin``, ``Sex`` and ``Embarked``.

    Returns:
        A new frame indexed by ``PassengerId`` holding the remaining columns
        followed by the one-hot columns.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Copy first so the caller's frame keeps its original columns.
    df = df.copy()

    # Drop useless features. The saved row index is optional, hence "ignore".
    df = df.drop(columns=["Name", "Ticket"])
    df = df.drop(columns=["Unnamed: 0"], errors="ignore")

    # Deal with cabin
    df["Deck"] = df["Cabin"].str.extract(r'([A-Za-z]+)', expand=False)
    # The regex yields strings; store the cabin number as a real numeric column.
    df["Deck_Number"] = pd.to_numeric(df["Cabin"].str.extract(r'(\d+)', expand=False))
    df = df.drop(columns="Cabin")

    # Deal with sex
    sex_map = {"male": 0, "female": 1}
    df["Sex"] = df["Sex"].map(sex_map)

    # Deal with N/A
    df["Deck"] = df["Deck"].fillna("N/A")
    df["Embarked"] = df["Embarked"].fillna("N/A")

    # One-hot encode deck and port of embarkation
    categorical_columns = ["Embarked", "Deck"]
    encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded = encoder.fit_transform(df[categorical_columns])
    one_hot_df = pd.DataFrame(
        one_hot_encoded,
        columns=encoder.get_feature_names_out(categorical_columns),
        # Reuse the input's index: concat(axis=1) aligns on index labels, so a
        # fresh RangeIndex would misalign any frame without a default index.
        index=df.index,
    )
    df_encoded = pd.concat([df.drop(columns=categorical_columns), one_hot_df], axis=1)

    # Set index
    return df_encoded.set_index("PassengerId")


In [24]:
# Preview the first ten rows of the raw training data.
pd.read_csv("./inputs/train.csv").iloc[:10]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [25]:
# Load both raw splits from disk.
train_val_df: pd.DataFrame = pd.read_csv("./inputs/train.csv")
test_df: pd.DataFrame = pd.read_csv("./inputs/test.csv")

# Push the two splits through process_df as a single frame (the encoder inside
# is fitted on whatever frame it receives), then cut the result apart again at
# the original number of training rows.
index = len(train_val_df)
total = process_df(pd.concat((train_val_df, test_df), ignore_index=True))
train_val_df, test_df = total.iloc[:index], total.iloc[index:]


In [26]:
def test_random_forest(x, y, x_val, y_val, **kwargs):
    """Fit a model on (x, y) and score its predictions on (x_val, y_val).

    Extra keyword arguments are forwarded unchanged to ``get_model``; the
    score is whatever ``compare`` returns for the predictions and ``y_val``.
    """
    fitted = get_model(x, y, **kwargs)
    return compare(fitted.predict(x_val), y_val)

def get_model(x,y, **kwargs):
    """Create a RandomForestClassifier from ``kwargs``, fit it on (x, y) and return it."""
    # fit() returns the estimator itself, so construction and fitting chain.
    return RandomForestClassifier(**kwargs).fit(x, y)

def compare(pred, y):
    """Return the fraction of entries in ``pred`` that equal the labels in ``y``.

    ``y`` must provide ``to_numpy()`` (e.g. a pandas Series); ``pred`` is
    compared against it element-wise.
    """
    hits = y.to_numpy() == pred
    n_correct = np.count_nonzero(hits)
    return n_correct / len(y)

In [42]:
# Finding the best hyperparameters by averaging performance based on cross validation.
# Every (n_estimators, max_depth) pair is scored by its mean accuracy over the
# K-fold splits; finished pairs are cached in a pickle so that a re-run only
# computes the pairs that are still missing.
FILE_LOC = "./saved/calculated.pkl"

n_estimators_s = range(25,200,25)
max_depth_s = range(1,15)

# Create the cache directory up front so the write at the end cannot fail with
# FileNotFoundError after the whole search has already been computed.
cache_dir = os.path.dirname(FILE_LOC)
if cache_dir:
    os.makedirs(cache_dir, exist_ok=True)

# NOTE: pickle.load can execute arbitrary code; only load a cache file that was
# written by this notebook.
if os.path.exists(FILE_LOC):
    with open(FILE_LOC, 'rb') as file:
        results = pickle.load(file)
else:
    results = {}

kf = sklearn.model_selection.KFold(random_state=42, n_splits=5, shuffle=True)

try:
    for n_estimators in n_estimators_s:
        for max_depth in max_depth_s:
            cached = results.get((n_estimators, max_depth))
            if cached is not None:
                print(f"Skipping {n_estimators} {max_depth} {cached}")
                continue

            # One accuracy value per cross-validation fold.
            fold_scores = []
            for train_index, val_index in kf.split(train_val_df):
                train_df = train_val_df.iloc[train_index]
                val_df = train_val_df.iloc[val_index]

                train_y = train_df["Survived"]
                train_x = train_df.drop("Survived", axis=1)

                val_y = val_df["Survived"]
                val_x = val_df.drop("Survived", axis=1)

                fold_scores.append(test_random_forest(train_x, train_y, val_x, val_y, n_estimators=n_estimators, random_state=1, max_depth=max_depth))

            mean_score = np.mean(fold_scores)
            print(n_estimators, max_depth, mean_score, sep="\t")
            results[(n_estimators, max_depth)] = mean_score
finally:
    # Persist whatever has been computed so far, even when the search is
    # interrupted or a fit raises, so the next run can resume from the cache.
    with open(FILE_LOC, 'wb') as file:
        pickle.dump(results, file)

25	1	0.7082041303119703
25	2	0.7800326407632918
25	3	0.7878915322327538
25	4	0.8103446111355218
25	5	0.8159374803841567
25	6	0.8204318624066286
25	7	0.8193208210407382
25	8	0.8193082669010107
25	9	0.8148076078086749
25	10	0.8103383340656581
25	11	0.8114556525014125
25	12	0.8204193082669009
25	13	0.8092210156299041
25	14	0.8069738246186681
50	1	0.7250455087565124
50	2	0.7642897495449124
50	3	0.7890276818780994
50	4	0.81146820664114
50	5	0.8226727763480008
50	6	0.824913690289373
50	7	0.8215554579122466
50	8	0.8193082669010107
50	9	0.8215491808423827
50	10	0.8182097796748478
50	11	0.8305442219571904
50	12	0.8260372857949909
50	13	0.8204695248258111
50	14	0.819327098110602
75	1	0.7025861527838805
75	2	0.7564371351453142
75	3	0.784514468646036
75	4	0.8035904839620865
75	5	0.8204318624066286
75	6	0.8260372857949909
75	7	0.8204444165463561
75	8	0.8182035026049841
75	9	0.8181846713953927
75	10	0.8215805661917017
75	11	0.827185989580064
75	12	0.8271608813006089
75	13	0.8215868432615656
75	14	0.

In [43]:
import plotly.graph_objects as go

# Collect the cached scores into a grid: one row per n_estimators value and
# one column per max_depth value.
data = np.zeros((len(n_estimators_s), len(max_depth_s)))

for i, n_estimators in enumerate(n_estimators_s):
    for j, max_depth in enumerate(max_depth_s):
        data[i, j] = results[(n_estimators, max_depth)]

z_data = pd.DataFrame(data)

# Surface plot of the score grid: x runs over max_depth, y over n_estimators.
fig = go.Figure(data=[go.Surface(x=list(max_depth_s), y=list(n_estimators_s), z=z_data)])
fig.update_traces(contours_z=dict(show=True, usecolormap=True,
                                  highlightcolor="limegreen", project_z=True))
fig.update_layout(title=dict(text='Hyperparameter tuning graph'), autosize=False,
                  scene_camera_eye=dict(x=1.87, y=0.88, z=-0.64),
                  width=500, height=500,
                  margin=dict(l=65, r=50, b=65, t=90)
)
# Name the axes so the figure can be read without looking at the code.
fig.update_layout(scene=dict(xaxis_title='max_depth',
                             yaxis_title='n_estimators',
                             zaxis_title='mean CV accuracy'))

fig.show()

In [46]:
# Fit the final model on every labelled row (no hold-out split).
train_df = train_val_df
train_x, train_y = train_df.drop(columns="Survived"), train_df["Survived"]

model = get_model(train_x, train_y, n_estimators=500, random_state=1, max_depth=11)

# Predict on the test rows; their "Survived" column is not a feature, so drop it.
predictions = model.predict(test_df.drop(columns="Survived"))

# Write the submission file: one row per test passenger, keyed by the frame's index.
output = pd.DataFrame({'PassengerId': test_df.index, 'Survived': predictions.astype(int)})
output.to_csv('submission.csv', index=False)

print("Your submission was successfully saved!")

Your submission was successfully saved!
