# Titanic competition - GaussianNB

In this approach I will try to reach a great score using a simple Mixed GaussianNB algorithm.

`var_smoothing` parameter is adjusted using Optuna

### Imports & settings

In [1]:
import threading

import numpy as np
import optuna
import pandas as pd
import plotly.express as px
from optuna import create_study
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, LeaveOneOut, train_test_split
from sklearn.preprocessing import LabelEncoder

# Hyperparameter search bounds, written as (lower, upper) pairs.
MIN_EST, MAX_EST = 1, 500       # n_estimators
MIN_MD, MAX_MD = 2, 600         # max_depth
MIN_SS, MAX_SS = 2, 100         # min_samples_split
MIN_SL, MAX_SL = 1, 150         # min_samples_leaf
MIN_WFL, MAX_WFL = 0, 0.4       # min_weight_fraction_leaf
MIN_LN, MAX_LN = 2, 8000        # max_leaf_nodes
MIN_CCP, MAX_CCP = 0, 0.5       # ccp_alpha

# Optimisation budget.
N_TRIALS = np.inf       # trial count handed to the outer study (unbounded)
N_SMALL_TRIALS = 30     # trial count handed to each inner study
TIMEOUT = 60 * 60       # timeout handed to the outer study

# Columns removed from the raw frames before preprocessing.
DROP_LIST = ["PassengerId", "Name"]


### Get train dataset

In [2]:
# Read the training CSV from the local data directory.
titanic = pd.read_csv("./data/train.csv")
# Bare last expression so the notebook renders the frame as the cell output.
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Dataset INFO

In [3]:
# Column dtypes and non-null counts; used to spot columns with missing values.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### Drop columns
Which are difficult to process

In [4]:
# Remove every column named in DROP_LIST, then display what is left.
titanic = titanic.drop(columns=DROP_LIST)
titanic

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.2500,,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,1,1,female,35.0,1,0,113803,53.1000,C123,S
4,0,3,male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,211536,13.0000,,S
887,1,1,female,19.0,0,0,112053,30.0000,B42,S
888,0,3,female,,1,2,W./C. 6607,23.4500,,S
889,1,1,male,26.0,0,0,111369,30.0000,C148,C


### Prepare data for algorithm
* In `Cabin` leave only the first letter (which indicates the sector in which the cabin is located)
* Adjust labels for `Sex`, `Embarked` and `Cabin` using LabelEncoder
* Fill NaN values in `Age` with mean of this column

In [5]:
def preprocessing(data):
    """Encode and impute a Titanic passenger frame in place.

    Steps, in order:

    1. ``Cabin`` is reduced to its first character, stored as a 0-based
       offset from ``'A'``; missing cabins stay NaN until step 5.
    2. ``Ticket`` is stripped of every non-digit character and its first
       remaining digit becomes the new ``Ticket_class`` column (NaN when no
       digit is left, then filled with the column mean). ``Ticket`` itself
       is dropped.
    3. ``Sex`` and ``Embarked`` are label-encoded with ``LabelEncoder``.
    4. ``Age`` NaNs are replaced by the column mean.
    5. ``Cabin`` NaNs are replaced by the column mean, and any NaN left in
       another column is replaced by that column's mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least ``Cabin``, ``Ticket``, ``Sex``, ``Embarked`` and
        ``Age`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so callers may either use the
        return value or rely on the in-place mutation.
    """
    import re

    def cabin_deck(cabin):
        # Explicit missing-value test instead of checking whether str(value)
        # happens to start with "n" (as "nan" does).
        if pd.isna(cabin):
            return np.nan
        return ord(str(cabin)[0]) - ord("A")

    def ticket_first_digit(ticket):
        digits = re.sub("[^0-9]", "", str(ticket))
        # Only an empty string has no first digit; a one-digit ticket is valid.
        return int(digits[0]) if digits else np.nan

    data["Cabin"] = data["Cabin"].map(cabin_deck)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: that chained form does not update `data` when pandas
    # Copy-on-Write is active and emits a warning on recent pandas versions.
    data["Ticket_class"] = data["Ticket"].map(ticket_first_digit)
    data["Ticket_class"] = data["Ticket_class"].fillna(data["Ticket_class"].mean())
    data.drop(["Ticket"], axis=1, inplace=True)

    # NOTE(review): Embarked is encoded before any imputation, so missing
    # ports are handed to LabelEncoder as-is — confirm this is intended.
    le = LabelEncoder()
    data["Sex"] = le.fit_transform(data["Sex"])
    data["Embarked"] = le.fit_transform(data["Embarked"])

    data["Age"] = data["Age"].fillna(data["Age"].mean())
    data["Cabin"] = data["Cabin"].fillna(data["Cabin"].mean())
    # Catch-all for NaNs remaining in any other numeric column.
    data.fillna(data.mean(numeric_only=True), inplace=True)

    return data

In [6]:
# preprocessing() modifies `titanic` in place, so its return value is not reassigned here.
preprocessing(titanic)
titanic

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Ticket_class
0,0,3,1,22.000000,1,0,7.2500,2.45098,2,5.0
1,1,1,0,38.000000,1,0,71.2833,2.00000,0,1.0
2,1,3,0,26.000000,0,0,7.9250,2.45098,2,2.0
3,1,1,0,35.000000,1,0,53.1000,2.00000,2,1.0
4,0,3,1,35.000000,0,0,8.0500,2.45098,2,3.0
...,...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,13.0000,2.45098,2,2.0
887,1,1,0,19.000000,0,0,30.0000,1.00000,2,1.0
888,0,3,0,29.699118,1,2,23.4500,2.45098,2,6.0
889,1,1,1,26.000000,0,0,30.0000,2.00000,0,1.0


### Split columns
to create datasets to predict and validate predictions.

`titanic_x` has to be cast to a numpy array because of the "categorical-feature" parameter of MixedNB.

In [7]:
# Target is the Survived column; features are every remaining column.
titanic_y = titanic["Survived"]
titanic_x = titanic.drop(columns=["Survived"])
titanic_x

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Ticket_class
0,3,1,22.000000,1,0,7.2500,2.45098,2,5.0
1,1,0,38.000000,1,0,71.2833,2.00000,0,1.0
2,3,0,26.000000,0,0,7.9250,2.45098,2,2.0
3,1,0,35.000000,1,0,53.1000,2.00000,2,1.0
4,3,1,35.000000,0,0,8.0500,2.45098,2,3.0
...,...,...,...,...,...,...,...,...,...
886,2,1,27.000000,0,0,13.0000,2.45098,2,2.0
887,1,0,19.000000,0,0,30.0000,1.00000,2,1.0
888,3,0,29.699118,1,2,23.4500,2.45098,2,6.0
889,1,1,26.000000,0,0,30.0000,2.00000,0,1.0


Categorical columns are:

In [8]:
# model = DecisionTreeClassifier(random_state=0)
# acc = []
#
# kf = KFold(n_splits=500, shuffle=True, random_state=1)
# for train_index, test_index in kf.split(titanic_x):
#     X_train, X_test = titanic_x.iloc[train_index, :], titanic_x.iloc[test_index, :]
#     y_train, y_test = titanic_y.iloc[train_index], titanic_y.iloc[test_index]
#
#     model.fit(X_train, y_train)
#     acc.append(model.score(X_test, y_test))
#
# print(np.mean(acc))

In [None]:
def objective(trial, n_est, max_depth):
    """Inner Optuna objective: cross-validated score of one forest configuration.

    The forest size and depth are fixed by the caller; the remaining
    hyperparameters are sampled from ``trial`` within the bounds defined in
    the settings cell.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    n_est : int
        Value passed as ``n_estimators``.
    max_depth : int
        Value passed as ``max_depth``.

    Returns
    -------
    float
        Mean of the per-fold ``score`` values over a shuffled 10-fold split
        of ``titanic_x`` / ``titanic_y``.
    """
    folds = KFold(n_splits=10, shuffle=True, random_state=8)

    # Dict entries are evaluated top to bottom, so the suggest_* calls run in
    # a fixed order.
    tuned = {
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", MIN_SL, MAX_SL),
        "min_samples_split": trial.suggest_int("min_samples_split", MIN_SS, MAX_SS),
        "min_weight_fraction_leaf": trial.suggest_float("min_weight_fraction_leaf", MIN_WFL, MAX_WFL),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", MIN_LN, MAX_LN),
        "ccp_alpha": trial.suggest_float("ccp_alpha", MIN_CCP, MAX_CCP),
    }
    forest = RandomForestClassifier(n_estimators=n_est,
                                    max_depth=max_depth,
                                    random_state=10,
                                    n_jobs=8,
                                    **tuned)

    fold_scores = []
    for fit_rows, eval_rows in folds.split(titanic_x):
        # The same estimator object is refitted on every fold.
        forest.fit(titanic_x.iloc[fit_rows, :], titanic_y.iloc[fit_rows])
        fold_scores.append(forest.score(titanic_x.iloc[eval_rows, :], titanic_y.iloc[eval_rows]))
    return np.mean(fold_scores)

# Best inner-study score seen so far and the inner trial that produced it.
best_val = 0
best_t = 0
# Serialises updates of best_val / best_t: the outer study is optimised with
# several worker threads, and an unguarded compare-then-write could let a
# worse trial overwrite a better one.
_best_lock = threading.Lock()


def objective_n_est(trial):
    """Outer Optuna objective: tune the remaining hyperparameters for one forest size/depth.

    Samples ``n_estimators`` and ``max_depth`` from ``trial``, runs a nested
    study of ``N_SMALL_TRIALS`` trials over ``objective`` with those two
    values fixed, and records the best inner trial in the module-level
    ``best_val`` / ``best_t`` when it beats the current record.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Outer trial used to sample ``n_estimators`` and ``max_depth``.

    Returns
    -------
    float
        Best objective value found by the nested study.
    """
    global best_val, best_t

    n_est = trial.suggest_int("n_estimators", MIN_EST, MAX_EST)
    max_depth = trial.suggest_int("max_depth", MIN_MD, MAX_MD)

    study_small = create_study(direction="maximize")
    study_small.optimize(lambda inner_trial: objective(inner_trial, n_est, max_depth),
                         n_trials=N_SMALL_TRIALS)

    inner_best = study_small.best_trial
    inner_value = inner_best.values[0]

    # Compare and update as a single step under the lock.
    with _best_lock:
        if inner_value > best_val:
            best_val = inner_value
            best_t = inner_best

    print(f"GLOBAL #{trial.number}; VAL: {inner_value}")

    return inner_value


In [None]:
# Outer study: every trial runs objective_n_est, which itself runs a nested study.
# The trial budget and timeout come from N_TRIALS and TIMEOUT in the settings cell;
# n_jobs=-1 asks Optuna to run as many parallel workers as there are CPUs.
study = optuna.create_study(direction="maximize")
study.optimize(objective_n_est, n_trials=N_TRIALS, timeout=TIMEOUT, n_jobs=-1)
print(f"BEST TRIAL: {study.best_trial}")

# optuna.visualization.plot_slice(study).show()

In [None]:
# Slice plot of the outer study's parameters against the objective value.
optuna.visualization.plot_slice(study).show()

In [None]:
# Best outer trial (carries n_estimators / max_depth) ...
print(study.best_trial)
# ... and the best inner trial recorded by objective_n_est (carries the other hyperparameters).
print(best_t)

## Generate test output

In [None]:
# Read the test CSV and keep PassengerId aside before it is dropped below.
titanic_test = pd.read_csv("./data/test.csv")
ids = titanic_test["PassengerId"]
print(f"SHAPE: {titanic_test.shape}")

# Apply the same column removal and preprocessing used for the training frame.
titanic_test = titanic_test.drop(columns=DROP_LIST)
preprocessing(titanic_test)
titanic_test

In [None]:
# Final model: forest size and depth come from the best outer trial; every
# parameter sampled by the inner objective (criterion, min_samples_leaf,
# min_samples_split, min_weight_fraction_leaf, max_leaf_nodes, ccp_alpha)
# comes from the best inner trial and is forwarded by keyword.
model = RandomForestClassifier(n_estimators=study.best_trial.params["n_estimators"],
                               max_depth=study.best_trial.params["max_depth"],
                               random_state=10,
                               **best_t.params)
model.fit(titanic_x, titanic_y)

# One Survived prediction per test row, with the saved PassengerId appended as a second column.
survived = pd.DataFrame({"Survived": model.predict(titanic_test)}).assign(PassengerId=ids)
survived.to_csv("./results.csv", index=False)

In [19]:
# NOTE(review): this cell repeats the fit / predict / export steps of the
# previous cell and writes ./results.csv again; consider keeping only one.
# n_estimators and max_depth are read from the best outer trial, the other
# hyperparameters from best_t (the best inner trial recorded during the search).
model = RandomForestClassifier(n_estimators=study.best_trial.params["n_estimators"],
                               criterion=best_t.params["criterion"],
                               max_depth=study.best_trial.params["max_depth"],
                               min_samples_leaf=best_t.params["min_samples_leaf"],
                               min_samples_split=best_t.params["min_samples_split"],
                               min_weight_fraction_leaf=best_t.params["min_weight_fraction_leaf"],
                               max_leaf_nodes=best_t.params["max_leaf_nodes"],
                               ccp_alpha=best_t.params["ccp_alpha"],
                               random_state=10)
# Train on the full training set, then predict the preprocessed test set.
model.fit(titanic_x, titanic_y)
survived = model.predict(titanic_test)
survived = pd.DataFrame(survived, columns=["Survived"])
# Attach the PassengerId column saved before it was dropped from the test frame.
survived["PassengerId"] = ids
survived.to_csv("./results.csv", index=False)