# Titanic competition - Decision Tree

In this approach I will try to reach a great score using a simple decision tree classifier (`DecisionTreeClassifier`).

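The tree's hyperparameters (`criterion`, `max_depth`, `min_samples_leaf`, `min_samples_split`, `min_weight_fraction_leaf`, `max_leaf_nodes`, `ccp_alpha`) are tuned using Optuna.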

### Imports & settings

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, LeaveOneOut, train_test_split

import optuna

# Search-space bounds for the DecisionTreeClassifier hyperparameters.
# Each MIN_*/MAX_* pair is passed as the low/high of a `trial.suggest_*` call
# in `objective`.

# max_depth
MIN_MD = 2
MAX_MD = 1500

# min_samples_split
MIN_SS = 2
MAX_SS = 100

# min_samples_leaf
MIN_SL = 1
MAX_SL = 150

# min_weight_fraction_leaf
MIN_WFL = 0
MAX_WFL = 0.3

# max_leaf_nodes
MIN_LN = 2
MAX_LN = 8000

# ccp_alpha
MIN_CCP = 0
MAX_CCP = 0.5


# Optuna budget. `None` is Optuna's documented "no limit" value for
# `n_trials` (the parameter is typed `int | None`), so the search is bounded
# only by the wall-clock timeout below.
N_TRIALS = None
TIMEOUT = 60 * 60 * 6  # seconds (6 hours)

# Columns removed from the raw data before preprocessing.
DROP_LIST = ["PassengerId", "Name"]


### Get train dataset

In [None]:
# Load the training split from the local data directory; the bare expression
# on the last line renders the frame as the cell output.
titanic = pd.read_csv("./data/train.csv")
titanic

## Dataset INFO

In [None]:
# Per-column dtypes and non-null counts: shows which columns contain NaN.
titanic.info()

### Drop columns
Drop the columns that are difficult to process (`PassengerId` and `Name`).

In [None]:
# Remove the columns listed in DROP_LIST.
# NOTE: `titanic` is rebound here, so re-running this cell without reloading
# the data raises KeyError (the columns are already gone).
titanic = titanic.drop(DROP_LIST, axis=1)
titanic

### Prepare data for algorithm
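* In `Cabin` keep only the first letter (the sector in which the cabin is placed) and convert it to a number (`A` → 0, `B` → 1, …)
* From `Ticket` keep only the digits and store the first digit as a new `Ticket_class` column (only for tickets with more than one digit); then drop `Ticket`
* Encode `Sex` and `Embarked` as integers using LabelEncoder
* Fill NaN values in `Age` (and in every other column) with the mean of that column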

In [None]:
def preprocessing(data):
    """Turn the raw Titanic columns into purely numeric features, in place.

    Steps:
      * ``Cabin``  -> index of its first letter (``A`` -> 0, ``B`` -> 1, ...).
      * ``Ticket`` -> new ``Ticket_class`` column holding the first digit of
        the ticket's digits; the ``Ticket`` column itself is dropped.
      * ``Sex`` and ``Embarked`` -> integer codes from ``LabelEncoder``.
      * Every NaN is replaced by the mean of its column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the ``Cabin``, ``Ticket``, ``Sex``,
        ``Embarked`` and ``Age`` columns. It is modified in place; callers
        may ignore the return value.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    # Element-wise map on the single column instead of a row-wise apply.
    # `na_action="ignore"` leaves missing cabins as NaN, replacing the old
    # check for the string "nan" (which also swallowed any cabin starting
    # with a lowercase "n").
    data["Cabin"] = data["Cabin"].map(
        lambda cabin: ord(str(cabin)[0]) - ord("A"), na_action="ignore"
    )

    # Keep only the digits of each ticket (same pattern as before, applied
    # with the vectorized string accessor so no `re` import is needed).
    ticket_digits = data["Ticket"].astype(str).str.replace(r"[^0-9]", "", regex=True)
    # NOTE(review): tickets with zero or exactly one digit become NaN because
    # of the `> 1` threshold -- kept as in the original; confirm it is intended.
    data["Ticket_class"] = ticket_digits.map(
        lambda digits: int(digits[0]) if len(digits) > 1 else np.nan
    )
    data["Ticket_class"] = data["Ticket_class"].fillna(data["Ticket_class"].mean())
    data.drop(columns=["Ticket"], inplace=True)

    # A fresh encoder per column; each column gets its own integer codes.
    for column in ("Sex", "Embarked"):
        data[column] = LabelEncoder().fit_transform(data[column])

    # Assign the filled column back instead of `data[col].fillna(inplace=True)`:
    # the chained in-place form is deprecated in pandas 2.x and does not modify
    # `data` when Copy-on-Write is enabled.
    for column in ("Age", "Cabin"):
        data[column] = data[column].fillna(data[column].mean())

    # Catch-all for any other column that still has NaN.
    data.fillna(data.mean(numeric_only=True), inplace=True)

    return data

In [None]:
# `preprocessing` modifies `titanic` in place, so its return value (the same
# frame) is not needed here.
preprocessing(titanic)
titanic

### Split columns
to create datasets to predict and validate predictions.

`titanic_x` stays a pandas DataFrame: the cast to a NumPy array was only needed for the "categorical-feature" parameter of MixedNB, which this notebook no longer uses.

In [None]:
# Features: every column except the target. Kept as a DataFrame because the
# cross-validation cells index it with `.iloc`.
titanic_x = titanic.drop(["Survived"], axis=1)
# Target column.
titanic_y = titanic["Survived"]
titanic_x

Baseline score of an untuned decision tree (500-fold cross-validation):

In [None]:
# Baseline: default-parameter decision tree scored with shuffled 500-fold
# cross-validation; the printed number is the mean of the per-fold scores.
kf = KFold(n_splits=500, shuffle=True, random_state=1)
model = DecisionTreeClassifier(random_state=0)

acc = []
for train_index, test_index in kf.split(titanic_x):
    X_train, y_train = titanic_x.iloc[train_index], titanic_y.iloc[train_index]
    X_test, y_test = titanic_x.iloc[test_index], titanic_y.iloc[test_index]

    # `fit` returns the estimator itself, so the fold score can be chained.
    acc.append(model.fit(X_train, y_train).score(X_test, y_test))

print(np.mean(acc))

In [None]:
def objective(trial):
    """Optuna objective: mean 10-fold CV score of one sampled decision tree.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the tree's hyperparameters within the
        MIN_*/MAX_* bounds from the settings cell.

    Returns
    -------
    float
        Mean of ``model.score`` over the 10 folds of ``titanic_x`` /
        ``titanic_y``.
    """
    folds = KFold(n_splits=10, shuffle=True, random_state=8)

    # Keys double as the Optuna parameter names and as the keyword arguments
    # of DecisionTreeClassifier.
    params = {
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        "max_depth": trial.suggest_int("max_depth", MIN_MD, MAX_MD),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", MIN_SL, MAX_SL),
        "min_samples_split": trial.suggest_int("min_samples_split", MIN_SS, MAX_SS),
        "min_weight_fraction_leaf": trial.suggest_float(
            "min_weight_fraction_leaf", MIN_WFL, MAX_WFL
        ),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", MIN_LN, MAX_LN),
        "ccp_alpha": trial.suggest_float("ccp_alpha", MIN_CCP, MAX_CCP),
    }
    tree = DecisionTreeClassifier(random_state=10, **params)

    fold_scores = []
    for fit_rows, eval_rows in folds.split(titanic_x):
        tree.fit(titanic_x.iloc[fit_rows], titanic_y.iloc[fit_rows])
        fold_scores.append(
            tree.score(titanic_x.iloc[eval_rows], titanic_y.iloc[eval_rows])
        )

    return np.mean(fold_scores)

In [None]:
# Search for the hyperparameters that maximize the value returned by
# `objective` (mean cross-validated score).
study = optuna.create_study(direction="maximize")
# The search budget (TIMEOUT, N_TRIALS) comes from the settings cell.
study.optimize(objective, timeout=TIMEOUT, n_trials=N_TRIALS, n_jobs=-1)
print(f"BEST TRIAL: {study.best_trial}")

In [None]:
# Slice plot of the finished study, one panel per sampled hyperparameter.
optuna.visualization.plot_slice(study).show()

## Generate test output

In [None]:
# titanic_test = pd.read_csv("./data/test.csv")
# ids = titanic_test["PassengerId"]
# print(f"SHAPE: {titanic_test.shape}")
# titanic_test = titanic_test.drop(DROP_LIST, axis=1)
# preprocessing(titanic_test)
# titanic_test

In [None]:
# model = DecisionTreeClassifier(random_state=10, **study.best_trial.params)
# model.fit(titanic_x, titanic_y)
# survived = model.predict(titanic_test)
# survived = pd.DataFrame(survived, columns=["Survived"])
# survived["PassengerId"] = ids
# survived.to_csv("./results.csv", index=False)