# Titanic competition - GaussianNB

In this approach I will try to reach a great score using a simple Mixed GaussianNB algorithm.

The `var_smoothing` parameter is tuned using Optuna.

### Imports & settings

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.naive_bayes import GaussianNB
from mixed_naive_bayes import MixedNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, LeaveOneOut, train_test_split

import optuna

# Search range for the `var_smth` value sampled (on a log scale) in `objective`
# and passed to GaussianNB as `var_smoothing`.
MIN_VS = 1e-50
MAX_VS = 1e-4

# Number of Optuna trials passed to `study.optimize`.
N_TRIALS = 500

# Columns removed from both the train and the test frame before preprocessing.
DROP_LIST = ["PassengerId", "Name"]


### Get train dataset

In [None]:
# Load the training set; the bare name on the last line displays it in the notebook.
titanic = pd.read_csv("./data/train.csv")
titanic

## Dataset INFO

In [None]:
# Print the column dtypes and non-null counts to spot columns with missing values.
titanic.info()

### Drop columns
The columns listed in `DROP_LIST` (`PassengerId` and `Name`) are difficult to process, so they are removed.

In [None]:
# Remove the columns named in DROP_LIST; `drop` returns a new frame, so rebind the name.
titanic = titanic.drop(DROP_LIST, axis=1)
titanic

### Prepare data for algorithm
* In `Cabin` keep only the first letter (which denotes the sector in which the cabin is located)
* Adjust labels for `Sex`, `Embarked` and `Cabin` using LabelEncoder
* Fill NaN values in `Age` with mean of this column

In [None]:
def preprocessing(data):
    """Convert the raw Titanic columns of ``data`` to numeric features, in place.

    Steps:

    * ``Cabin`` becomes the zero-based alphabet index of its first letter
      (``"A"`` -> 0, ``"B"`` -> 1, ...); missing cabins stay NaN until the
      final imputation.
    * ``Ticket`` is replaced by a new ``Ticket_class`` column holding the
      first digit that occurs in the ticket string (NaN when the ticket
      contains no digit at all).
    * ``Sex`` and ``Embarked`` are replaced by integer labels produced by
      ``LabelEncoder``.
    * Every remaining NaN is replaced by the mean of its column.

    NOTE: the label encoders and the imputation means are computed from the
    frame passed in, so calling this separately on train and test data uses
    statistics of each frame on its own.

    Args:
        data: DataFrame containing at least the columns ``Cabin``,
            ``Ticket``, ``Sex`` and ``Embarked``. It is modified in place.

    Returns:
        The same ``data`` object, for convenience.
    """

    def _deck_index(cabin):
        # Use a real missing-value check instead of testing whether the
        # string form happens to start with "n" (as in "nan").
        if pd.isna(cabin):
            return np.nan
        text = str(cabin)
        if not text:
            return np.nan
        return ord(text[0]) - ord("A")

    data["Cabin"] = data["Cabin"].map(_deck_index)

    # First digit anywhere in the ticket string. `astype(str)` keeps this
    # working when the column was parsed as numbers or contains NaN, and a
    # ticket made of a single digit now keeps that digit (the previous
    # `len(...) > 1` check discarded it).
    first_digit = data["Ticket"].astype(str).str.extract(r"([0-9])", expand=False)
    data["Ticket_class"] = pd.to_numeric(first_digit, errors="coerce")
    data.drop(columns=["Ticket"], inplace=True)

    # A fresh encoder per column; each call to fit_transform refits anyway.
    data["Sex"] = LabelEncoder().fit_transform(data["Sex"])
    data["Embarked"] = LabelEncoder().fit_transform(data["Embarked"])

    # Assign the filled columns back instead of calling
    # `data[col].fillna(..., inplace=True)`: the chained in-place form acts
    # on a temporary object and is a no-op under pandas Copy-on-Write.
    for column in ("Ticket_class", "Age", "Cabin"):
        if column in data.columns:
            data[column] = data[column].fillna(data[column].mean())

    # Catch-all for any other numeric column that still has gaps.
    data.fillna(data.mean(numeric_only=True), inplace=True)

    return data

In [None]:
# `preprocessing` modifies `titanic` in place, so its return value is not needed here.
preprocessing(titanic)
titanic

### Split columns
to create datasets to predict and validate predictions.

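`titanic_x` has to be cast to a NumPy array when using the `categorical_features` parameter of MixedNB. The conversion is currently commented out; the cells below use plain `GaussianNB` on the DataFrame.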

In [None]:
# Features: every column except the target. The `.to_numpy()` conversion is left
# commented out, so `titanic_x` stays a DataFrame (later cells index it with `.iloc`).
titanic_x = titanic.drop(["Survived"], axis=1)#.to_numpy()
# Target: the `Survived` column.
titanic_y = titanic["Survived"]
titanic_x

Categorical columns are:

In [None]:
# Disabled experiment: K-fold evaluation of MixedNB, kept for reference only.
# It cannot be re-enabled as-is: `CATEGORICAL` is not defined in the visible cells,
# and the `titanic_x[train_index, :]` indexing expects a NumPy array, whereas
# `titanic_x` is currently a DataFrame.
#
# model = MixedNB(categorical_features=CATEGORICAL)
# acc = []
#
# kf = KFold(n_splits=500, shuffle=True, random_state=1)
# for train_index, test_index in kf.split(titanic_x):
#     X_train, X_test = titanic_x[train_index, :], titanic_x[test_index, :]
#     y_train, y_test = titanic_y.iloc[train_index], titanic_y.iloc[test_index]
#
#     model.fit(X_train, y_train)
#     acc.append(model.score(X_test, y_test))
#
# print(np.mean(acc))

In [None]:
def objective(trial):
    """Optuna objective: mean 10-fold score of GaussianNB for one sampled smoothing value.

    Args:
        trial: Optuna trial used to sample ``var_smth`` log-uniformly from
            ``[MIN_VS, MAX_VS]``.

    Returns:
        The mean of ``model.score`` over the ten held-out folds of the
        module-level ``titanic_x`` / ``titanic_y``.
    """
    smoothing = trial.suggest_float("var_smth", MIN_VS, MAX_VS, log=True)
    # A MixedNB variant that additionally tuned `alpha` is not used here;
    # only the plain Gaussian model is evaluated.
    classifier = GaussianNB(var_smoothing=smoothing)

    # Fixed seed so that every trial is scored on the same ten splits.
    splitter = KFold(n_splits=10, shuffle=True, random_state=8)

    fold_scores = []
    for fit_idx, eval_idx in splitter.split(titanic_x):
        classifier.fit(titanic_x.iloc[fit_idx, :], titanic_y.iloc[fit_idx])
        fold_scores.append(
            classifier.score(titanic_x.iloc[eval_idx, :], titanic_y.iloc[eval_idx])
        )

    return np.mean(fold_scores)

In [None]:
# `objective` returns a score to be maximised, hence direction="maximize".
study = optuna.create_study(direction="maximize")
# Run N_TRIALS trials; n_jobs=-1 asks Optuna to run them in parallel.
study.optimize(objective, n_trials=N_TRIALS, n_jobs=-1)
print(f"BEST TRIAL: {study.best_trial}")

# optuna.visualization.plot_slice(study).show()

In [None]:
# Slice plot of the objective value against the sampled parameter (`var_smth`).
optuna.visualization.plot_slice(study).show()

## Generate test output

In [None]:
titanic_test = pd.read_csv("./data/test.csv")
# Keep the passenger ids for the submission file before DROP_LIST removes the column.
ids = titanic_test["PassengerId"]
print(f"SHAPE: {titanic_test.shape}")
titanic_test = titanic_test.drop(DROP_LIST, axis=1)
# NOTE: `preprocessing` works on the frame it is given, so the imputation means and
# label encodings used here are derived from the test set itself, not from the train set.
preprocessing(titanic_test)
titanic_test

In [15]:
# Refit on the complete training set using the best smoothing value found by the study.
best_smoothing = study.best_trial.params["var_smth"]
model = GaussianNB(var_smoothing=best_smoothing)
model.fit(titanic_x, titanic_y)

# Build the submission table (predictions first, then the saved passenger ids)
# and write it without the index column.
survived = pd.DataFrame(
    {"Survived": model.predict(titanic_test), "PassengerId": ids}
)
survived.to_csv("./results.csv", index=False)