# Titanic competition - GaussianNB

In this approach I will try to reach great score using simple Mixed GaussianNB algorithm.

`var_smoothing` parameter is adjusted using Optuna

### Imports & settings

In [174]:
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.naive_bayes import GaussianNB
from mixed_naive_bayes import MixedNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, LeaveOneOut

import optuna

MIN_VS = 1e-50
MAX_VS = 1e-4

MIN_ALPHA = 0
MAX_ALPHA = 1

N_TRIALS = 100


### Get train dataset

In [175]:
titanic = pd.read_csv("./data/train.csv")
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Dataset INFO

In [176]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### Drop columns
Which are difficult to process

In [177]:
drop_list = ["PassengerId", "Name", "Ticket"]
titanic = titanic.drop(drop_list, axis=1)
titanic

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.2500,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.9250,,S
3,1,1,female,35.0,1,0,53.1000,C123,S
4,0,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,,S
887,1,1,female,19.0,0,0,30.0000,B42,S
888,0,3,female,,1,2,23.4500,,S
889,1,1,male,26.0,0,0,30.0000,C148,C


### Prepare data for algorithm
* In `Cabin` leave only firs letter (which means sector in which cabin is placed)
* Adjust labels for `Sex`, `Embarked` and `Cabin` using LabelEncoder
* Fill NaN values in `Age` with mean of this column

In [178]:
titanic["Cabin"] = titanic.apply(lambda row: ord(str(row["Cabin"])[0])-ord('A') if str(row["Cabin"])[0] != "n" else np.nan, axis=1)

le = LabelEncoder()
titanic["Sex"] = le.fit_transform(titanic["Sex"])
titanic["Embarked"] = le.fit_transform(titanic["Embarked"])


titanic["Age"].fillna(titanic["Age"].mean(), inplace=True)
titanic["Cabin"].fillna(titanic["Cabin"].mean(), inplace=True)

titanic

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.000000,1,0,7.2500,2.45098,2
1,1,1,0,38.000000,1,0,71.2833,2.00000,0
2,1,3,0,26.000000,0,0,7.9250,2.45098,2
3,1,1,0,35.000000,1,0,53.1000,2.00000,2
4,0,3,1,35.000000,0,0,8.0500,2.45098,2
...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,13.0000,2.45098,2
887,1,1,0,19.000000,0,0,30.0000,1.00000,2
888,0,3,0,29.699118,1,2,23.4500,2.45098,2
889,1,1,1,26.000000,0,0,30.0000,2.00000,0


### Split columns
to create datasets to predict and validate predictions.

`titanic_x` has to be casted to numpy array because of "categorical-feature" parameter of MixedNB.

In [179]:
titanic_x = titanic.drop(["Survived"], axis=1)#.to_numpy()
titanic_y = titanic["Survived"]
titanic_x

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,1,22.000000,1,0,7.2500,2.45098,2
1,1,0,38.000000,1,0,71.2833,2.00000,0
2,3,0,26.000000,0,0,7.9250,2.45098,2
3,1,0,35.000000,1,0,53.1000,2.00000,2
4,3,1,35.000000,0,0,8.0500,2.45098,2
...,...,...,...,...,...,...,...,...
886,2,1,27.000000,0,0,13.0000,2.45098,2
887,1,0,19.000000,0,0,30.0000,1.00000,2
888,3,0,29.699118,1,2,23.4500,2.45098,2
889,1,1,26.000000,0,0,30.0000,2.00000,0


Categorical columns are:

In [129]:
CATEGORICAL = [1, 6, 7]

In [130]:
# model = MixedNB(categorical_features=CATEGORICAL)
# acc = []
#
# kf = KFold(n_splits=500, shuffle=True, random_state=1)
# for train_index, test_index in kf.split(titanic_x):
#     X_train, X_test = titanic_x[train_index, :], titanic_x[test_index, :]
#     y_train, y_test = titanic_y.iloc[train_index], titanic_y.iloc[test_index]
#
#     model.fit(X_train, y_train)
#     acc.append(model.score(X_test, y_test))
#
# print(np.mean(acc))

In [180]:
def objective(trial):
    kf = KFold(n_splits=10, shuffle=True, random_state=10)
    var_smth = trial.suggest_float("var_smth", MIN_VS, MAX_VS, log=True)
    # alpha = trial.suggest_float("alpha",MIN_ALPHA, MAX_ALPHA)
    # model = MixedNB(categorical_features=CATEGORICAL, var_smoothing=var_smth, alpha=alpha)
    model = GaussianNB(var_smoothing=var_smth)
    acc = []


    for train_index, test_index in kf.split(titanic_x):
        X_train, X_test = titanic_x.iloc[train_index, :], titanic_x.iloc[test_index, :]
        y_train, y_test = titanic_y.iloc[train_index], titanic_y.iloc[test_index]

        model.fit(X_train, y_train)
        acc.append(model.score(X_test, y_test))
    return np.mean(acc)

In [181]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=N_TRIALS, n_jobs=-1)
print(f"BEST TRIAL: {study.best_trial}")

# optuna.visualization.plot_slice(study).show()

[32m[I 2022-05-18 00:38:51,461][0m A new study created in memory with name: no-name-ddd16656-e871-468b-bb93-44ecc95f41c6[0m

`n_jobs` argument has been deprecated in v2.7.0. This feature will be removed in v4.0.0. See https://github.com/optuna/optuna/releases/tag/v2.7.0.

[32m[I 2022-05-18 00:38:53,033][0m Trial 2 finished with value: 0.7889887640449438 and parameters: {'var_smth': 2.1332748781324394e-44}. Best is trial 2 with value: 0.7889887640449438.[0m
[32m[I 2022-05-18 00:38:53,093][0m Trial 1 finished with value: 0.7844694132334582 and parameters: {'var_smth': 1.7154879764145393e-05}. Best is trial 2 with value: 0.7889887640449438.[0m
[32m[I 2022-05-18 00:38:53,203][0m Trial 8 finished with value: 0.7889887640449438 and parameters: {'var_smth': 1.3104356498883945e-48}. Best is trial 2 with value: 0.7889887640449438.[0m
[32m[I 2022-05-18 00:38:53,252][0m Trial 6 finished with value: 0.7889887640449438 and parameters: {'var_smth': 1.7956224886361652e-17}. Best is tria

BEST TRIAL: FrozenTrial(number=2, values=[0.7889887640449438], datetime_start=datetime.datetime(2022, 5, 18, 0, 38, 51, 479863), datetime_complete=datetime.datetime(2022, 5, 18, 0, 38, 53, 32734), params={'var_smth': 2.1332748781324394e-44}, distributions={'var_smth': LogUniformDistribution(high=0.0001, low=1e-50)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=2, state=TrialState.COMPLETE, value=None)


In [182]:
optuna.visualization.plot_slice(study).show()

## Generate test output

In [None]:
titanic = pd.read_csv("./data/test.csv")