<a href="https://colab.research.google.com/github/bluepsm/kaggle-titanic-survival-prediction/blob/main/titanic_survival_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

!pip install ydf
import ydf

from sklearn.model_selection import cross_val_score

from ast import literal_eval

Collecting ydf
  Downloading ydf-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ydf
Successfully installed ydf-0.5.0


# Initialize and Set Options

In [None]:
# Show wide DataFrames on a single line and never truncate their columns.
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_columns', None)

# Seed NumPy's global generator so that np.random-based steps in this notebook
# (e.g. the random age imputation in the preprocessing functions) produce the
# same values on every "Restart & Run All".
random_seed = 42
np.random.seed(random_seed)

# Load Dataset

In [None]:
# Labelled training data, indexed by PassengerId.
# NOTE(review): the paths point into /content/drive/MyDrive — presumably a
# Google Drive mounted in Colab; adjust them when running elsewhere.
dataset = pd.read_csv('/content/drive/MyDrive/datasets/titanic/train.csv',
                      index_col='PassengerId')
# Test data to predict on, indexed the same way.
pred_dataset = pd.read_csv('/content/drive/MyDrive/datasets/titanic/test.csv',
                           index_col='PassengerId')

# Data Pre-Processing


## Convert to Categorical Features with Equal-Frequency Buckets

In [None]:
def data_preprocessing_cat_freq_bin(dataset, random_state=None):
  """Build the string-labelled (categorical) feature table from raw Titanic data.

  The input frame is left untouched; all work happens on a copy.

  Processing steps:
    * Missing ``Age`` values are replaced by random integers drawn from
      ``[int(mean - std), int(mean + std))`` of the observed ages; ``Age`` is
      then truncated to ``int`` and bucketed into ``AgeGroup``.
    * Missing ``Fare`` values are replaced by the mean fare; ``Fare`` is then
      bucketed into ``FareGroup``.
    * ``Relatives`` is ``SibSp + Parch``; ``Alone`` is 'Yes' when it is 0 and
      'No' when it is positive.
    * Missing ``Embarked`` values take the most frequent value.
    * ``Deck`` is the first character of ``Cabin`` ('Unknown' when missing).
    * ``Title`` is extracted from ``Name``; a fixed list of uncommon titles is
      collapsed into 'Rare' and Mlle/Ms/Mme are merged into Miss/Mrs.
    * ``Name``, ``Cabin``, ``Ticket``, ``SibSp``, ``Parch``, ``Age`` and
      ``Fare`` are dropped from the result.

  All statistics (mean, std, mode) are computed on the frame that is passed
  in, so train and test frames are imputed independently.

  Args:
    dataset: DataFrame containing at least the columns ``Name``, ``Age``,
      ``Fare``, ``SibSp``, ``Parch``, ``Embarked``, ``Cabin`` and ``Ticket``.
    random_state: Optional seed for the age imputation. When ``None`` (the
      default) NumPy's global generator is used, so ``np.random.seed`` still
      controls the result.

  Returns:
    A new DataFrame with the engineered columns ``AgeGroup``, ``FareGroup``,
    ``Relatives``, ``Alone``, ``Deck`` and ``Title`` added and the raw columns
    listed above removed.
  """
  dataset = dataset.copy()

  # np.random and RandomState expose the same randint API.
  rng = np.random if random_state is None else np.random.RandomState(random_state)

  # Only draw replacement ages when something is actually missing, so frames
  # without gaps do not depend on mean/std being well defined.
  age_missing = dataset['Age'].isnull()
  if age_missing.any():
    age_mean = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    # randint works on integers; truncate the bounds explicitly.
    dataset.loc[age_missing, 'Age'] = rng.randint(
        int(age_mean - age_std), int(age_mean + age_std),
        size=int(age_missing.sum()))
  dataset['Age'] = dataset['Age'].astype(int)

  # Each bucket is (previous upper bound, upper bound].
  age_upper_bounds = [18, 22, 28, 33, 41.5, np.inf]
  age_labels = ['<18', '18-22', '22-28', '28-33', '33-42', '>42']
  lower = -np.inf
  for upper, label in zip(age_upper_bounds, age_labels):
    in_bucket = (dataset['Age'] > lower) & (dataset['Age'] <= upper)
    dataset.loc[in_bucket, 'AgeGroup'] = label
    lower = upper

  # Assign the filled column back instead of calling fillna(inplace=True) on
  # a column selection, which does not update the frame under Copy-on-Write.
  dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].mean())
  fare_upper_bounds = [7.775, 8.662, 14.454, 26, 52.554, np.inf]
  fare_labels = ['<7.77', '7.77-8.66', '8.66-14.45', '14.45-26.00',
                 '26.00-52.55', '>52.55']
  lower = -np.inf
  for upper, label in zip(fare_upper_bounds, fare_labels):
    in_bucket = (dataset['Fare'] > lower) & (dataset['Fare'] <= upper)
    dataset.loc[in_bucket, 'FareGroup'] = label
    lower = upper

  dataset['Relatives'] = dataset['SibSp'] + dataset['Parch']

  dataset.loc[dataset['Relatives'] > 0, 'Alone'] = 'No'
  dataset.loc[dataset['Relatives'] == 0, 'Alone'] = 'Yes'

  dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])

  dataset['Deck'] = dataset['Cabin'].str.slice(0, 1).fillna('Unknown')

  # Raw string so that "\." reaches the regex engine as an escaped dot.
  dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
  dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess', 'Capt',
                                               'Col', 'Don', 'Dr', 'Major',
                                               'Rev', 'Sir', 'Jonkheer',
                                               'Dona', 'Master'], 'Rare')
  dataset['Title'] = dataset['Title'].replace(
      {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

  return dataset.drop(
      columns=['Name', 'Cabin', 'Ticket', 'SibSp', 'Parch', 'Age', 'Fare'])

In [None]:
# Run the categorical preprocessing on the training frame first and then on
# the frame to predict on.
cat_dataset, pred_cat_dataset = (
    data_preprocessing_cat_freq_bin(frame) for frame in (dataset, pred_dataset)
)

# Preview the processed training table.
cat_dataset

Unnamed: 0_level_0,Survived,Pclass,Sex,Embarked,AgeGroup,FareGroup,Relatives,Alone,Deck,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,male,S,18-22,<7.77,1,No,Unknown,Mr
2,1,1,female,C,33-42,>52.55,1,No,C,Mrs
3,1,3,female,S,22-28,7.77-8.66,0,Yes,Unknown,Miss
4,1,1,female,S,33-42,>52.55,1,No,C,Mrs
5,0,3,male,S,33-42,7.77-8.66,0,Yes,Unknown,Mr
...,...,...,...,...,...,...,...,...,...,...
887,0,2,male,S,22-28,8.66-14.45,0,Yes,Unknown,Rare
888,1,1,female,S,18-22,26.00-52.55,0,Yes,B,Miss
889,0,3,female,S,22-28,14.45-26.00,3,No,Unknown,Miss
890,1,1,male,C,22-28,26.00-52.55,0,Yes,C,Mr


## Convert to Numerical Features with Equal-Frequency Buckets

In [None]:
def data_preprocessing_numeric_freq_bin(dataset, random_state=None):
  """Build the integer-encoded feature table from raw Titanic data.

  The input frame is left untouched; all work happens on a copy.

  Processing steps:
    * Missing ``Age`` values are replaced by random integers drawn from
      ``[int(mean - std), int(mean + std))`` of the observed ages; ``Age`` is
      then truncated to ``int`` and bucketed into ``AgeGroup`` (0-5).
    * Missing ``Fare`` values are replaced by the mean fare; ``Fare`` is then
      bucketed into ``FareGroup`` (0-5).
    * ``Relatives`` is ``SibSp + Parch``; ``Alone`` is 1 when it is 0, else 0.
    * Missing ``Embarked`` values take the most frequent value; ports are
      encoded S=0, C=1, Q=2.
    * ``Deck`` is the first character of ``Cabin`` encoded through a lookup
      table; missing cabins and letters not in the table become 0.
    * ``Title`` is extracted from ``Name``, uncommon titles are collapsed into
      'Rare', and the result is encoded Mr=0, Miss=1, Mrs=2, Rare=3. Titles
      that are still unmapped (or names without a title) fall back to the
      'Rare' code instead of failing the integer cast.
    * ``Sex`` is encoded male=0, female=1.
    * ``Name``, ``Cabin``, ``Ticket``, ``SibSp``, ``Parch``, ``Age`` and
      ``Fare`` are dropped from the result.

  All statistics (mean, std, mode) are computed on the frame that is passed
  in, so train and test frames are imputed independently.

  Args:
    dataset: DataFrame containing at least the columns ``Name``, ``Sex``,
      ``Age``, ``Fare``, ``SibSp``, ``Parch``, ``Embarked``, ``Cabin`` and
      ``Ticket``.
    random_state: Optional seed for the age imputation. When ``None`` (the
      default) NumPy's global generator is used, so ``np.random.seed`` still
      controls the result.

  Returns:
    A new DataFrame in which every engineered column is integer-typed and the
    raw columns listed above are removed.
  """
  dataset = dataset.copy()

  # np.random and RandomState expose the same randint API.
  rng = np.random if random_state is None else np.random.RandomState(random_state)

  # Only draw replacement ages when something is actually missing, so frames
  # without gaps do not depend on mean/std being well defined.
  age_missing = dataset['Age'].isnull()
  if age_missing.any():
    age_mean = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    # randint works on integers; truncate the bounds explicitly.
    dataset.loc[age_missing, 'Age'] = rng.randint(
        int(age_mean - age_std), int(age_mean + age_std),
        size=int(age_missing.sum()))
  dataset['Age'] = dataset['Age'].astype(int)

  # Bucket i covers (previous upper bound, upper bound].
  lower = -np.inf
  for code, upper in enumerate([18, 22, 28, 33, 41.5, np.inf]):
    in_bucket = (dataset['Age'] > lower) & (dataset['Age'] <= upper)
    dataset.loc[in_bucket, 'AgeGroup'] = code
    lower = upper
  dataset['AgeGroup'] = dataset['AgeGroup'].astype(int)

  # Assign the filled column back instead of calling fillna(inplace=True) on
  # a column selection, which does not update the frame under Copy-on-Write.
  dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].mean())
  lower = -np.inf
  for code, upper in enumerate([7.775, 8.662, 14.454, 26, 52.554, np.inf]):
    in_bucket = (dataset['Fare'] > lower) & (dataset['Fare'] <= upper)
    dataset.loc[in_bucket, 'FareGroup'] = code
    lower = upper
  dataset['FareGroup'] = dataset['FareGroup'].astype(int)

  dataset['Relatives'] = dataset['SibSp'] + dataset['Parch']
  dataset['Alone'] = (dataset['Relatives'] == 0).astype(int)

  ports = {"S": 0, "C": 1, "Q": 2}
  dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])
  dataset['Embarked'] = dataset['Embarked'].map(ports).astype(int)

  # Missing cabins are NaN at lookup time, so they end up as 0 via fillna;
  # the "U" entry only applies to a cabin string that starts with "U".
  deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
  dataset['Deck'] = (
      dataset['Cabin'].str.slice(0, 1).map(deck).fillna(0).astype(int))

  # Raw string so that "\." reaches the regex engine as an escaped dot.
  dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
  dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess', 'Capt',
                                               'Col', 'Don', 'Dr', 'Major',
                                               'Rev', 'Sir', 'Jonkheer',
                                               'Dona', 'Master'], 'Rare')
  dataset['Title'] = dataset['Title'].replace(
      {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
  title = {"Mr": 0, "Miss": 1, "Mrs": 2, "Rare": 3}
  # Anything outside the table maps to NaN; treat it as 'Rare' so that the
  # integer cast cannot fail on an unseen title.
  dataset['Title'] = (
      dataset['Title'].map(title).fillna(title['Rare']).astype(int))

  genders = {"male": 0, "female": 1}
  dataset['Sex'] = dataset['Sex'].map(genders).astype(int)

  return dataset.drop(
      columns=['Name', 'Cabin', 'Ticket', 'SibSp', 'Parch', 'Age', 'Fare'])

In [None]:
# Run the numeric preprocessing on the training frame first and then on the
# frame to predict on.
num_dataset, pred_num_dataset = (
    data_preprocessing_numeric_freq_bin(frame) for frame in (dataset, pred_dataset)
)

# Preview the processed training table.
num_dataset

Unnamed: 0_level_0,Survived,Pclass,Sex,Embarked,AgeGroup,FareGroup,Relatives,Alone,Deck,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,3,0,0,1,0,1,0,0,0
2,1,1,1,1,4,5,1,0,3,2
3,1,3,1,0,2,1,0,1,0,1
4,1,1,1,0,4,5,1,0,3,2
5,0,3,0,0,4,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
887,0,2,0,0,2,2,0,1,0,3
888,1,1,1,0,1,4,0,1,2,1
889,0,3,1,0,0,3,3,0,0,1
890,1,1,0,1,2,4,0,1,3,0


# Define Model

## Simple Random Forest Model

In [None]:
# Baseline Random Forest learner: predicts the 'Survived' column and, since no
# feature list is given, leaves feature selection to the library defaults.
# compute_oob_performances=True asks the learner to compute out-of-bag
# performance during training.
simple_rf = ydf.RandomForestLearner(
    label='Survived',
    compute_oob_performances=True
)

### Train on Categorical Dataset

In [None]:
# Take the last processed passenger and turn that single row back into a
# one-row DataFrame (Series -> one-column frame -> transposed).
example = cat_dataset.iloc[-1].to_frame().T
example

Unnamed: 0,Survived,Pclass,Sex,Embarked,AgeGroup,FareGroup,Relatives,Alone,Deck,Title
891,0,3,male,Q,28-33,<7.77,0,Yes,Unknown,Mr


In [None]:
# Fit the baseline Random Forest learner on the string-labelled feature table.
simple_rf_cat = simple_rf.train(cat_dataset)

# Show the trained model's description as the cell output.
simple_rf_cat.describe()

Train model on 891 examples
Model trained in 0:00:00.626685


### Train on Numeric Dataset

In [None]:
# Fit the baseline Random Forest learner on the integer-encoded feature table.
simple_rf_num = simple_rf.train(num_dataset)

# Show the trained model's description as the cell output.
simple_rf_num.describe()

Train model on 891 examples
Model trained in 0:00:00.850806


## Simple Random Forest Model with specific features

In [None]:
# Random Forest learner with an explicit feature list: every engineered column
# except 'Relatives' is declared categorical, 'Relatives' is numerical.
# 'Alone' is currently not part of the feature list.
simple_rf_custom = ydf.RandomForestLearner(
    label='Survived',
    features=[
        ydf.Feature(column, ydf.Semantic.CATEGORICAL)
        for column in ('Pclass', 'Sex', 'Embarked', 'AgeGroup',
                       'FareGroup', 'Deck', 'Title')
    ] + [ydf.Feature('Relatives', ydf.Semantic.NUMERICAL)],
)

### Train on Categorical Dataset

In [None]:
# Fit the explicit-feature Random Forest learner on the string-labelled table.
simple_rf_custom_cat = simple_rf_custom.train(cat_dataset)

# Show the trained model's description as the cell output.
simple_rf_custom_cat.describe()

Train model on 891 examples
Model trained in 0:00:00.985779


### Train on Numeric Dataset

In [None]:
# Fit the explicit-feature Random Forest learner on the integer-encoded table.
simple_rf_custom_num = simple_rf_custom.train(num_dataset)

# Show the trained model's description as the cell output.
simple_rf_custom_num.describe()

Train model on 891 examples
Model trained in 0:00:00.870440


## Simple Random Forest Model with specific features and use Tuner

In [None]:
# Random-search tuner: 50 trials over the automatically configured search space.
tuner = ydf.RandomSearchTuner(num_trials=50, automatic_search_space=True)

# Same explicit feature list as the untuned Random Forest learner, with the
# tuner attached so that training also searches hyper-parameters.
# 'Alone' is currently not part of the feature list.
simple_rf_custom_tuner = ydf.RandomForestLearner(
    label='Survived',
    features=[
        ydf.Feature(column, ydf.Semantic.CATEGORICAL)
        for column in ('Pclass', 'Sex', 'Embarked', 'AgeGroup',
                       'FareGroup', 'Deck', 'Title')
    ] + [ydf.Feature('Relatives', ydf.Semantic.NUMERICAL)],
    tuner=tuner,
)

### Train on Categorical Dataset

In [None]:
# Train with hyper-parameter search on the string-labelled table.
simple_rf_custom_tuner_cat = simple_rf_custom_tuner.train(cat_dataset)

# Tabulate the tuner trials, highest score first, indexed by score.
best_trial_simple_rf_custom_tuner_cat = (
    pd.DataFrame(simple_rf_custom_tuner_cat.hyperparameter_optimizer_logs().trials)
    .sort_values(by=['score'], ascending=[False])
    .set_index(['score'])
)

# Hyper-parameters of the top-ranked trial.
best_params_simple_rf_custom_tuner_cat = best_trial_simple_rf_custom_tuner_cat.iloc[0].params

# Values logged as the strings 'true'/'false' are converted in place to real
# booleans so the dict can be passed to a learner as keyword arguments.
for param_name, param_value in best_params_simple_rf_custom_tuner_cat.items():
  if param_value == 'true':
    best_params_simple_rf_custom_tuner_cat[param_name] = True
  elif param_value == 'false':
    best_params_simple_rf_custom_tuner_cat[param_name] = False

simple_rf_custom_tuner_cat.describe()

Train model on 891 examples
Model trained in 0:00:40.010777


trial,score,duration,split_axis,sparse_oblique_projection_density_factor,sparse_oblique_normalization,sparse_oblique_weights,categorical_algorithm,winner_take_all,max_depth,min_examples
20,0.830527,19.5094,SPARSE_OBLIQUE,2.0,NONE,BINARY,CART,True,30,10
4,0.830527,4.28772,SPARSE_OBLIQUE,1.0,MIN_MAX,CONTINUOUS,CART,True,20,10
5,0.830527,4.80895,SPARSE_OBLIQUE,2.0,NONE,BINARY,CART,True,20,10
38,0.830527,35.0553,SPARSE_OBLIQUE,5.0,MIN_MAX,CONTINUOUS,CART,True,16,10
46,0.830527,38.8418,SPARSE_OBLIQUE,5.0,MIN_MAX,BINARY,CART,True,25,10
45,0.830527,38.6002,SPARSE_OBLIQUE,3.0,STANDARD_DEVIATION,BINARY,CART,True,12,5
44,0.830527,38.3287,SPARSE_OBLIQUE,4.0,MIN_MAX,BINARY,CART,True,12,5
27,0.829405,24.7819,SPARSE_OBLIQUE,2.0,MIN_MAX,BINARY,CART,True,25,5
36,0.828283,34.1984,SPARSE_OBLIQUE,3.0,STANDARD_DEVIATION,BINARY,RANDOM,True,16,5
10,0.828283,10.244,SPARSE_OBLIQUE,4.0,STANDARD_DEVIATION,CONTINUOUS,RANDOM,True,16,5


### Train on Numeric Dataset

In [None]:
# Train with hyper-parameter search on the integer-encoded table.
simple_rf_custom_tuner_num = simple_rf_custom_tuner.train(num_dataset)

# Tabulate the tuner trials, highest score first, indexed by score.
best_trial_simple_rf_custom_tuner_num = (
    pd.DataFrame(simple_rf_custom_tuner_num.hyperparameter_optimizer_logs().trials)
    .sort_values(by=['score'], ascending=[False])
    .set_index(['score'])
)

# Hyper-parameters of the top-ranked trial.
best_params_simple_rf_custom_tuner_num = best_trial_simple_rf_custom_tuner_num.iloc[0].params

# Values logged as the strings 'true'/'false' are converted in place to real
# booleans so the dict can be passed to a learner as keyword arguments.
for param_name, param_value in best_params_simple_rf_custom_tuner_num.items():
  if param_value == 'true':
    best_params_simple_rf_custom_tuner_num[param_name] = True
  elif param_value == 'false':
    best_params_simple_rf_custom_tuner_num[param_name] = False

simple_rf_custom_tuner_num.describe()

Train model on 891 examples
Model trained in 0:00:23.691550


trial,score,duration,split_axis,sparse_oblique_projection_density_factor,sparse_oblique_normalization,sparse_oblique_weights,categorical_algorithm,winner_take_all,max_depth,min_examples
2,0.838384,0.99841,SPARSE_OBLIQUE,4.0,MIN_MAX,CONTINUOUS,RANDOM,True,16,10
49,0.837261,23.673,SPARSE_OBLIQUE,3.0,STANDARD_DEVIATION,BINARY,RANDOM,True,25,10
14,0.837261,6.03437,SPARSE_OBLIQUE,3.0,STANDARD_DEVIATION,BINARY,RANDOM,True,30,10
6,0.837261,2.3384,SPARSE_OBLIQUE,5.0,MIN_MAX,BINARY,RANDOM,True,20,10
12,0.837261,5.3201,SPARSE_OBLIQUE,2.0,MIN_MAX,BINARY,RANDOM,True,30,10
48,0.836139,22.8779,SPARSE_OBLIQUE,5.0,NONE,BINARY,RANDOM,True,12,10
4,0.833894,1.62904,SPARSE_OBLIQUE,1.0,MIN_MAX,CONTINUOUS,CART,True,20,10
20,0.832772,9.58301,SPARSE_OBLIQUE,2.0,NONE,BINARY,CART,True,30,10
46,0.832772,21.6592,SPARSE_OBLIQUE,5.0,MIN_MAX,BINARY,CART,True,25,10
5,0.832772,1.87296,SPARSE_OBLIQUE,2.0,NONE,BINARY,CART,True,20,10


## Random Forest Model with specific features and Tuning

In [None]:
# tuned_rf = ydf.RandomForestLearner(
#     label='Survived',
#     features=[ydf.Feature('Pclass', ydf.Semantic.CATEGORICAL),
#               ydf.Feature('Sex', ydf.Semantic.CATEGORICAL),
#               ydf.Feature('Embarked', ydf.Semantic.CATEGORICAL),
#               ydf.Feature('AgeGroup', ydf.Semantic.CATEGORICAL),
#               ydf.Feature('FareGroup', ydf.Semantic.CATEGORICAL),
#               # ydf.Feature('Alone', ydf.Semantic.CATEGORICAL),
#               ydf.Feature('Deck', ydf.Semantic.CATEGORICAL),
#               ydf.Feature('Title', ydf.Semantic.CATEGORICAL),
#               ydf.Feature('Relatives', ydf.Semantic.NUMERICAL),
#     ],
#     split_axis='SPARSE_OBLIQUE',
#     sparse_oblique_projection_density_factor=1.0,
#     sparse_oblique_normalization='MIN_MAX',
#     sparse_oblique_weights='CONTINUOUS',
#     categorical_algorithm='CART',
#     winner_take_all=True,
#     max_depth=20,
#     min_examples=10,
# )

### Train on Categorical Dataset

In [None]:
# Random Forest learner configured with the best hyper-parameters found by the
# tuner run on the string-labelled table (unpacked as keyword arguments).
# 'Alone' is currently not part of the feature list.
tuned_rf_for_cat_features = ydf.RandomForestLearner(
    label='Survived',
    features=[
        ydf.Feature(column, ydf.Semantic.CATEGORICAL)
        for column in ('Pclass', 'Sex', 'Embarked', 'AgeGroup',
                       'FareGroup', 'Deck', 'Title')
    ] + [ydf.Feature('Relatives', ydf.Semantic.NUMERICAL)],
    **best_params_simple_rf_custom_tuner_cat,
)

# Fit it on the same table and show the model description.
tuned_rf_cat = tuned_rf_for_cat_features.train(cat_dataset)

tuned_rf_cat.describe()

Train model on 891 examples
Model trained in 0:00:00.508102


### Train on Numeric Dataset

In [None]:
# Random Forest learner configured with the best hyper-parameters found by the
# tuner run on the integer-encoded table (unpacked as keyword arguments).
# 'Alone' is currently not part of the feature list.
tuned_rf_for_num_features = ydf.RandomForestLearner(
    label='Survived',
    features=[
        ydf.Feature(column, ydf.Semantic.CATEGORICAL)
        for column in ('Pclass', 'Sex', 'Embarked', 'AgeGroup',
                       'FareGroup', 'Deck', 'Title')
    ] + [ydf.Feature('Relatives', ydf.Semantic.NUMERICAL)],
    **best_params_simple_rf_custom_tuner_num,
)

# Fit it on the same table and show the model description.
tuned_rf_num = tuned_rf_for_num_features.train(num_dataset)

tuned_rf_num.describe()

Train model on 891 examples
Model trained in 0:00:00.909748


## Simple GBT Model

In [None]:
# Baseline Gradient Boosted Trees learner: predicts the 'Survived' column and,
# since no feature list is given, leaves feature selection to the library
# defaults.
simple_gbt = ydf.GradientBoostedTreesLearner(
    label='Survived',
)

### Train on Categorical Dataset

In [None]:
# Fit the baseline GBT learner on the string-labelled feature table.
simple_gbt_cat = simple_gbt.train(cat_dataset)

# Show the trained model's description as the cell output.
simple_gbt_cat.describe()

Train model on 891 examples
Model trained in 0:00:00.398170


### Train on Numeric Dataset

In [None]:
# Fit the baseline GBT learner on the integer-encoded feature table.
simple_gbt_num = simple_gbt.train(num_dataset)

# Show the trained model's description as the cell output.
simple_gbt_num.describe()

Train model on 891 examples
Model trained in 0:00:00.246430


## GBT Model with specific features

In [None]:
# GBT learner with an explicit feature list: every engineered column except
# 'Relatives' is declared categorical, 'Relatives' is numerical.
# 'Alone' is currently not part of the feature list.
simple_gbt_custom = ydf.GradientBoostedTreesLearner(
    label='Survived',
    features=[
        ydf.Feature(column, ydf.Semantic.CATEGORICAL)
        for column in ('Pclass', 'Sex', 'Embarked', 'AgeGroup',
                       'FareGroup', 'Deck', 'Title')
    ] + [ydf.Feature('Relatives', ydf.Semantic.NUMERICAL)],
)

### Train on Categorical Dataset

In [None]:
# Fit the explicit-feature GBT learner on the string-labelled table.
simple_gbt_custom_cat = simple_gbt_custom.train(cat_dataset)

# Show the trained model's description as the cell output.
simple_gbt_custom_cat.describe()

Train model on 891 examples
Model trained in 0:00:00.347630


### Train on Numeric Dataset

In [None]:
# Fit the explicit-feature GBT learner on the integer-encoded table.
simple_gbt_custom_num = simple_gbt_custom.train(num_dataset)

# Show the trained model's description as the cell output.
simple_gbt_custom_num.describe()

Train model on 891 examples
Model trained in 0:00:00.305957


## GBT Model with specific features and use Tuner

In [None]:
# Random-search tuner: 50 trials over the automatically configured search space.
tuner = ydf.RandomSearchTuner(num_trials=50, automatic_search_space=True)

# Same explicit feature list as the untuned GBT learner, with the tuner
# attached so that training also searches hyper-parameters.
# 'Alone' is currently not part of the feature list.
simple_gbt_custom_tuner = ydf.GradientBoostedTreesLearner(
    label='Survived',
    features=[
        ydf.Feature(column, ydf.Semantic.CATEGORICAL)
        for column in ('Pclass', 'Sex', 'Embarked', 'AgeGroup',
                       'FareGroup', 'Deck', 'Title')
    ] + [ydf.Feature('Relatives', ydf.Semantic.NUMERICAL)],
    tuner=tuner,
)

### Train on Categorical Dataset

In [None]:
# Train with hyper-parameter search on the string-labelled table.
simple_gbt_custom_tuner_cat = simple_gbt_custom_tuner.train(cat_dataset)

# Tabulate the tuner trials, highest score first, indexed by score.
best_trial_simple_gbt_custom_tuner_cat = (
    pd.DataFrame(simple_gbt_custom_tuner_cat.hyperparameter_optimizer_logs().trials)
    .sort_values(by=['score'], ascending=[False])
    .set_index(['score'])
)

# Hyper-parameters of the top-ranked trial.
best_params_simple_gbt_custom_tuner_cat = best_trial_simple_gbt_custom_tuner_cat.iloc[0].params

# Values logged as the strings 'true'/'false' are converted in place to real
# booleans so the dict can be passed to a learner as keyword arguments.
for param_name, param_value in best_params_simple_gbt_custom_tuner_cat.items():
  if param_value == 'true':
    best_params_simple_gbt_custom_tuner_cat[param_name] = True
  elif param_value == 'false':
    best_params_simple_gbt_custom_tuner_cat[param_name] = False

simple_gbt_custom_tuner_cat.describe()

Train model on 891 examples
Model trained in 0:00:31.480392


trial,score,duration,split_axis,sparse_oblique_projection_density_factor,sparse_oblique_normalization,sparse_oblique_weights,categorical_algorithm,growing_strategy,max_num_nodes,sampling_method,subsample,shrinkage,min_examples,use_hessian_gain,num_candidate_attributes_ratio,max_depth
10,-0.549328,6.67134,SPARSE_OBLIQUE,1.0,MIN_MAX,BINARY,CART,BEST_FIRST_GLOBAL,16.0,RANDOM,0.6,0.1,10,True,0.2,
24,-0.559741,16.2187,SPARSE_OBLIQUE,4.0,STANDARD_DEVIATION,BINARY,CART,BEST_FIRST_GLOBAL,512.0,RANDOM,0.9,0.05,5,True,0.5,
33,-0.564545,23.1212,SPARSE_OBLIQUE,5.0,MIN_MAX,BINARY,RANDOM,BEST_FIRST_GLOBAL,64.0,RANDOM,0.9,0.05,20,False,1.0,
27,-0.567786,20.2411,SPARSE_OBLIQUE,4.0,NONE,CONTINUOUS,CART,LOCAL,,RANDOM,0.9,0.02,5,True,0.2,8.0
8,-0.56819,5.80116,SPARSE_OBLIQUE,3.0,STANDARD_DEVIATION,CONTINUOUS,RANDOM,BEST_FIRST_GLOBAL,256.0,RANDOM,0.9,0.05,10,True,0.9,
30,-0.570067,21.3133,SPARSE_OBLIQUE,2.0,NONE,CONTINUOUS,RANDOM,LOCAL,,RANDOM,0.8,0.05,7,False,1.0,8.0
23,-0.574696,15.6882,SPARSE_OBLIQUE,5.0,STANDARD_DEVIATION,CONTINUOUS,RANDOM,BEST_FIRST_GLOBAL,32.0,RANDOM,0.6,0.1,5,True,1.0,
9,-0.576142,6.41434,SPARSE_OBLIQUE,5.0,STANDARD_DEVIATION,BINARY,RANDOM,LOCAL,,RANDOM,0.9,0.05,20,True,0.9,8.0
1,-0.576572,2.28268,SPARSE_OBLIQUE,2.0,NONE,BINARY,RANDOM,BEST_FIRST_GLOBAL,128.0,RANDOM,0.8,0.02,10,True,0.9,
15,-0.583428,9.44793,SPARSE_OBLIQUE,2.0,MIN_MAX,BINARY,RANDOM,BEST_FIRST_GLOBAL,16.0,RANDOM,1.0,0.1,5,True,0.5,


### Train on Numeric Dataset

In [None]:
# Train with hyper-parameter search on the integer-encoded table.
simple_gbt_custom_tuner_num = simple_gbt_custom_tuner.train(num_dataset)

# Tabulate the tuner trials, highest score first, indexed by score.
best_trial_simple_gbt_custom_tuner_num = (
    pd.DataFrame(simple_gbt_custom_tuner_num.hyperparameter_optimizer_logs().trials)
    .sort_values(by=['score'], ascending=[False])
    .set_index(['score'])
)

# Hyper-parameters of the top-ranked trial.
best_params_simple_gbt_custom_tuner_num = best_trial_simple_gbt_custom_tuner_num.iloc[0].params

# Values logged as the strings 'true'/'false' are converted in place to real
# booleans so the dict can be passed to a learner as keyword arguments.
for param_name, param_value in best_params_simple_gbt_custom_tuner_num.items():
  if param_value == 'true':
    best_params_simple_gbt_custom_tuner_num[param_name] = True
  elif param_value == 'false':
    best_params_simple_gbt_custom_tuner_num[param_name] = False

simple_gbt_custom_tuner_num.describe()

Train model on 891 examples
Model trained in 0:00:29.726830


trial,score,duration,split_axis,sparse_oblique_projection_density_factor,sparse_oblique_normalization,sparse_oblique_weights,categorical_algorithm,growing_strategy,max_num_nodes,sampling_method,subsample,shrinkage,min_examples,use_hessian_gain,num_candidate_attributes_ratio,max_depth
41,-0.573409,26.391,SPARSE_OBLIQUE,5.0,STANDARD_DEVIATION,BINARY,RANDOM,BEST_FIRST_GLOBAL,512.0,RANDOM,0.6,0.05,10,False,0.9,
7,-0.575559,3.95206,SPARSE_OBLIQUE,3.0,STANDARD_DEVIATION,CONTINUOUS,CART,LOCAL,,RANDOM,0.8,0.1,7,False,0.9,4.0
25,-0.57575,16.1613,SPARSE_OBLIQUE,5.0,STANDARD_DEVIATION,BINARY,RANDOM,BEST_FIRST_GLOBAL,64.0,RANDOM,0.8,0.02,7,True,1.0,
8,-0.580309,4.50117,SPARSE_OBLIQUE,3.0,STANDARD_DEVIATION,CONTINUOUS,RANDOM,BEST_FIRST_GLOBAL,256.0,RANDOM,0.9,0.05,10,True,0.9,
42,-0.580454,26.6551,SPARSE_OBLIQUE,2.0,MIN_MAX,BINARY,CART,BEST_FIRST_GLOBAL,32.0,RANDOM,0.9,0.1,10,False,0.2,
30,-0.580645,19.6177,SPARSE_OBLIQUE,2.0,NONE,CONTINUOUS,RANDOM,LOCAL,,RANDOM,0.8,0.05,7,False,1.0,8.0
18,-0.582297,12.102,SPARSE_OBLIQUE,2.0,STANDARD_DEVIATION,BINARY,RANDOM,LOCAL,,RANDOM,1.0,0.02,5,True,0.5,8.0
6,-0.583891,3.77905,SPARSE_OBLIQUE,5.0,NONE,CONTINUOUS,RANDOM,BEST_FIRST_GLOBAL,256.0,RANDOM,0.8,0.05,5,False,0.5,
10,-0.585861,5.27555,SPARSE_OBLIQUE,1.0,MIN_MAX,BINARY,CART,BEST_FIRST_GLOBAL,16.0,RANDOM,0.6,0.1,10,True,0.2,
23,-0.586373,14.3368,SPARSE_OBLIQUE,5.0,STANDARD_DEVIATION,CONTINUOUS,RANDOM,BEST_FIRST_GLOBAL,32.0,RANDOM,0.6,0.1,5,True,1.0,


## GBT Model with specific features and Tuning

In [None]:
# tuned_gbt = ydf.GradientBoostedTreesLearner(
#     label='Survived',
#     features=[ydf.Feature('Pclass', ydf.Semantic.CATEGORICAL),
#               ydf.Feature('Sex', ydf.Semantic.CATEGORICAL),
#               ydf.Feature('Embarked', ydf.Semantic.CATEGORICAL),
#               ydf.Feature('AgeGroup', ydf.Semantic.CATEGORICAL),
#               ydf.Feature('FareGroup', ydf.Semantic.CATEGORICAL),
#               # ydf.Feature('Alone', ydf.Semantic.CATEGORICAL),
#               ydf.Feature('Deck', ydf.Semantic.CATEGORICAL),
#               ydf.Feature('Title', ydf.Semantic.CATEGORICAL),
#               ydf.Feature('Relatives', ydf.Semantic.NUMERICAL),
#     ],
#     split_axis='SPARSE_OBLIQUE',
#     sparse_oblique_projection_density_factor=4.0,
#     sparse_oblique_normalization='MIN_MAX',
#     sparse_oblique_weights='CONTINUOUS',
#     categorical_algorithm='CART',
#     growing_strategy='LOCAL',
#     # max_num_nodes=512,
#     sampling_method='RANDOM',
#     subsample=1.0,
#     shrinkage=0.1,
#     min_examples=5,
#     use_hessian_gain=True,
#     num_candidate_attributes_ratio=0.5,
#     max_depth=3
# )

### Train on Categorical Dataset

In [None]:
# GBT learner configured with the best hyper-parameters found by the tuner run
# on the string-labelled table (unpacked as keyword arguments).
# 'Alone' is currently not part of the feature list.
tuned_gbt_for_cat_features = ydf.GradientBoostedTreesLearner(
    label='Survived',
    features=[
        ydf.Feature(column, ydf.Semantic.CATEGORICAL)
        for column in ('Pclass', 'Sex', 'Embarked', 'AgeGroup',
                       'FareGroup', 'Deck', 'Title')
    ] + [ydf.Feature('Relatives', ydf.Semantic.NUMERICAL)],
    **best_params_simple_gbt_custom_tuner_cat,
)

# Fit it on the same table and show the model description.
tuned_gbt_cat = tuned_gbt_for_cat_features.train(cat_dataset)

tuned_gbt_cat.describe()

Train model on 891 examples
Model trained in 0:00:00.237725


### Train on Numeric Dataset

In [None]:
# GBT learner configured with the best hyper-parameters found by the tuner run
# on the integer-encoded table (unpacked as keyword arguments).
# 'Alone' is currently not part of the feature list.
tuned_gbt_for_num_features = ydf.GradientBoostedTreesLearner(
    label='Survived',
    features=[
        ydf.Feature(column, ydf.Semantic.CATEGORICAL)
        for column in ('Pclass', 'Sex', 'Embarked', 'AgeGroup',
                       'FareGroup', 'Deck', 'Title')
    ] + [ydf.Feature('Relatives', ydf.Semantic.NUMERICAL)],
    **best_params_simple_gbt_custom_tuner_num,
)

# Fit it on the same table and show the model description.
tuned_gbt_num = tuned_gbt_for_num_features.train(num_dataset)

tuned_gbt_num.describe()

Train model on 891 examples
Model trained in 0:00:00.638433


# Model Evaluation

## Model Comparison

In [None]:
# Every trained model, keyed by the label shown in the comparison tables.
# Keeping the label next to its model avoids maintaining three separate
# 16-element lists that must stay aligned by hand.
trained_models = {
    'simple_rf_cat': simple_rf_cat,
    'simple_rf_num': simple_rf_num,
    'simple_rf_custom_cat': simple_rf_custom_cat,
    'simple_rf_custom_num': simple_rf_custom_num,
    'simple_rf_custom_tuner_cat': simple_rf_custom_tuner_cat,
    'simple_rf_custom_tuner_num': simple_rf_custom_tuner_num,
    'tuned_rf_cat': tuned_rf_cat,
    'tuned_rf_num': tuned_rf_num,
    'simple_gbt_cat': simple_gbt_cat,
    'simple_gbt_num': simple_gbt_num,
    'simple_gbt_custom_cat': simple_gbt_custom_cat,
    'simple_gbt_custom_num': simple_gbt_custom_num,
    'simple_gbt_custom_tuner_cat': simple_gbt_custom_tuner_cat,
    'simple_gbt_custom_tuner_num': simple_gbt_custom_tuner_num,
    'tuned_gbt_cat': tuned_gbt_cat,
    'tuned_gbt_num': tuned_gbt_num,
}

# Fetch each self-evaluation once and read both metrics from it.
# NOTE(review): self-evaluation is presumably computed differently per model
# family (out-of-bag for Random Forest, a held-out validation split for
# Gradient Boosted Trees), so RF vs. GBT numbers in this table are not
# directly comparable -- confirm against the YDF docs and prefer the
# cross-validation results for cross-family comparison.
self_evaluations = {
    name: model.self_evaluation() for name, model in trained_models.items()
}

model_results = pd.DataFrame({
    'Model': list(self_evaluations),
    'Accuracy': [evaluation.accuracy for evaluation in self_evaluations.values()],
    'Loss': [evaluation.loss for evaluation in self_evaluations.values()],
})

# Ranked views: highest accuracy first, and lowest loss first.
best_acc_model_result = (
    model_results.sort_values(by='Accuracy', ascending=False).set_index('Accuracy')
)

best_loss_model_result = (
    model_results.sort_values(by='Loss', ascending=True).set_index('Loss')
)

In [None]:
# Self-evaluation results ordered by accuracy, highest first.
best_acc_model_result

Unnamed: 0_level_0,Model,Loss
Accuracy,Unnamed: 1_level_1,Unnamed: 2_level_1
0.917808,simple_gbt_custom_tuner_cat,0.549328
0.917808,simple_gbt_custom_tuner_num,0.573409
0.917808,tuned_gbt_cat,0.549328
0.917808,tuned_gbt_num,0.573409
0.890411,simple_gbt_cat,0.602754
0.890411,simple_gbt_custom_cat,0.61368
0.890411,simple_gbt_custom_num,0.61456
0.876712,simple_gbt_num,0.640582
0.838384,simple_rf_custom_tuner_num,1.698868
0.838384,tuned_rf_num,1.698868


In [None]:
# Self-evaluation results ordered by loss, lowest first.
best_loss_model_result

Unnamed: 0_level_0,Model,Accuracy
Loss,Unnamed: 1_level_1,Unnamed: 2_level_1
0.549328,simple_gbt_custom_tuner_cat,0.917808
0.549328,tuned_gbt_cat,0.917808
0.573409,simple_gbt_custom_tuner_num,0.917808
0.573409,tuned_gbt_num,0.917808
0.602754,simple_gbt_cat,0.890411
0.61368,simple_gbt_custom_cat,0.890411
0.61456,simple_gbt_custom_num,0.890411
0.640582,simple_gbt_num,0.876712
1.405666,tuned_rf_cat,0.830527
1.476285,simple_rf_cat,0.828283


## Cross Validation

In [None]:
# 10-fold cross-validation of every learner on both dataset variants.
# Each evaluation_* variable is kept as its own name because later cells
# display individual evaluations directly.
evaluation_simple_rf_cat = simple_rf.cross_validation(cat_dataset, folds=10)
evaluation_simple_rf_num = simple_rf.cross_validation(num_dataset, folds=10)

evaluation_simple_rf_custom_cat = simple_rf_custom.cross_validation(cat_dataset, folds=10)
evaluation_simple_rf_custom_num = simple_rf_custom.cross_validation(num_dataset, folds=10)

evaluation_simple_rf_custom_tuner_cat = simple_rf_custom_tuner.cross_validation(cat_dataset, folds=10)
evaluation_simple_rf_custom_tuner_num = simple_rf_custom_tuner.cross_validation(num_dataset, folds=10)

evaluation_tuned_rf_cat = tuned_rf_for_cat_features.cross_validation(cat_dataset, folds=10)
# Fixed: the "num" learner must be cross-validated on num_dataset; it was
# previously evaluated on cat_dataset, so the 'tuned_rf_num' row did not
# measure what its label says.
evaluation_tuned_rf_num = tuned_rf_for_num_features.cross_validation(num_dataset, folds=10)

evaluation_simple_gbt_cat = simple_gbt.cross_validation(cat_dataset, folds=10)
evaluation_simple_gbt_num = simple_gbt.cross_validation(num_dataset, folds=10)

evaluation_simple_gbt_custom_cat = simple_gbt_custom.cross_validation(cat_dataset, folds=10)
evaluation_simple_gbt_custom_num = simple_gbt_custom.cross_validation(num_dataset, folds=10)

evaluation_simple_gbt_custom_tuner_cat = simple_gbt_custom_tuner.cross_validation(cat_dataset, folds=10)
evaluation_simple_gbt_custom_tuner_num = simple_gbt_custom_tuner.cross_validation(num_dataset, folds=10)

evaluation_tuned_gbt_cat = tuned_gbt_for_cat_features.cross_validation(cat_dataset, folds=10)
evaluation_tuned_gbt_num = tuned_gbt_for_num_features.cross_validation(num_dataset, folds=10)

# Label -> evaluation, in the order the rows appear in the results table.
# A single mapping keeps each label attached to its own evaluation instead of
# relying on three separately maintained, position-aligned lists.
cross_val_evaluations = {
    'simple_rf_cat': evaluation_simple_rf_cat,
    'simple_rf_num': evaluation_simple_rf_num,
    'simple_rf_custom_cat': evaluation_simple_rf_custom_cat,
    'simple_rf_custom_num': evaluation_simple_rf_custom_num,
    'simple_rf_custom_tuner_cat': evaluation_simple_rf_custom_tuner_cat,
    'simple_rf_custom_tuner_num': evaluation_simple_rf_custom_tuner_num,
    'tuned_rf_cat': evaluation_tuned_rf_cat,
    'tuned_rf_num': evaluation_tuned_rf_num,
    'simple_gbt_cat': evaluation_simple_gbt_cat,
    'simple_gbt_num': evaluation_simple_gbt_num,
    'simple_gbt_custom_cat': evaluation_simple_gbt_custom_cat,
    'simple_gbt_custom_num': evaluation_simple_gbt_custom_num,
    'simple_gbt_custom_tuner_cat': evaluation_simple_gbt_custom_tuner_cat,
    'simple_gbt_custom_tuner_num': evaluation_simple_gbt_custom_tuner_num,
    'tuned_gbt_cat': evaluation_tuned_gbt_cat,
    'tuned_gbt_num': evaluation_tuned_gbt_num,
}

cross_val_results = pd.DataFrame({
    'Model': list(cross_val_evaluations),
    'Accuracy': [evaluation.accuracy for evaluation in cross_val_evaluations.values()],
    'Loss': [evaluation.loss for evaluation in cross_val_evaluations.values()],
})

# Ranked views: highest accuracy first, and lowest loss first.
best_acc_cross_val_result = (
    cross_val_results.sort_values(by='Accuracy', ascending=False).set_index('Accuracy')
)

best_loss_cross_val_result = (
    cross_val_results.sort_values(by='Loss', ascending=True).set_index('Loss')
)

In [None]:
# Cross-validation results ordered by accuracy, highest first.
best_acc_cross_val_result

Unnamed: 0_level_0,Model,Loss
Accuracy,Unnamed: 1_level_1,Unnamed: 2_level_1
0.838384,tuned_rf_num,1.330357
0.836139,simple_rf_custom_num,1.276949
0.836139,simple_rf_custom_tuner_num,1.350423
0.835017,simple_rf_cat,1.272424
0.835017,simple_rf_custom_cat,1.265638
0.835017,tuned_rf_cat,1.161004
0.832772,simple_rf_custom_tuner_cat,1.434167
0.830527,simple_rf_num,1.286467
0.830527,tuned_gbt_num,0.425898
0.829405,simple_gbt_custom_tuner_num,0.425491


In [None]:
# Cross-validation results ordered by loss, lowest first.
best_loss_cross_val_result

Unnamed: 0_level_0,Model,Accuracy
Loss,Unnamed: 1_level_1,Unnamed: 2_level_1
0.414344,simple_gbt_num,0.826038
0.414845,simple_gbt_custom_num,0.826038
0.4156,simple_gbt_custom_tuner_cat,0.824916
0.419113,tuned_gbt_cat,0.821549
0.423133,simple_gbt_custom_cat,0.826038
0.423994,simple_gbt_cat,0.823793
0.425491,simple_gbt_custom_tuner_num,0.829405
0.425898,tuned_gbt_num,0.830527
1.161004,tuned_rf_cat,0.835017
1.265638,simple_rf_custom_cat,0.835017


# Prediction

In [None]:
# Score the Kaggle test set with the simple_gbt_custom_tuner_num model.
# NOTE(review): the result is thresholded at 0.5 in the next cell, so it is
# presumably the per-passenger probability of the positive class -- confirm.
prediction = simple_gbt_custom_tuner_num.predict(pred_num_dataset)

# Uncomment the line below to inspect the raw prediction values.
# prediction

In [None]:
# Assemble the submission table: one row per test passenger, with the
# prediction score converted to a 0/1 label using a 0.5 threshold.
kaggle_predictions = pd.DataFrame(
    dict(
        PassengerId=pred_num_dataset.index,
        Survived=(prediction >= 0.5).astype(int),
    )
)

# Write the submission file without the DataFrame's own row index.
kaggle_predictions.to_csv(
    '/content/drive/MyDrive/datasets/titanic/submission.csv',
    index=False,
)

In [None]:
# Display the 10-fold cross-validation evaluation of the
# simple_gbt_custom_tuner learner on num_dataset.
evaluation_simple_gbt_custom_tuner_num

Label \ Pred,0,1
0,488,91
1,61,251
