In [1]:
! pip install optuna



In [2]:
import numpy as np
import pandas as pd
import os

import math
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from catboost import Pool, CatBoostClassifier, cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import optuna



In [3]:
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

datasets=[train, test]

In [4]:
for dataset in datasets:
    dataset['Prefix'] = dataset['Name'].str.extract(r'(\b\w+\.)(?=\s)')
    dataset['Ticket_type']=dataset['Ticket'].apply(lambda x: x.split()[0] if len(x.split())>1 else 'No_type')
    dataset['Cabin_type']=dataset['Cabin'].apply(lambda x: x[0] if pd.notnull(x) else 'No_type')
    dataset['Embarked'].fillna('No_type', inplace=True)
    dataset['Fare'].fillna(round(dataset['Fare'].mean(), 2), inplace=True)
    mean_age_by_prefix = dataset.groupby('Prefix')["Age"].mean()
    for prefix, mean_age in mean_age_by_prefix.items():
       dataset.loc[(dataset['Age'].isnull()) & (dataset['Prefix'] == prefix), 'Age'] = mean_age
    dataset['Age']=dataset['Age'].apply(lambda x: round(x) if pd.notnull(x) else dataset[dataset['Prefix']=='Miss.']['Age'].mean())
    dataset.drop(['Name','Ticket', 'Cabin'], axis=1, inplace=True)


In [5]:
nulls=train.isnull().sum(axis=0)
print('Train:', nulls[nulls>0])
nulls=test.isnull().sum(axis=0)
print('Test:', nulls[nulls>0])

Train: Series([], dtype: int64)
Test: Series([], dtype: int64)


In [6]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Prefix,Ticket_type,Cabin_type
0,1,0,3,male,22,1,0,7.2500,S,Mr.,A/5,No_type
1,2,1,1,female,38,1,0,71.2833,C,Mrs.,PC,C
2,3,1,3,female,26,0,0,7.9250,S,Miss.,STON/O2.,No_type
3,4,1,1,female,35,1,0,53.1000,S,Mrs.,No_type,C
4,5,0,3,male,35,0,0,8.0500,S,Mr.,No_type,No_type
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27,0,0,13.0000,S,Rev.,No_type,No_type
887,888,1,1,female,19,0,0,30.0000,S,Miss.,No_type,B
888,889,0,3,female,22,1,2,23.4500,S,Miss.,W./C.,No_type
889,890,1,1,male,26,0,0,30.0000,C,Mr.,No_type,C


In [7]:
categorical_features_indices=['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Prefix', 'Ticket_type', 'Cabin_type']

In [8]:
def objective(trial):
    X=train.drop('Survived', axis=1)
    y=train['Survived']
    X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.75, random_state=42)
    X_test=test

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    cat_cls = CatBoostClassifier(**param)

    cat_cls.fit(X_train, y_train, eval_set=[(X_validation, y_validation)], cat_features=categorical_features_indices,verbose=0, early_stopping_rounds=100)

    preds = cat_cls.predict(X_validation)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(y_validation, pred_labels)
    return accuracy


if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50, timeout=360)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[I 2023-08-17 16:08:43,768] A new study created in memory with name: no-name-676ba7f5-4dad-40a1-83de-274d5f097213
[I 2023-08-17 16:08:45,083] Trial 0 finished with value: 0.8295964125560538 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.038774359458212296, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.403844181935032}. Best is trial 0 with value: 0.8295964125560538.
[I 2023-08-17 16:08:46,312] Trial 1 finished with value: 0.8295964125560538 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.04860798973293406, 'depth': 6, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.8295964125560538.
[I 2023-08-17 16:08:47,339] Trial 2 finished with value: 0.7982062780269058 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.011006948965703114, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.8538262296869155}. Best is trial

Number of finished trials: 50
Best trial:
  Value: 0.8385650224215246
  Params: 
    objective: CrossEntropy
    colsample_bylevel: 0.05808333086019984
    depth: 9
    boosting_type: Plain
    bootstrap_type: Bayesian
    bagging_temperature: 9.483544173894378


In [9]:
accuracy =[]
model_names =[]


X=train.drop('Survived', axis=1)
y=train['Survived']
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.75, random_state=42)
X_test=test

model = CatBoostClassifier(verbose=False,random_state=42,
    objective= 'CrossEntropy',
    colsample_bylevel= 0.06695759969519763,
    depth= 12,
    boosting_type= 'Ordered',
    bootstrap_type= 'MVS')

model.fit(X_train, y_train,cat_features=categorical_features_indices,eval_set=(X_validation, y_validation))
y_pred = model.predict(X_validation)
accuracy.append(round(accuracy_score(y_validation, y_pred),4))
print(classification_report(y_validation, y_pred))

model_names = ['Catboost_tuned']
result_df6 = pd.DataFrame({'Accuracy':accuracy}, index=model_names)
result_df6

              precision    recall  f1-score   support

           0       0.83      0.90      0.86       134
           1       0.83      0.72      0.77        89

    accuracy                           0.83       223
   macro avg       0.83      0.81      0.82       223
weighted avg       0.83      0.83      0.83       223



Unnamed: 0,Accuracy
Catboost_tuned,0.8296


In [10]:
submission = pd.DataFrame()
submission['PassengerId'] = X_test['PassengerId']
submission['Survived'] = model.predict(X_test)
submission.to_csv('submission.csv', index=False)

In [11]:
! head submission.csv

PassengerId,Survived
892,0
893,0
894,0
895,0
896,0
897,0
898,0
899,0
900,1
