In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Directory holding the raw train/test CSV files, relative to this notebook.
dataset_path = '../DATASET/INITIAL_DATASET'

# Build both file paths first, then read each CSV into its own DataFrame.
train_path, test_path = (
    os.path.join(dataset_path, file_name) for file_name in ('train.csv', 'test.csv')
)
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [3]:
# Label every row with the split it came from so the combined frame can be
# separated back into train and test later on.
train_df = train_df.assign(source='train')
test_df = test_df.assign(source='test')

# Stack both splits into one frame with a fresh 0..n-1 index.
full_df = pd.concat((train_df, test_df), ignore_index=True)

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric training columns.
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# Same summary statistics for the test split, for side-by-side comparison with the training split.
test_df.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [6]:
# Per-column missing-value report for the training set:
# column 0 = number of NaN entries, column 1 = fraction of rows that are NaN.
missing_mask = train_df.isna()
result = pd.concat([missing_mask.sum(), missing_mask.mean()], axis=1)
result

Unnamed: 0,0,1
PassengerId,0,0.0
Survived,0,0.0
Pclass,0,0.0
Name,0,0.0
Sex,0,0.0
Age,177,0.198653
SibSp,0,0.0
Parch,0,0.0
Ticket,0,0.0
Fare,0,0.0


In [7]:
# Scratch copy used to prepare the imputation inputs; full_df itself is left as is here.
df = full_df.copy()

# Encoders shared by the following cells.
le = LabelEncoder()
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe = ohe.set_output(transform='pandas')  # return DataFrames with named columns

In [8]:
# Label-encode Embarked into numeric codes so KNNImputer can work with it,
# while keeping missing ports as NaN for the imputer to fill.
# The encoder is fitted on the known values only. This avoids depending on NaN
# being assigned one particular code (previously a hard-coded 3), which would
# silently corrupt data if the set of distinct ports ever changed.
embarked_known = df['Embarked'].notna()
embarked_codes = pd.Series(np.nan, index=df.index, dtype=float)
embarked_codes.loc[embarked_known] = le.fit_transform(df.loc[embarked_known, 'Embarked'])
df['Embarked'] = embarked_codes

# One-hot encode Sex and swap the encoded columns in for the raw one.
# `ohetransform` keeps its name because a later cell attaches it to full_df too.
ohetransform = ohe.fit_transform(df[['Sex']])
df = pd.concat([df, ohetransform], axis=1).drop(columns=['Sex'])

In [9]:
# Numeric columns handed to the imputer (Embarked is label-encoded, Sex is one-hot).
features = ['Pclass', 'Age', 'Fare', 'SibSp', 'Parch', 'Embarked', 'Sex_male', 'Sex_female']

# Missing entries are estimated from the 5 nearest rows.
# NOTE(review): the columns are not scaled, so Fare probably dominates the
# neighbour distances; confirm whether scaling before imputation is wanted.
imputer = KNNImputer(n_neighbors=5)

# fit_transform returns a bare array, so re-attach df's index explicitly: the
# later `fillna(df_imputed[...])` calls align on index labels, and this keeps
# them matched to the right rows even if the frame's index is not 0..n-1.
df_imputed = pd.DataFrame(
    imputer.fit_transform(df[features]),
    columns=features,
    index=df.index,
)

In [10]:
# Imputed codes may be fractional: snap them to the nearest integer code,
# then map the codes back to the original port labels.
rounded_codes = df_imputed['Embarked'].round().astype(int)
df_imputed['Embarked'] = le.inverse_transform(rounded_codes)

In [11]:
# Rows that already have a port keep it; only missing entries take the imputed
# label (the two Series are matched on index labels).
embarked_present = full_df['Embarked'].notna()
full_df['Embarked'] = full_df['Embarked'].where(embarked_present, df_imputed['Embarked'])


In [12]:
# Retire the raw Sex column and append the one-hot Sex columns that were
# computed earlier (still held in `ohetransform` at this point).
full_df = pd.concat([full_df.drop(columns=['Sex']), ohetransform], axis=1)

In [13]:
# Re-fit the shared encoder on Embarked (this overwrites the Sex encoding held
# in `ohetransform`), then replace the raw column with its indicator columns.
embarked_column = full_df[['Embarked']]
ohetransform = ohe.fit_transform(embarked_column)
full_df = pd.concat([full_df.drop(columns=['Embarked']), ohetransform], axis=1)

In [14]:
# Sanity check: list column dtypes after encoding to see which columns are still non-numeric.
full_df.dtypes

PassengerId      int64
Survived       float64
Pclass           int64
Name            object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
source          object
Sex_female     float64
Sex_male       float64
Embarked_C     float64
Embarked_Q     float64
Embarked_S     float64
dtype: object

In [15]:
# Keep recorded fares; only missing ones take the KNN-imputed value
# (matched on index labels).
fare_present = full_df['Fare'].notna()
full_df['Fare'] = full_df['Fare'].where(fare_present, df_imputed['Fare'])

In [16]:
# Record which ages were absent (1 = missing, 0 = present). This has to run
# before Age is filled in, otherwise the information is lost.
age_is_missing = full_df['Age'].isna()
full_df['Age_missing'] = age_is_missing.astype(int)

In [17]:
# Keep recorded ages; only missing ones take the KNN-imputed value
# (matched on index labels).
age_present = full_df['Age'].notna()
full_df['Age'] = full_df['Age'].where(age_present, df_imputed['Age'])

In [18]:
# Columns removed from both model frames: the split tag plus the text columns.
unused_columns = ['source', 'Name', 'Ticket', 'Cabin']

# Use the tag column to cut the combined frame back into its two splits.
is_train = full_df['source'] == 'train'
is_test = full_df['source'] == 'test'
train_df = full_df[is_train].copy().drop(columns=unused_columns)
test_df = full_df[is_test].copy().drop(columns=unused_columns)


In [19]:
# Model inputs. PassengerId is deliberately excluded: it is a row identifier,
# and the test ids (892-1309) lie entirely outside the training range (1-891),
# so anything a tree learns from it cannot carry over to the test split.
features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Age_missing']
X_train = train_df[features]
# Survived became float64 when the unlabeled test rows were concatenated in;
# the training rows have no missing labels, so cast back to integer classes.
y_train = train_df['Survived'].astype(int)

X_test = test_df[features]

In [188]:
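# Disabled hold-out split, kept for reference only.
# NOTE: `X` and `y` are not defined in the cells above; substitute the feature
# matrix and labels (e.g. X_train / y_train) before re-enabling this.
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)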


In [28]:
from xgboost import XGBClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Integer


# Hyper-parameter ranges explored by the Bayesian search.
search_space = {
    'max_depth': Integer(2, 8),
    'learning_rate': Real(0.001, 0.3, prior='log-uniform'),  # sampled on a log scale
    'subsample': Real(0.5, 1.0),
    'colsample_bytree': Real(0.5, 1.0),
    'colsample_bylevel': Real(0.5, 1.0),
    'colsample_bynode': Real(0.5, 1.0),
    'reg_alpha': Real(0, 10),
    'reg_lambda': Real(0, 10),
    'gamma': Real(0, 10),
}

# Base estimator; only the parameters in search_space are varied by the search.
xgb = XGBClassifier(n_estimators=100, objective='binary:logistic', random_state=42)

# Bayesian optimization over search_space with 9-fold cross-validation.
# random_state pins the optimizer's sampling so that re-running the notebook
# explores the same candidates and reports the same best parameters.
bayes_search = BayesSearchCV(
    estimator=xgb,
    search_spaces=search_space,
    n_iter=25,
    cv=9,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
bayes_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score.
print(f"Best parameters: {bayes_search.best_params_}")
print(f"Best score: {bayes_search.best_score_}")

Fitting 9 folds for each of 1 candidates, totalling 9 fits
[CV] END colsample_bylevel=0.6939185835583411, colsample_bynode=0.9439631697347468, colsample_bytree=0.7750296444211875, gamma=6.764934529529937, learning_rate=0.014680977684047865, max_depth=3, reg_alpha=5.236009673385807, reg_lambda=4.758675912472655, subsample=0.6306681857221109; total time=   0.1s
[CV] END colsample_bylevel=0.6939185835583411, colsample_bynode=0.9439631697347468, colsample_bytree=0.7750296444211875, gamma=6.764934529529937, learning_rate=0.014680977684047865, max_depth=3, reg_alpha=5.236009673385807, reg_lambda=4.758675912472655, subsample=0.6306681857221109; total time=   0.1s
[CV] END colsample_bylevel=0.6939185835583411, colsample_bynode=0.9439631697347468, colsample_bytree=0.7750296444211875, gamma=6.764934529529937, learning_rate=0.014680977684047865, max_depth=3, reg_alpha=5.236009673385807, reg_lambda=4.758675912472655, subsample=0.6306681857221109; total time=   0.1s
[CV] END colsample_bylevel=0.693

In [29]:
# Predict on the test set using the best found model
test_predictions = bayes_search.predict(X_test)

# Prepare the submission file
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': test_predictions
})

submission.to_csv('titanic_xgb_bayes_submission_longer.csv', index=False)