# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Sklearn
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

from xgboost import XGBClassifier


# Preprocessing

In [None]:
# Location of the Titanic CSV (the path points under /content/drive --
# presumably a Google Drive mounted in Colab; confirm before running elsewhere).
TITANIC_CSV = '/content/drive/MyDrive/Data Mining/HW2/titanic.csv'

# Read the passenger table and preview its first rows.
df_titanic = pd.read_csv(TITANIC_CSV)
df_titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Impute missing values by assigning the filled column back to the frame.
# Calling `fillna(..., inplace=True)` on a column selection is a chained
# in-place operation: it is deprecated in pandas 2.x and leaves the parent
# frame untouched when copy-on-write is enabled, which would make the
# `dropna` below discard rows that were meant to be imputed.
age_median = df_titanic['Age'].median()
df_titanic['Age'] = df_titanic['Age'].fillna(age_median)

# `mode()` returns a Series (several values can tie); take the first one.
embarked_mode = df_titanic['Embarked'].mode().iloc[0]
df_titanic['Embarked'] = df_titanic['Embarked'].fillna(embarked_mode)

# Remove every row that still has a missing value in any column. In the
# preview of the raw data `Cabin` is empty for several passengers, so those
# rows are dropped here.
df_titanic.dropna(axis=0, inplace=True)

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themself.
df_titanic['FamilySize'] = df_titanic['SibSp'].add(df_titanic['Parch']).add(1)

In [None]:
# Columns removed before modelling: SibSp/Parch are already summed into
# FamilySize; Ticket, Name and PassengerId are dropped as well.
dropped_columns = ['Parch', 'SibSp', 'Ticket', 'Name', 'PassengerId']
df_titanic = df_titanic.drop(columns=dropped_columns)

In [None]:
# One-hot encode the categorical columns; each distinct value becomes its own
# indicator column (e.g. Embarked_C, Sex_female, Cabin_E8).
categorical_columns = ['Embarked', 'Sex', 'Cabin']
df_titanic = pd.get_dummies(df_titanic, columns=categorical_columns)
df_titanic.head()

Unnamed: 0,Survived,Pclass,Age,Fare,FamilySize,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male,...,Cabin_E8,Cabin_F E69,Cabin_F G63,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T
1,1,1,38.0,71.2833,2,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,35.0,53.1,2,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,54.0,51.8625,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
10,1,3,4.0,16.7,3,0,0,1,1,0,...,0,0,0,0,0,0,0,0,1,0
11,1,1,58.0,26.55,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Separate the feature matrix (every column except the label) from the
# target column `Survived`.
X = df_titanic.drop(columns='Survived')
y = df_titanic['Survived']

In [None]:
# Sanity check: features and target must have the same number of rows.
(X.shape, y.shape)

((204, 156), (204,))

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Show the shape of each of the four pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((142, 156), (62, 156), (142,), (62,))

# Model Creation

In [None]:
# Hyper-parameter grid for the XGBoost classifier.
#
# The L1 regularisation strength is searched over a short, roughly log-spaced
# list under its scikit-learn-API name `reg_alpha`. The earlier
# `np.arange(0, 100, 0.1)` sweep contributed 1,000 values on its own, i.e.
# 18 * 7 * 1000 = 126,000 candidates (378,000 fits with cv=3) for a training
# set of 142 rows.
param_grid = {
    'max_depth': range(2, 20),
    'learning_rate': [0.01, 0.05, 0.09, 0.2, 0.3, 0.4, 0.9],
    'reg_alpha': [0, 0.1, 0.5, 1, 5, 10, 50],
}

# Keep each booster single-threaded and let the grid search parallelise over
# candidates instead, so the two levels of parallelism do not compete for cores.
xgb = XGBClassifier(n_jobs=1)
g_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=0,
    return_train_score=True,
)
g_search.fit(X_train, y_train)
print(g_search.best_params_)

# Prediction

In [None]:
# Predict on the held-out rows with the best model found by the search
# (GridSearchCV refits it on the full training set when refit=True, the default).
y_pred = g_search.best_estimator_.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy itself is
# symmetric, but keeping the documented order avoids silently wrong results if
# this line is later adapted to an asymmetric metric (precision, recall, ...).
print(accuracy_score(y_test, y_pred))

0.8387096774193549
