# Import Libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

# Import Data

In [2]:
# Location of the raw Titanic CSV.
# NOTE(review): this looks like a Colab Google Drive mount path; the drive
# presumably has to be mounted before this cell runs — confirm.
TITANIC_CSV = '/content/drive/MyDrive/Titanic.csv'

# Read the raw passenger table and show it.
titanic = pd.read_csv(TITANIC_CSV)
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Preprocessing

## Columns to remove:
 - PassengerId
 - Name
 - Ticket
 - Cabin
 - SibSp
 - Parch

In [3]:
# List the column labels of the raw frame before choosing which ones to drop.
titanic.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
# Columns listed under "Columns to remove" are dropped from the frame in place.
columns_to_remove = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']
titanic.drop(columns=columns_to_remove, inplace=True)

In [5]:
# Inspect the frame after the unused columns have been dropped.
titanic

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked
0,0,3,male,22.0,7.2500,S
1,1,1,female,38.0,71.2833,C
2,1,3,female,26.0,7.9250,S
3,1,1,female,35.0,53.1000,S
4,0,3,male,35.0,8.0500,S
...,...,...,...,...,...,...
886,0,2,male,27.0,13.0000,S
887,1,1,female,19.0,30.0000,S
888,0,3,female,,23.4500,S
889,1,1,male,26.0,30.0000,C


In [6]:
# Count the missing values in each remaining column.
titanic.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
Fare          0
Embarked      2
dtype: int64

Remove the Age column because it has the most missing values (177 of 891 rows).

In [7]:
# Age is discarded entirely (in place) rather than imputed.
titanic.drop(columns='Age', inplace=True)

In [8]:
# Inspect the frame after the Age column has been removed.
titanic

Unnamed: 0,Survived,Pclass,Sex,Fare,Embarked
0,0,3,male,7.2500,S
1,1,1,female,71.2833,C
2,1,3,female,7.9250,S
3,1,1,female,53.1000,S
4,0,3,male,8.0500,S
...,...,...,...,...,...
886,0,2,male,13.0000,S
887,1,1,female,30.0000,S
888,0,3,female,23.4500,S
889,1,1,male,30.0000,C


## Fill NA Values

In [9]:
# Fill the remaining missing values by carrying the previous row's value forward.
# DataFrame.ffill is used because fillna(method='ffill') is deprecated in
# recent pandas releases.
titanic.ffill(inplace=True)

# Extract X and Y

In [10]:
# 'Survived' is the prediction target; every other column is a feature.
target_column = 'Survived'
X = titanic.drop(columns=target_column)
Y = titanic[target_column]

In [11]:
# Target vector: the Survived column.
Y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [12]:
# Feature matrix: every column except Survived.
X

Unnamed: 0,Pclass,Sex,Fare,Embarked
0,3,male,7.2500,S
1,1,female,71.2833,C
2,3,female,7.9250,S
3,1,female,53.1000,S
4,3,male,8.0500,S
...,...,...,...,...
886,2,male,13.0000,S
887,1,female,30.0000,S
888,3,female,23.4500,S
889,1,male,30.0000,C


# Encode Categorical Values

In [13]:
# Object-dtype columns are treated as categorical; everything else as numerical.
is_categorical = X.dtypes == 'object'
categorical_columns = X.columns[is_categorical]   # categorical columns
numerical_columns = X.columns[~is_categorical]    # numerical columns
categorical_columns, numerical_columns

(Index(['Sex', 'Embarked'], dtype='object'),
 Index(['Pclass', 'Fare'], dtype='object'))

In [14]:
# One-hot encode the categorical columns.
# Dense output is requested because the encoded matrix is later assigned
# directly into DataFrame columns.
# `sparse_output` replaces the old `sparse` keyword, which was renamed in
# scikit-learn 1.2 and removed in 1.4.
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(X[categorical_columns])

# Names of the generated indicator columns (e.g. Sex_female, Embarked_C).
new_feature_names = encoder.get_feature_names_out()

In [15]:
# Append one indicator column per category to the feature matrix.
encoded_values = encoder.transform(X[categorical_columns])
X[new_feature_names] = encoded_values

In [16]:
# The original string-valued columns are dropped from X in place,
# leaving only numeric and indicator columns.
X.drop(columns=categorical_columns, inplace=True)
X

Unnamed: 0,Pclass,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3,7.2500,0.0,1.0,0.0,0.0,1.0
1,1,71.2833,1.0,0.0,1.0,0.0,0.0
2,3,7.9250,1.0,0.0,0.0,0.0,1.0
3,1,53.1000,1.0,0.0,0.0,0.0,1.0
4,3,8.0500,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
886,2,13.0000,0.0,1.0,0.0,0.0,1.0
887,1,30.0000,1.0,0.0,0.0,0.0,1.0
888,3,23.4500,1.0,0.0,0.0,0.0,1.0
889,1,30.0000,0.0,1.0,1.0,0.0,0.0


# Scale Numerical Columns

In [17]:
# Columns that will be passed to the scaler.
numerical_columns

Index(['Pclass', 'Fare'], dtype='object')

In [18]:
# Rescale the numerical columns with MinMaxScaler.
# NOTE(review): the scaler is fit on all rows before the train/test split,
# so test rows influence the fitted min/max; consider fitting on the
# training split only.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X[numerical_columns])
X[numerical_columns] = scaled_values
X

Unnamed: 0,Pclass,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1.0,0.014151,0.0,1.0,0.0,0.0,1.0
1,0.0,0.139136,1.0,0.0,1.0,0.0,0.0
2,1.0,0.015469,1.0,0.0,0.0,0.0,1.0
3,0.0,0.103644,1.0,0.0,0.0,0.0,1.0
4,1.0,0.015713,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
886,0.5,0.025374,0.0,1.0,0.0,0.0,1.0
887,0.0,0.058556,1.0,0.0,0.0,0.0,1.0
888,1.0,0.045771,1.0,0.0,0.0,0.0,1.0
889,0.0,0.058556,0.0,1.0,1.0,0.0,0.0


# Train Test Split

In [19]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Shapes of the training and test features/targets produced by the split.
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((712, 7), (712,), (179, 7), (179,))

# Gradient Boost Classifier

In [21]:
# Gradient boosting classifier.
# Seeded because the grid uses subsample < 1, which makes each fit stochastic;
# without a seed the search results are not reproducible between runs.
model = GradientBoostingClassifier(random_state=42)

# Hyperparameter grid: 2 * 2 * 2 * 3 = 24 candidate combinations.
hyper_params = {
    'n_estimators': [10, 50],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.5, 0.7],
    'max_depth': [3, 7, 9],
}

# 10-fold stratified cross-validation repeated 3 times (30 fits per candidate).
kfold = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

# Exhaustive grid search scored on accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=hyper_params,
    n_jobs=-1,
    cv=kfold,
    scoring='accuracy',
    verbose=3,
)

In [22]:
# Run the grid search on the training split only; `result` is queried
# afterwards for best_score_ and best_params_.
result = grid_search.fit(X_train, Y_train)

Fitting 30 folds for each of 24 candidates, totalling 720 fits


In [23]:
# Report the best mean cross-validated accuracy and the parameters that achieved it.
best_summary = (result.best_score_, result.best_params_)
print("Best Accuracy: %f using %s" % best_summary)

Best Accuracy: 0.817847 using {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.7}


In [24]:
# Score the fitted grid search on the held-out test split.
test_accuracy = grid_search.score(X_test, Y_test)
print('Test Accuracy :', test_accuracy)

Test Accuracy : 0.8212290502793296


# AdaBoost Classifier

In [25]:
# AdaBoost classifier, seeded so repeated runs of the search are reproducible.
model = AdaBoostClassifier(random_state=42)

# Hyperparameter grid: 4 * 5 = 20 candidate combinations.
hyper_params = {
    'n_estimators': [10, 50, 100, 500],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
}

# 10-fold stratified cross-validation repeated 3 times (30 fits per candidate).
kfold = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

# Exhaustive grid search scored on accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=hyper_params,
    n_jobs=-1,
    cv=kfold,
    scoring='accuracy',
)

In [26]:
# Run the AdaBoost grid search on the training split only; `result` is
# queried afterwards for best_score_ and best_params_.
result = grid_search.fit(X_train, Y_train)

In [27]:
# Report the best mean cross-validated accuracy and the parameters that achieved it.
best_summary = (result.best_score_, result.best_params_)
print("Best Accuracy: %f using %s" % best_summary)

Best Accuracy: 0.808470 using {'learning_rate': 1.0, 'n_estimators': 500}


In [28]:
# Score the fitted grid search on the held-out test split.
test_accuracy = grid_search.score(X_test, Y_test)
print('Test Accuracy :', test_accuracy)

Test Accuracy : 0.7988826815642458
