# Titanic Dataset: Predicting Survivability

## Plan of Attack

## Import Libaries

In [112]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')


In [113]:
train = pd.read_csv('datasets/titanic_train.csv')
X = train.drop('Survived', axis=1)
y = train['Survived']

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [115]:
features_remove = ['Name', 'Ticket', 'Cabin', 'Fare']
X.drop(features_remove, axis=1, inplace=True)

In [116]:
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(missing_values=np.nan, strategy='median')), ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [117]:
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [118]:
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numerical_features),('cat', categorical_transformer, categorical_features)])

In [119]:
SEED=123
lr = LogisticRegression(random_state=SEED,solver='liblinear')
knn = KNN()
dt = DecisionTreeClassifier(random_state=SEED)
gaussian_nb= GaussianNB()
randomforest=RandomForestClassifier(random_state=SEED)
xgb = XGBClassifier(verbosity = 0)
# Define a list called classifier that contains the tuples (classifier_name, classifier)
classifiers = [('Logistic Regression', lr),('K Nearest Neighbours', knn),('Classification Tree', dt),('Gaussian Naive Bayes', gaussian_nb),('Random Forest', randomforest), ('XGBoost', xgb)]

In [120]:
for cls_name, clf in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', clf)])
    pipe.fit(X_train, y_train)
    accuracies = cross_val_score(pipe, X = X_train, y = y_train, cv = 10)
    print(cls_name + " Accuracy: {:.2f} %".format(accuracies.mean()*100))
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Logistic Regression Accuracy: 80.39 %
Standard Deviation: 3.21 %
K Nearest Neighbours Accuracy: 79.79 %
Standard Deviation: 5.98 %
Classification Tree Accuracy: 72.31 %
Standard Deviation: 5.58 %
Gaussian Naive Bayes Accuracy: 78.74 %
Standard Deviation: 3.57 %
Random Forest Accuracy: 79.80 %
Standard Deviation: 5.41 %
XGBoost Accuracy: 79.21 %
Standard Deviation: 5.52 %


In [126]:
params = {
        'xgb__min_child_weight': [1, 5, 10],
        'xgb__gamma': [0.5, 1, 1.5, 2, 5],
        'xgb__subsample': [0.6, 0.8, 1.0],
        'xgb__colsample_bytree': [0.6, 0.8, 1.0],
        'xgb__max_depth': [3, 4, 5]
        }

In [128]:
pipe = Pipeline(steps=[('preprocessor', preprocessor), ('xgb', xgb)])
pipe.fit(X_train, y_train)
grid_search = GridSearchCV(pipe,
                           params,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 83.84 %
Best Parameters: {'xgb__colsample_bytree': 1.0, 'xgb__gamma': 1.5, 'xgb__max_depth': 3, 'xgb__min_child_weight': 5, 'xgb__subsample': 1.0}


In [130]:
test = pd.read_csv('datasets/titanic_test.csv')
xgb_upd = XGBClassifier(colsample_bytree=1, gamma=1.5, max_depth=3, min_child_weight=5, subsample=1)
pipe = Pipeline(steps=[('preprocessor', preprocessor), ('xgb', xgb_upd)])
pipe.fit(X,y)
test.drop(features_remove, axis=1, inplace=True)
y_final_pred = pipe.predict(test)


In [131]:
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': y_final_pred})
output.to_csv('submission.csv', index=False)