# Titanic Survival Predictions (Kaggle)

## Importing Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Importing Dataset

In [None]:
# Read both Kaggle splits from the working directory.
training_set, test_set = pd.read_csv("train.csv"), pd.read_csv("test.csv")

# Tag each frame with a human-readable label.
training_set.name, test_set.name = 'Training Set', 'Test Set'

# Preview three random training rows.
training_set.sample(n=3)

In [None]:
!pip install fasteda

## Dropping Irrelevant Columns

In [None]:
# Drop the same identifier-like columns from BOTH frames so train and test keep an
# identical feature layout for the shared preprocessing steps that follow.
# (PassengerId is read again from test.csv when the submission file is written.)
irrelevant_columns = ["PassengerId", "Ticket"]
training_set = training_set.drop(columns=irrelevant_columns)
test_set = test_set.drop(columns=irrelevant_columns)

## Checking for missing values

In [None]:
def check_missing(df):
    """Print, for every column of ``df``, how many of its entries are null.

    One line is written to stdout per column, in column order.
    """
    null_counts = df.isnull().sum()
    for column_name, n_missing in null_counts.items():
        print('{} column missing values: {}'.format(column_name, n_missing))

# Report the per-column null counts of each split, leaving a gap between the two reports.
for df in (training_set, test_set):
    check_missing(df)
    print('\n')


In [None]:
# Display the training rows whose Embarked value is missing.
training_set.loc[training_set['Embarked'].isna()]

## Dealing with Missing Values and Encoding

### Dealing with missing Embarked values

In [None]:
# Fill the missing Embarked entries with 'C'.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection: under pandas Copy-on-Write that
# selection is a temporary object, so the in-place call would leave
# training_set itself unchanged.
training_set['Embarked'] = training_set['Embarked'].fillna('C')
training_set.sample(3)

### Encoding Embarked Values

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode Embarked; every other column is passed through unchanged.
ct = ColumnTransformer([('Embarked_Transformer', OneHotEncoder(), ['Embarked'])], remainder='passthrough')

# Labels for the transformed array: the three one-hot indicator columns come first,
# followed by the passthrough columns.
encoded_columns = [0, 1, 2, 'Survived', 'PClass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin']

# The transformer stacks string and numeric columns into one object-dtype array, so a
# plain DataFrame wrapper would leave every column (including the Survived target) as
# dtype=object. infer_objects() restores proper numeric dtypes where possible.
training_set = pd.DataFrame(ct.fit_transform(training_set), columns=encoded_columns).infer_objects()

# The fitted transformer expects a Survived column, which the test data lacks;
# add a placeholder for the transform and remove it again afterwards.
test_set['Survived'] = 0
test_set = pd.DataFrame(ct.transform(test_set), columns=encoded_columns).infer_objects()
test_set = test_set.drop('Survived', axis='columns')


### Dealing with Cabin Missing Values and Keeping Deck Letter Only

In [None]:
# Missing cabins become the placeholder 'Z'; only the first character of each cabin
# string is kept.
# The result is assigned back to the column instead of using fillna(inplace=True) on
# a column selection, which does not modify the parent frame under pandas
# Copy-on-Write.
training_set['Cabin'] = training_set['Cabin'].fillna('Z').astype(str).str[0]
test_set['Cabin'] = test_set['Cabin'].fillna('Z').astype(str).str[0]
training_set.sample(10)

### Encoding Cabin Column

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the Cabin letter -> integer mapping from the training data only,
# then apply that one mapping to both splits.
le_cabin = LabelEncoder().fit(training_set['Cabin'])
training_set['Cabin'] = le_cabin.transform(training_set['Cabin'])
test_set['Cabin'] = le_cabin.transform(test_set['Cabin'])

### Keeping Titles Only in Name Column

In [None]:
# Captures the word that immediately precedes a period in the Name string.
# A raw string is used so that `\.` is not treated as an (invalid) string escape.
TITLE_PATTERN = r'([A-Za-z]+)\.'

# Rare titles are folded into one of the common groups.
TITLE_GROUPS = {
    'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
    'Dr': 'Mr', 'Major': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
    'Lady': 'Mrs', 'Countess': 'Mrs',
    'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
}

# expand=False returns a Series; the replaced result is assigned back to the column
# (no chained inplace=True, which is a no-op under pandas Copy-on-Write).
training_set['Initial'] = (
    training_set['Name'].str.extract(TITLE_PATTERN, expand=False).replace(TITLE_GROUPS)
)

# Test-set titles must come from the TEST names (previously they were extracted from
# training_set.Name and merely index-aligned onto the test rows).
test_titles = test_set['Name'].str.extract(TITLE_PATTERN, expand=False).replace(TITLE_GROUPS)

# Any title that never occurs in the training data (or a name without a title) is
# mapped to 'Other' so that an encoder fitted on the training titles can transform it.
known_titles = set(training_set['Initial'].dropna())
test_set['Initial'] = test_titles.where(test_titles.isin(known_titles), 'Other')

training_set = training_set.drop('Name', axis='columns')
test_set = test_set.drop('Name', axis='columns')

training_set.sample(10)

### Label Encoding Initial and Sex Columns

In [None]:
# Integer-encode the title group; the encoder is fitted on the training column only
# and then reused for the test column.
le_initial = LabelEncoder().fit(training_set['Initial'])
training_set['Initial'] = le_initial.transform(training_set['Initial'])
test_set['Initial'] = le_initial.transform(test_set['Initial'])

# Same procedure for Sex.
le_sex = LabelEncoder().fit(training_set['Sex'])
training_set['Sex'] = le_sex.transform(training_set['Sex'])
test_set['Sex'] = le_sex.transform(test_set['Sex'])

In [None]:
# Preview two random rows of the encoded test set.
test_set.sample(n=2)

### Dealing with Fare missing values

In [None]:
from sklearn.impute import KNNImputer

# Fill missing Fare values in the test set. The imputer is fitted on the test set's
# Fare column alone (no other feature is supplied to it).
imputer_test = KNNImputer(
    missing_values=np.nan,
    n_neighbors=2,
    weights='uniform',
)
fare_imputed = imputer_test.fit_transform(test_set[['Fare']])
test_set[['Fare']] = fare_imputed

# Preview three random training rows.
training_set.sample(n=3)

### Dealing with Age missing values

In [None]:
# Average Age within each encoded title group.
training_set['Age'].groupby(training_set['Initial']).mean()

### Setting missing Age values to the mean age of each title group

In [None]:
# Replacement age for each label-encoded `Initial` code.
age_by_initial = {0: 5, 1: 22, 2: 33, 3: 36, 4: 46}

# Apply the same fill values to the training frame first, then to the test frame.
for frame in (training_set, test_set):
    for initial_code, fill_age in age_by_initial.items():
        age_missing_in_group = frame['Age'].isnull() & (frame['Initial'] == initial_code)
        frame.loc[age_missing_in_group, 'Age'] = fill_age

In [None]:
# Preview three random training rows after the Age fill.
training_set.sample(n=3)

## Feature Scaling & Train-Test Split



In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation/test partitions are identical on every run.
SPLIT_SEED = 42

# Features: everything except the target and the Cabin column.
X = training_set.drop(['Survived', 'Cabin'], axis='columns')
y = training_set['Survived']
test_set = test_set.drop('Cabin', axis='columns')

# The one-hot columns are labelled with integers; make all column labels strings.
X.columns = X.columns.astype(str)
test_set.columns = test_set.columns.astype(str)

# Hold out 15% for testing, then 10% of the remainder for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=SPLIT_SEED)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=SPLIT_SEED)

# Fit the scaler on the final training subset only, so that neither the validation
# nor the test rows influence the scaling statistics; every other split is only
# transformed.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_val = sc.transform(X_val)
X_test = sc.transform(X_test)
test_set = sc.transform(test_set)

## Building ANN and Confusion Matrix

In [None]:
!pip install keras-tuner

In [None]:
import tensorflow as tf
import keras_tuner
from sklearn.metrics import confusion_matrix, accuracy_score

def build_model(hp):
  """Build and compile the tunable feed-forward binary classifier.

  Args:
    hp: hyperparameter container supplied by the tuner; used to sample the layer
      widths, dropout rates and the Adam learning rate.

  Returns:
    A compiled `tf.keras` Sequential model with a single sigmoid output unit,
    trained with binary cross-entropy and reporting accuracy.
  """
  ann = tf.keras.models.Sequential()

  # Take the input width from the training matrix instead of hard-coding it, so the
  # network always matches the number of feature columns produced by preprocessing.
  n_features = X_train.shape[1]

  ann.add(tf.keras.layers.Dense(hp.Int('units_1', min_value=16, max_value=256, step=4), input_shape=(n_features,), activation='relu', kernel_initializer='lecun_uniform'))
  ann.add(tf.keras.layers.Dropout(hp.Float('dropout_rate_1', min_value=0.0, max_value=0.9, step=0.1)))
  ann.add(tf.keras.layers.Dense(hp.Int('units_2', min_value=16, max_value=256, step=4), activation='relu', kernel_initializer='lecun_uniform'))
  ann.add(tf.keras.layers.Dropout(hp.Float('dropout_rate_2', min_value=0.0, max_value=0.9, step=0.1)))
  ann.add(tf.keras.layers.Dense(hp.Int('units_3', min_value=16, max_value=256, step=4), activation='relu', kernel_initializer='lecun_uniform'))
  ann.add(tf.keras.layers.Dropout(hp.Float('dropout_rate_3', min_value=0.0, max_value=0.9, step=0.1)))
  # Lower bound is 16 like the other hidden layers: a zero-unit Dense layer is either
  # rejected outright or passes no information forward. No activation is given here,
  # so this layer stays linear as before.
  ann.add(tf.keras.layers.Dense(hp.Int('units_4', min_value=16, max_value=256, step=4)))
  ann.add(tf.keras.layers.Dropout(hp.Float('dropout_rate_4', min_value=0.0, max_value=0.9, step=0.1)))

  # Single sigmoid unit -> probability of the positive class.
  ann.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
  ann.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp.Float('learning_rate', min_value=0.001, max_value=0.05, step=0.001)), loss='binary_crossentropy', metrics=['accuracy'])
  return ann

# Cast every split to a float32 NumPy array before handing it to the tuner.
X_train, y_train = np.asarray(X_train).astype(np.float32), np.asarray(y_train).astype(np.float32)
X_test, y_test = np.asarray(X_test).astype(np.float32), np.asarray(y_test).astype(np.float32)
X_val, y_val = np.asarray(X_val).astype(np.float32), np.asarray(y_val).astype(np.float32)

# Bayesian search over the space defined in build_model, scored on both the
# training and the validation accuracy.
tuner = keras_tuner.BayesianOptimization(
    build_model,
    objective=['accuracy', 'val_accuracy'],
    max_trials=250,
)
tuner.search(X_train, y_train, validation_data=(X_val, y_val))

# Keep the five best models found by the search and print the trial summary.
best_models = tuner.get_best_models(num_models=5)
tuner.results_summary()


## Running Tuned ANN

In [None]:
# Continue training each of the five retained models on the training split.
for j in range(5):
  tuned_model = best_models[j]
  tuned_model.fit(X_train, y_train, epochs=30, batch_size=32)


## Confusion Matrix & Accuracy Score for ANN

In [None]:
for j in range(0, 5):
  # The network ends in a single sigmoid unit, so predict() yields one probability
  # per row in a 2-D array. Threshold at 0.5 and flatten with vectorised operations;
  # calling int() on each one-element row is deprecated in recent NumPy releases.
  y_init_pred = (best_models[j].predict(X_test) > 0.5).astype(int).ravel()
  cfm = confusion_matrix(y_test, y_init_pred)
  acc = accuracy_score(y_test, y_init_pred)
  print(cfm)
  print(f"Accuracy score for model {j}", acc)

## Applying 10-Fold CV for ANN

In [None]:
!pip install tensorflow scikeras scikit-learn

In [None]:
from sklearn.model_selection import cross_val_score
from scikeras.wrappers import KerasClassifier

# Wrap the first of the tuner's best models so scikit-learn can cross-validate it.
keras_clf = KerasClassifier(best_models[0])

# Cross-validate on the scaled float32 training arrays: that is the representation
# the network was tuned on, and the same data every other model in this notebook is
# cross-validated with. The raw X / y frames are unscaled and are not used here.
cvs = cross_val_score(keras_clf, X=X_train, y=y_train, scoring='accuracy', cv=10, n_jobs=-1, error_score='raise')
print("Accuracy: {:.2f}%".format(cvs.mean()*100))
print("Accuracy Standard Deviation: {:.2f}%".format(cvs.std()*100))

## Applying Gaussian NB Model

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, fitted on the scaled training split.
gnb = GaussianNB()
gnb.fit(X=X_train, y=y_train)

## Applying 10-fold CV for GNB

In [None]:
# 10-fold cross-validated accuracy of the Naive Bayes model on the training split.
cvs_gnb = cross_val_score(
    gnb,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
gnb_mean_pct = cvs_gnb.mean() * 100
gnb_std_pct = cvs_gnb.std() * 100
print("Accuracy: {:.2f}%".format(gnb_mean_pct))
print("Accuracy Standard Deviation: {:.2f}%".format(gnb_std_pct))

## Applying XGBoost model

In [None]:
from xgboost import XGBClassifier

# XGBoost with default settings, fitted on the scaled training split.
classifier = XGBClassifier()
classifier.fit(X=X_train, y=y_train)

## Applying 10-Fold CV for XGBoost

In [None]:
# 10-fold cross-validated accuracy of the XGBoost model on the training split.
cvs_cb = cross_val_score(
    classifier,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
xgb_mean_pct = cvs_cb.mean() * 100
xgb_std_pct = cvs_cb.std() * 100
print("Accuracy: {:.2f}%".format(xgb_mean_pct))
print("Accuracy Standard Deviation: {:.2f}%".format(xgb_std_pct))

In [None]:
!pip install catboost

## Applying CatBoost

In [None]:
from catboost import CatBoostClassifier
from scipy import stats
from sklearn.model_selection import RandomizedSearchCV

# verbose=0 silences CatBoost's per-iteration training log, which would otherwise be
# emitted for every one of the candidate fits below.
catboost_clf = CatBoostClassifier(verbose=0)

# Search space: discrete choices for tree count and depth, continuous uniform ranges
# for the learning rate (0.001 to 0.1) and the L2 leaf regularisation (1 to 10).
catboost_hparams = {
    'n_estimators': [
        50, 100, 150, 200, 250, 300
    ],
    'depth': [
        4, 6, 8, 10, 12
    ],
    'learning_rate': stats.uniform(
        loc=0.001, scale=0.099
    ),
    'l2_leaf_reg': stats.uniform(
        loc=1, scale=9
    )
}

# 50 random candidates, each scored by 10-fold cross-validated accuracy.
catboost_cv = RandomizedSearchCV(
    estimator=catboost_clf,
    param_distributions=catboost_hparams,
    n_iter=50,
    scoring='accuracy',
    n_jobs=-1,
    cv=10,
    verbose=0,
)

# Fit on the numeric training arrays that every other model in this notebook uses,
# rather than on the raw X / y frames, so the results are comparable.
catboost_cv.fit(X=X_train, y=y_train)

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validate the whole randomized search (each outer fold re-runs the search),
# using the same training arrays as every other model in this notebook instead of
# the raw X / y frames.
cvs_cat = cross_val_score(catboost_cv, X=X_train, y=y_train, scoring='accuracy', cv=10, n_jobs=-1)

In [None]:
# Summarise the CatBoost cross-validation scores as percentages.
cat_mean_pct = cvs_cat.mean() * 100
cat_std_pct = cvs_cat.std() * 100
print("Accuracy: {:.2f}%".format(cat_mean_pct))
print("Accuracy Standard Deviation: {:.2f}%".format(cat_std_pct))

## Applying Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Entropy-split random forest, trees capped at depth 15, using all CPU cores.
rfc = RandomForestClassifier(
    criterion='entropy',
    max_depth=15,
    n_jobs=-1,
)
rfc.fit(X=X_train, y=y_train)

## Applying 10-Fold CV for RFC

In [None]:
# 10-fold cross-validated accuracy of the random forest on the training split.
cvs_rfc = cross_val_score(
    rfc,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
rfc_mean_pct = cvs_rfc.mean() * 100
rfc_std_pct = cvs_rfc.std() * 100
print("Accuracy: {:.2f}%".format(rfc_mean_pct))
print("Accuracy Standard Deviation: {:.2f}%".format(rfc_std_pct))

## Implementing SVM Classification

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier fitted on the scaled training split.
svc = SVC(
    kernel='rbf',
    C=0.25,
    gamma=0.2,
)
svc.fit(X=X_train, y=y_train)


## Applying 10-Fold CV for SVC

In [None]:
# 10-fold cross-validated accuracy of the SVC on the training split.
cvs_svc = cross_val_score(
    svc,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
svc_mean_pct = cvs_svc.mean() * 100
svc_std_pct = cvs_svc.std() * 100
print("Accuracy: {:.2f}%".format(svc_mean_pct))
print("Accuracy Standard Deviation: {:.2f}%".format(svc_std_pct))

## Applying Grid Search CV for Hyperparameter Tuning of SVC

In [None]:
from sklearn.model_selection import GridSearchCV

# Two sub-grids: a linear kernel (C only) and an RBF kernel (C and gamma).
parameters = [
    {
        'C': [0.25, 0.5, 0.75, 1],
        'kernel': ['linear'],
    },
    {
        'C': [0.25, 0.5, 0.75, 1],
        'kernel': ['rbf'],
        'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    },
]

# The grid holds SVC hyperparameters, so the search must run over the SVC estimator.
# (`classifier` is the XGBoost model, for which C / kernel / gamma have no meaning.)
gs = GridSearchCV(svc, param_grid=parameters, scoring='accuracy', cv=10, n_jobs=-1)
gs.fit(X_train, y_train)
best_accuracy = gs.best_score_
best_params = gs.best_params_
print("Best Accuracy: {:.2f}%".format(best_accuracy*100))
print("Best Parameters: {}".format(best_params))

## Applying KNN Classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Six-neighbour classifier with distance-weighted votes, fitted on the training split.
knc = KNeighborsClassifier(
    n_neighbors=6,
    weights='distance',
)
knc.fit(X=X_train, y=y_train)

## Applying 10-fold CV for KNN Classification

In [None]:
# 10-fold cross-validated accuracy of the KNN classifier on the training split.
cvs_knc = cross_val_score(knc, X=X_train, y=y_train, scoring='accuracy', cv=10, n_jobs=-1)
# Report the KNN scores (the SVC scores were printed here before by mistake).
print("Accuracy: {:.2f}%".format(cvs_knc.mean()*100))
print("Accuracy Standard Deviation: {:.2f}%".format(cvs_knc.std()*100))

## Output

The ANN model is chosen because it achieved a better average accuracy and a lower standard deviation during cross-validation. We can now use the ANN to predict the test set.

In [None]:
# Threshold the predicted probabilities at 0.5 and flatten them into a plain list of
# 0/1 ints. This is done with vectorised operations: calling int() on each
# one-element row of the prediction array is deprecated in recent NumPy releases.
# NOTE(review): cross-validation above was run on best_models[0] while the prediction
# uses best_models[3] — confirm that index 3 is the intended model.
y_pred = (best_models[3].predict(test_set) > 0.5).astype(int).ravel().tolist()
print(y_pred)

In [None]:
# Re-read the raw test file to recover the PassengerId column for the submission.
test = pd.read_csv("test.csv")
passenid = test.loc[:, 'PassengerId']
print(passenid)
print(y_pred)

# Two-column submission frame: passenger id alongside the predicted label.
output = pd.DataFrame({'PassengerId': passenid}).assign(Survived=y_pred)
output.to_csv('submission_without_fit2.csv', index=False)