# Modeling and Validating

### Split Train, Validation and Test Data

In [67]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation

In [2]:
df = pd.read_csv('df_clean.csv')

In [3]:
df_train = df[df['Train'] == 1]
df_test = df[df['Train'] == 0]

In [4]:
X = df_train.drop(columns=['Survived'])
Y = df_train['Survived']

In [5]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

In [6]:
X_test = df_test.drop(columns=['Survived'])

In [21]:
X_train.head()

Unnamed: 0,PassengerId,Train,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
331,332,1,1,0,0,0,1,0.567832,0.0,0.0,0.055628,0,0,1
733,734,1,0,1,0,0,1,0.285983,0.0,0.0,0.025374,0,0,1
382,383,1,0,0,1,0,1,0.398722,0.0,0.0,0.015469,0,0,1
704,705,1,0,0,1,0,1,0.323563,0.125,0.0,0.01533,0,0,1
813,814,1,0,0,1,1,0,0.07303,0.5,0.222222,0.061045,0,0,1


### Decision Tree

In [7]:
dt = DecisionTreeClassifier(max_features=5, max_depth=8, min_samples_leaf=4, random_state=42)
dt.fit(X_train, Y_train)

Y_train_pred = dt.predict(X_train)
Y_val_pred = dt.predict(X_val)

print('Decision tree train and validation accuracy: ', 
      accuracy_score(Y_train, Y_train_pred), accuracy_score(Y_val, Y_val_pred))

Decision tree train and validation accuracy:  0.8595505617977528 0.8156424581005587


### Random Forest

In [8]:
rf = RandomForestClassifier(n_estimators=8, max_depth=8, min_samples_leaf=4, random_state=42)
rf.fit(X_train, Y_train)

Y_train_pred = rf.predict(X_train)
Y_val_pred = rf.predict(X_val)

print('Random forest train and validation accuracy: ', 
      accuracy_score(Y_train, Y_train_pred), accuracy_score(Y_val, Y_val_pred))

Random forest train and validation accuracy:  0.8553370786516854 0.8044692737430168


### Logistic Regression

In [9]:
lr = LogisticRegression(solver='lbfgs', multi_class='multinomial', random_state=42, max_iter=200)
lr.fit(X_train, Y_train)

Y_train_pred = lr.predict(X_train)
Y_val_pred = lr.predict(X_val)

print('Logistic regression train and validation accuracy: ', 
      accuracy_score(Y_train, Y_train_pred), accuracy_score(Y_val, Y_val_pred))

Logistic regression train and validation accuracy:  0.8061797752808989 0.7877094972067039


### Naive Bayes

In [10]:
nb = GaussianNB()
nb.fit(X_train, Y_train)

Y_train_pred = nb.predict(X_train)
Y_val_pred = nb.predict(X_val)

print('Naive bayes train and validation accuracy: ', 
      accuracy_score(Y_train, Y_train_pred), accuracy_score(Y_val, Y_val_pred))

Naive bayes train and validation accuracy:  0.7907303370786517 0.7653631284916201


### Neural Network

In [83]:
X_train_nn = X_train.drop(columns=['PassengerId', 'Train'])
X_val_nn = X_val.drop(columns=['PassengerId', 'Train'])
X_test_nn = X_test.drop(columns=['PassengerId', 'Train'])

model = Sequential()
model.add(Dense(10, input_dim=X_train_nn.shape[1], activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(5, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam')

model.fit(X_train_nn, Y_train, verbose=0, epochs=10000, batch_size=500, validation_data=(X_val_nn, Y_val))

Y_train_pred = np.round(model.predict(X_train_nn))
Y_val_pred = np.round(model.predict(X_val_nn))

print('Neural networks train and validation accuracy: ', 
      accuracy_score(Y_train, Y_train_pred), accuracy_score(Y_val, Y_val_pred))

Neural networks train and validation accuracy:  0.851123595505618 0.8268156424581006


# Submission
Since the Neural Network had the best accuracy of the models I used its predictions as the first submission, which gave me a score of 0.79425.

In [86]:
submission = X_test[['PassengerId']].copy()
submission['Survived'] = np.round(model.predict(X_test_nn), decimals=0).astype(int)

submission.to_csv('submission.csv', index=False)