In [157]:
import pandas as pd
import numpy as np
import copy

In [76]:
df = pd.read_csv('data/train.csv')
print(list(df))

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


# Model

Steps:
1) Import data
2) Scale feature set
3) Train Model

In [77]:
df_train = pd.read_csv('data/train_imputed.csv')
df_test = pd.read_csv('data/test_imputed.csv')

target = 'Survived'

In [102]:
print(len(df_train), len(df_test))

891 418


In [78]:
print(list(df_test))

['Age', 'SibSp', 'Parch', 'Fare', 'SibSp_binary', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Age_group_Adult', 'Age_group_Child', 'Age_group_Elderly', 'Age_group_Teenager', 'Age_group_Young Adult', 'PassengerId']


In [79]:
def scale_data(scaler, df_train, df_test, ls_features_to_scale):
    df_train_features = df_train[ls_features_to_scale]
    df_test_features = df_test[ls_features_to_scale]

    scaler.fit(df_train_features)

    train_features_scaled = scaler.transform(df_train_features)
    test_features_scaled = scaler.transform(df_test_features)

    return {'train_data': train_features_scaled, 'test_data': test_features_scaled}



In [80]:
def kaggle_format(predictions, df_test):
    ls_formatted_predictions = []
    for idx in range(0, len(predictions)):
        passenger_id = df_test.loc[df_test.index == idx, 'PassengerId'].values[0]
        ls_formatted_predictions.append([passenger_id, predictions[idx]])

    return ls_formatted_predictions

In [81]:
def save_kaggle_file(ls_formatted_predictions, filename_prefix):
    save_path = 'data/{}_submission.csv'.format(filename_prefix)
    pd.DataFrame(ls_formatted_predictions, columns=['PassengerId', 'Survived']).to_csv(save_path, index=False, encoding='utf-8')

In [82]:
print(sorted(list(df_train)))

['Age', 'Age_group_Adult', 'Age_group_Child', 'Age_group_Elderly', 'Age_group_Teenager', 'Age_group_Young Adult', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Fare', 'Parch', 'PassengerId', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'SibSp', 'SibSp_binary', 'Survived']


# Random Forest w/ Feature Importance

In [113]:
df_train = pd.read_csv('data/train_imputed.csv')
df_test = pd.read_csv('data/test_imputed.csv')

In [114]:
print(list(df_train))

['Age', 'SibSp', 'Parch', 'Fare', 'SibSp_binary', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Age_group_Adult', 'Age_group_Child', 'Age_group_Elderly', 'Age_group_Teenager', 'Age_group_Young Adult', 'Survived', 'PassengerId']


In [122]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt

In [123]:
forest = RandomForestClassifier(random_state=0)
df_train_target = df_train[target]
df_train_features = df_train.drop(['PassengerId', target], axis=1)
df_test_features = df_test.drop(['PassengerId'], axis=1)

In [124]:
forest.fit(df_train_features.values, df_train_target.values)

In [128]:
forest_importances = pd.Series(forest.feature_importances_, index=list(df_train_features))

importances = pd.DataFrame({'feature':df_train_features.columns,'importance':np.round(forest.feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')
importances.head(15)

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
Age,0.228
Fare,0.182
Sex_male,0.152
Sex_female,0.14
Pclass_3,0.06
SibSp,0.041
Parch,0.036
Pclass_1,0.03
Embarked_S,0.017
Age_group_Child,0.017


## SVM 
- Data is small so SVM is good option

In [129]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [130]:
df_train = pd.read_csv('data/train_imputed.csv')
df_test = pd.read_csv('data/test_imputed.csv')
scaler = StandardScaler()

In [131]:
print(sorted(list(df_train)))

['Age', 'Age_group_Adult', 'Age_group_Child', 'Age_group_Elderly', 'Age_group_Teenager', 'Age_group_Young Adult', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Fare', 'Parch', 'PassengerId', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'SibSp', 'SibSp_binary', 'Survived']


In [132]:
ls_features_to_include = ['Fare', 'Age', 'Sex_female', 'Sex_male']

df_train_target = df_train[target]

In [133]:
dct_scaled_data = scale_data(scaler=scaler, df_train=df_train, df_test=df_test, ls_features_to_scale=ls_features_to_include)

df_train_scaled = dct_scaled_data['train_data']
df_test_scaled = dct_scaled_data['test_data']

In [134]:
param_grid = [
  {'C': [1, 10, 100, 1000], 'gamma': [0.1, 0.01, 0.001], 'kernel': ['linear']}, 
  {'C': [1, 10, 100, 1000], 'gamma': [0.1, 0.01, 0.001], 'kernel': ['rbf']},
 ]
svc_classifier = SVC()

svc_grid_search = GridSearchCV(svc_classifier, param_grid=param_grid)

In [135]:
svc_grid_search.fit(df_train_scaled, df_train_target.values)

In [136]:
print(svc_grid_search.best_params_)

{'C': 1000, 'gamma': 0.1, 'kernel': 'rbf'}


In [137]:
predictions = svc_grid_search.predict(df_test_scaled)

In [138]:
ls_formatted_predictions = kaggle_format(predictions=predictions, df_test=df_test)
save_kaggle_file(ls_formatted_predictions, filename_prefix='svm')

# Decision Tree

In [88]:
df_train = pd.read_csv('data/train_imputed.csv')
df_test = pd.read_csv('data/test_imputed.csv')

In [96]:
print(list(df_train))

['Age', 'SibSp', 'Parch', 'Fare', 'SibSp_binary', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Age_group_Adult', 'Age_group_Child', 'Age_group_Elderly', 'Age_group_Teenager', 'Age_group_Young Adult', 'Survived', 'PassengerId']


In [90]:
from sklearn import tree

In [91]:
decision_tree_classifier = tree.DecisionTreeClassifier()

In [92]:
df_train_target = df_train[target]
df_train_features = df_train.drop(['PassengerId', target], axis=1)
df_test_features = df_test.drop(['PassengerId'], axis=1)

In [93]:
decision_tree_classifier = decision_tree_classifier.fit(df_train_features.values, df_train_target.values)

In [94]:
decision_tree_classifier.__dir__()

['criterion',
 'splitter',
 'max_depth',
 'min_samples_split',
 'min_samples_leaf',
 'min_weight_fraction_leaf',
 'max_features',
 'max_leaf_nodes',
 'random_state',
 'min_impurity_decrease',
 'class_weight',
 'ccp_alpha',
 'n_features_in_',
 'n_outputs_',
 'classes_',
 'n_classes_',
 'max_features_',
 'tree_',
 '__module__',
 '__annotations__',
 '__doc__',
 '_parameter_constraints',
 '__init__',
 'fit',
 'predict_proba',
 'predict_log_proba',
 '_more_tags',
 'set_fit_request',
 'set_predict_request',
 'set_predict_proba_request',
 'set_score_request',
 '__abstractmethods__',
 '_abc_impl',
 '_estimator_type',
 'score',
 '__dict__',
 '__weakref__',
 '__new__',
 '__repr__',
 '__hash__',
 '__str__',
 '__getattribute__',
 '__setattr__',
 '__delattr__',
 '__lt__',
 '__le__',
 '__eq__',
 '__ne__',
 '__gt__',
 '__ge__',
 '__reduce_ex__',
 '__reduce__',
 '__getstate__',
 '__subclasshook__',
 '__init_subclass__',
 '__format__',
 '__sizeof__',
 '__dir__',
 '__class__',
 'get_depth',
 'get_n_leav

In [51]:
predictions = decision_tree_classifier.predict(df_test_features.values)

In [52]:
ls_formatted_predictions = kaggle_format(predictions=predictions, df_test=df_test)
save_kaggle_file(ls_formatted_predictions, filename_prefix='decision_tree')

# Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression

In [14]:
print(sorted(list(df_train)))

['Age', 'Age_group_Adult', 'Age_group_Child', 'Age_group_Elderly', 'Age_group_Teenager', 'Age_group_Young Adult', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Fare', 'Parch', 'PassengerId', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'SibSp', 'SibSp_binary', 'Survived']


In [12]:
logreg = LogisticRegression(random_state=0)

In [15]:
ls_features_to_keep = ['Parch', 'SibSp_binary', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Age']
df_train_target = df_train[target]
df_train_features = df_train[ls_features_to_keep]
df_test_features = df_test[ls_features_to_keep]

In [16]:
logreg = logreg.fit(df_train_features.values, df_train_target.values)

In [17]:
df_test_features.isnull().sum()

Parch           0
SibSp_binary    0
Pclass_1        0
Pclass_2        0
Pclass_3        0
Age             0
dtype: int64

In [18]:
predictions = logreg.predict(df_test_features.values)

In [19]:
ls_formatted_predictions = kaggle_format(predictions=predictions, df_test=df_test)
save_kaggle_file(ls_formatted_predictions, filename_prefix='logistic_regression')

# Neural Network

In [179]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

from keras import backend as K 

In [180]:
print(sorted(list(df_train)))

['Age', 'Age_group_Adult', 'Age_group_Child', 'Age_group_Elderly', 'Age_group_Teenager', 'Age_group_Young Adult', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Fare', 'Parch', 'PassengerId', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'SibSp', 'SibSp_binary', 'Survived']


In [181]:
ls_features = ['Age', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'SibSp_binary']

In [182]:
df_train_features = df_train[ls_features]
df_train_target = df_train[target]
df_test_features = df_test[ls_features]

In [183]:
print(len(df_train_features))

891


In [184]:
df_train_target = pd.get_dummies(df_train_target, dtype=float)
df_train_target

Unnamed: 0,0,1
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,1.0,0.0
...,...,...
886,1.0,0.0
887,0.0,1.0
888,1.0,0.0
889,0.0,1.0


In [203]:
num_inputs = len(list(df_train_features))
num_inputs

11

In [209]:

model = Sequential()
model.add(Dense(11, activation='relu', kernel_initializer='he_normal', input_shape=(num_inputs,)))
model.add(Dense(5, activation='relu', kernel_initializer='he_normal'))
#One hot encoded target variable, so softmax activation. If catcoded then would use sigmoid
model.add(Dense(2, activation='softmax'))

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 11)                132       
                                                                 
 dense_1 (Dense)             (None, 5)                 60        
                                                                 
 dense_2 (Dense)             (None, 2)                 12        
                                                                 
Total params: 204 (816.00 Byte)
Trainable params: 204 (816.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [210]:
model.fit(df_train_features.values, df_train_target.values, epochs=500, batch_size=16, verbose=1)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.src.callbacks.History at 0x11e3cb710>

In [211]:
predictions = model.predict(df_test_features.values)
K.clear_session()

 1/14 [=>............................] - ETA: 0s



In [212]:
predictions = np.argmax(predictions, axis=1)

In [213]:
predictions

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [214]:
ls_formatted_predictions = kaggle_format(predictions=predictions, df_test=df_test)
ls_formatted_predictions

[[892, 0],
 [893, 0],
 [894, 0],
 [895, 0],
 [896, 0],
 [897, 0],
 [898, 1],
 [899, 0],
 [900, 1],
 [901, 0],
 [902, 0],
 [903, 0],
 [904, 1],
 [905, 0],
 [906, 1],
 [907, 1],
 [908, 0],
 [909, 0],
 [910, 0],
 [911, 0],
 [912, 0],
 [913, 0],
 [914, 1],
 [915, 0],
 [916, 1],
 [917, 0],
 [918, 1],
 [919, 0],
 [920, 0],
 [921, 0],
 [922, 0],
 [923, 0],
 [924, 0],
 [925, 0],
 [926, 0],
 [927, 0],
 [928, 0],
 [929, 0],
 [930, 0],
 [931, 0],
 [932, 0],
 [933, 0],
 [934, 0],
 [935, 1],
 [936, 1],
 [937, 0],
 [938, 0],
 [939, 0],
 [940, 1],
 [941, 0],
 [942, 0],
 [943, 0],
 [944, 1],
 [945, 1],
 [946, 0],
 [947, 0],
 [948, 0],
 [949, 0],
 [950, 0],
 [951, 1],
 [952, 0],
 [953, 0],
 [954, 0],
 [955, 1],
 [956, 1],
 [957, 1],
 [958, 1],
 [959, 0],
 [960, 0],
 [961, 1],
 [962, 1],
 [963, 0],
 [964, 0],
 [965, 0],
 [966, 1],
 [967, 1],
 [968, 0],
 [969, 1],
 [970, 0],
 [971, 1],
 [972, 0],
 [973, 0],
 [974, 0],
 [975, 0],
 [976, 0],
 [977, 0],
 [978, 1],
 [979, 1],
 [980, 1],
 [981, 0],
 [982, 0],

In [215]:
save_kaggle_file(ls_formatted_predictions, filename_prefix='neural_network')