In [111]:
import pandas as pd
import numpy as np
import matplotlib as mpl

In [137]:
# Load the labelled training data and one-hot encode its categorical columns.

df = pd.read_csv('./train.csv')

# 'Sex' expands to bare 'female' / 'male' columns; 'Embarked' columns get an
# 'embarked_' prefix.
dummies_sex = pd.get_dummies(df['Sex'])
dummies_embarked = pd.get_dummies(df['Embarked'], prefix='embarked')

# Append the indicator columns, then discard the raw categorical columns and
# the 'Ticket' column.
df = pd.concat([df, dummies_sex, dummies_embarked], axis=1)
df = df.drop(columns=['Sex', 'Embarked', 'Ticket'])

In [138]:
target = 'Survived'
features = [
    'Pclass', 'SibSp', 'Parch', 'Fare',
    'female', 'male',
    'embarked_C', 'embarked_Q', 'embarked_S',
]

# Report how many missing values each model input column contains.
for feature, n_missing in df[features].isna().sum().items():
    print(f'{feature}: {n_missing}')

Pclass: 0
SibSp: 0
Parch: 0
Fare: 0
female: 0
male: 0
embarked_C: 0
embarked_Q: 0
embarked_S: 0


In [139]:
# The model cells below fit on `train`. Define it here from the full labelled
# frame so the notebook runs top to bottom on a fresh kernel; predictions are
# made on the separately loaded test.csv, so no hold-out split is taken.
train = df.copy()

In [201]:
# Load the unlabelled test set and encode it the same way as the training frame.
test = pd.read_csv('./test.csv')

dummies_sex = pd.get_dummies(test['Sex'])
dummies_embarked = pd.get_dummies(test['Embarked'], prefix='embarked')

test = pd.concat([test, dummies_sex, dummies_embarked], axis=1).drop(columns=['Sex', 'Embarked', 'Ticket'])

# Report missing values per model feature *before* imputation.
for feature in features:
    print(f'{feature}: {test[feature].isna().sum()}')

# Fill each missing fare with the mean fare of that passenger's own class,
# rather than assuming every missing fare belongs to one particular class.
class_mean_fare = test.groupby('Pclass')['Fare'].transform('mean')
test['Fare'] = test['Fare'].fillna(class_mean_fare)

# Cell output: remaining NaN count for every column after imputation.
test.isna().sum()

Pclass: 0
SibSp: 0
Parch: 0
Fare: 1
female: 0
male: 0
embarked_C: 0
embarked_Q: 0
embarked_S: 0


PassengerId      0
Pclass           0
Name             0
Age             86
SibSp            0
Parch            0
Fare             0
Cabin          327
female           0
male             0
embarked_C       0
embarked_Q       0
embarked_S       0
dtype: int64

In [207]:
# Model test func

def test_model(train, test, model, features, target, scores_on=True):
    model.fit(train[features], train[target])
    y_hat = model.predict(test[features])
    
    
    if scores_on:
        train_score = model.score(train[features], train[target])
        test_score = model.score(test[features], test[target])

        print("The training score is:", train_score)
        print("The testing  score is:", test_score)
    
    else:
        return y_hat
    

In [208]:
# Random Forest classifier
from sklearn.ensemble import RandomForestClassifier

# Fix the seed so the forest, and therefore the submission file, is
# reproducible from one run to the next.
rfc = RandomForestClassifier(max_depth=4, random_state=42)

# Bind the predictions returned by test_model; without this the assignment
# below would pick up whatever `y_hat` another cell left in the kernel.
y_hat = test_model(train, test, rfc, features, target, scores_on=False)

test['Survived'] = y_hat
# index=False keeps the file to exactly the two selected columns.
test[['PassengerId', 'Survived']].to_csv('./rfc_sol.csv', index=False)

In [209]:
# kNN classifier
from sklearn.neighbors import KNeighborsClassifier
k = 5

knn = KNeighborsClassifier(n_neighbors=k)

# Bind the predictions returned by test_model; without this the assignment
# below would pick up whatever `y_hat` another cell left in the kernel.
y_hat = test_model(train, test, knn, features, target, scores_on=False)

test['Survived'] = y_hat
# index=False keeps the file to exactly the two selected columns.
test[['PassengerId', 'Survived']].to_csv('./knn_sol.csv', index=False)

In [210]:
# Gaussian Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

y_hat = test_model(train, test, gnb, features, target, scores_on=False)

test['Survived'] = y_hat
# index=False keeps the file to exactly the two selected columns instead of
# also writing the DataFrame's row index as an unnamed first column.
test[['PassengerId', 'Survived']].to_csv('./gnb_sol.csv', index=False)

In [211]:
# SVM
from sklearn.svm import SVC
svm = SVC()

# Bind the predictions returned by test_model; without this the assignment
# below would pick up whatever `y_hat` another cell left in the kernel.
y_hat = test_model(train, test, svm, features, target, scores_on=False)

test['Survived'] = y_hat
# index=False keeps the file to exactly the two selected columns.
test[['PassengerId', 'Survived']].to_csv('./svc_sol.csv', index=False)