In [1]:
# Figure out where you are
import os
os.getcwd()

import pandas as pd
import numpy as np

# Read in your files from Kaggle 
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
id_survived = pd.read_csv('gender_submission.csv')

# Drop bad or uninformative columns
train_df_use = train_df.drop(columns = ['Name', 'Ticket', 'Cabin'])
test_df_use = test_df.drop(columns = ['Name', 'Ticket', 'Cabin'])

# Check point
train_df_use.head()
test_df_use.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [2]:
len(test_df)

418

In [3]:
# Reencode columns
train_df_use['Sex'] = train_df_use['Sex'].map({'male':0, 'female':1})
test_df_use['Sex'] = test_df_use['Sex'].map({'male':0, 'female':1})

train_df_use['Embarked'] = train_df_use['Embarked'].map({'S':1, 'C':2, 'Q':3}).fillna(0)
test_df_use['Embarked'] = test_df_use['Embarked'].map({'S':1, 'C':2, 'Q':3}).fillna(0)

# Check point
train_df_use['Embarked'].value_counts()
test_df_use['Embarked'].value_counts()

Embarked
1    270
2    102
3     46
Name: count, dtype: int64

In [4]:
## Remove 0 from embarked column to clean up training data, we go down from 891 t0 889 entries
train_df_use = train_df_use[train_df_use['Embarked'] != 0]
len(train_df_use)

889

In [5]:
# we dropped any Null ages and end up with 712 entries
train_df_use = train_df_use.dropna(subset=['Age'])
len(train_df_use)

712

In [6]:
# merge with information on passengers to get the survived values for test data
test_df_use = pd.merge(id_survived, test_df_use, how = 'inner', on = 'PassengerId')

In [7]:
len(test_df_use)

418

In [8]:
X_test = test_df_use.drop(columns = ['Survived'])
X_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,0,34.5,0,0,7.8292,3
1,893,3,1,47.0,1,0,7.0,1
2,894,2,0,62.0,0,0,9.6875,3
3,895,3,0,27.0,0,0,8.6625,1
4,896,3,1,22.0,1,1,12.2875,1


In [9]:
# Create X_train, y_train, X_test, y_test
X_train = train_df_use.drop(columns = ['Survived'])
y_train = train_df_use['Survived']
X_test = test_df_use.drop(columns = ['Survived'])
y_test = test_df_use['Survived']

In [10]:
y_test

0      0
1      1
2      0
3      0
4      1
      ..
413    0
414    1
415    0
416    0
417    0
Name: Survived, Length: 418, dtype: int64

In [11]:
len(test_df_use)

418

In [12]:
X_test.isnull().any()

PassengerId    False
Pclass         False
Sex            False
Age             True
SibSp          False
Parch          False
Fare            True
Embarked       False
dtype: bool

In [15]:
# replace NA values in Age and Fare with means 
X_test['Age'] = X_test['Age'].fillna(X_test['Age'].mean())
X_test['Fare'] = X_test['Fare'].fillna(X_test['Fare'].mean())

In [16]:
X_test.isnull().any()

PassengerId    False
Pclass         False
Sex            False
Age            False
SibSp          False
Parch          False
Fare           False
Embarked       False
dtype: bool

In [17]:
# import all necessary machine learning algorithms 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Gradient Boosting': GradientBoostingClassifier()
}

# Fit each classifier and evaluate the accuracy
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{name} Accuracy: {accuracy:.2f}')

Logistic Regression Accuracy: 0.91
Support Vector Machine Accuracy: 0.65
Random Forest Accuracy: 0.77
K-Nearest Neighbors Accuracy: 0.65
Naive Bayes Accuracy: 0.92
Gradient Boosting Accuracy: 0.79


In [18]:
# Take the best model and create the needed output for Kaggle
clf = GaussianNB() 
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.9210526315789473


In [19]:
y_pred = pd.DataFrame(y_pred, columns = ['Survived'])

In [21]:
y_pred['PassengerId'] = np.arange(892, 1310, 1)

In [22]:
len(y_pred)

418

In [25]:
y_pred.head()
new_order = ['PassengerId', 'Survived']
y_pred = y_pred[new_order]
y_pred.tail()

Unnamed: 0,PassengerId,Survived
413,1305,0
414,1306,1
415,1307,0
416,1308,0
417,1309,0


In [26]:
y_pred.to_csv('kaggle_titanic_results_abood.csv', index = False)