In [32]:
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 29 18:37:36 2019
https://www.kaggle.com/c/titanic/overview
"""

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import FileLink
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import titanic_helper as H

# Paths used by this notebook; the input path is relative to the working directory.
WORKING_DIR = r'/kaggle/working'
TRAIN_CSV = "../input/titanic/train.csv"

# Switch to the working directory first so the relative input path resolves from there.
os.chdir(WORKING_DIR)

# Raw training data, one row per passenger.
titanic_df = pd.read_csv(TRAIN_CSV)

*PassengerId* is our index.

*Name* and *Ticket* columns are not needed for training.

*Cabin* has null values (NaN) for 687 records out of 891, so it carries little usable information.
We therefore drop it.


*Age* has null values for 177 records out of 891.
We should find meaningful values to replace them with.
I am replacing them with the median age for each *Sex*:
29 for male and 27 for female.

*Embarked* has null values for 2 records out of 891. We can either:
1. drop those 2 rows, or
2. assume the value is 'S', since 'S' occurs 644 times, 'C' 168 times and 'Q' 77 times.

For now we drop them.

In [33]:
# Index by passenger and discard the columns that are not used for training
# (Name, Ticket) or are mostly empty (Cabin).
titanic_df = (
    titanic_df
    .set_index('PassengerId')
    .drop(columns=["Name", "Ticket", "Cabin"])
)

# Fill missing ages with a fixed per-sex value (29 for male, 27 for female).
age_is_missing = titanic_df["Age"].isna()
is_male = titanic_df["Sex"] == "male"
is_female = titanic_df["Sex"] == "female"
titanic_df.loc[is_male & age_is_missing, "Age"] = 29.0
titanic_df.loc[is_female & age_is_missing, "Age"] = 27.0

# Keep only the rows whose port of embarkation is known.
titanic_df = titanic_df[titanic_df["Embarked"].notna()]

*Sex* and *Embarked* columns hold nominal values and need to be one-hot encoded.
We encode them, drop the original columns from our dataframe, append the encoded columns, and then split the dataframe into features and target.

*Survived* is our target, rest are our features.

In [34]:
# Nominal columns to one-hot encode, and the labels given to the encoded
# columns (the encoder emits categories in sorted order within each column).
categorical_cols = ["Sex", "Embarked"]
encoded_names = ["Female", "Male", "Embarked-C", "Embarked-Q", "Embarked-S"]

# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was later removed; try the new name first
# and fall back so the cell runs on both old and new releases.
try:
    enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit the encoder on the training data; `enc` is reused later to transform the
# test data. The encoded frame shares the passenger index so it aligns on concat.
titanic_df_enc = pd.DataFrame(
    enc.fit_transform(titanic_df[categorical_cols]),
    index=titanic_df.index,
)

# Swap the raw nominal columns for their encoded counterparts.
titanic_df = pd.concat(
    [titanic_df.drop(columns=categorical_cols), titanic_df_enc],
    axis=1,
)

# The encoded columns come out with integer labels; give them readable names.
cols = titanic_df.columns.tolist()
cols[-len(encoded_names):] = encoded_names
titanic_df.columns = cols

# Select the target by label rather than by position so a column reordering
# cannot silently change which column is the target.
y_train = np.array(titanic_df["Survived"])
X_train = np.array(titanic_df.drop(columns="Survived"))

Our data is now cleaned. We now standardize the features using `StandardScaler`.

In [35]:
# Fit the scaler on the training features, then standardize them with it.
# `sc` is kept because the test features are transformed with it later.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)

We now fetch our best fine-tuned model and fit it on the training data:

In [36]:
# Build the tuned classifier from the project-local titanic_helper module (imported as H).
# NOTE(review): H.best_svm() is not visible here; the recorded cell output shows
# SVC(C=1, gamma='auto', kernel='rbf', ...), so it presumably returns an unfitted
# scikit-learn SVC with those settings - confirm against titanic_helper.
best_svm = H.best_svm()
# Fit on the standardized training features. This is the cell's last expression,
# so its return value is what the notebook displays as the cell output.
best_svm.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

We now process our test data.

In [37]:
# Load the test split and apply the same column drops as for the training data.
test_df = (
    pd.read_csv("../input/titanic/test.csv")
    .set_index('PassengerId')
    .drop(columns=["Name", "Ticket", "Cabin"])
)

# Fill missing ages with the same per-sex values used for the training data.
age_is_missing = test_df["Age"].isna()
test_df.loc[age_is_missing & (test_df["Sex"] == "male"), "Age"] = 29.0
test_df.loc[age_is_missing & (test_df["Sex"] == "female"), "Age"] = 27.0

# Every test passenger needs a prediction, so impute a missing port instead of
# dropping the row; 'S' is by far the most frequent port in the training data.
test_df["Embarked"] = test_df["Embarked"].fillna("S")

# Fare is missing so replace with median for Male, 3rd class, Embarked from S
# without SibSp and Parch. Filling by column label (rather than patching a fixed
# row/column position of the feature array) stays correct if rows or columns shift.
test_df["Fare"] = test_df["Fare"].fillna(7.895)

# Encode with the encoder already fitted on the training data (transform only),
# keeping the passenger index so the encoded frame aligns on concat.
categorical_cols = ["Sex", "Embarked"]
test_df_enc = pd.DataFrame(
    enc.transform(test_df[categorical_cols]),
    index=test_df.index,
)
test_df = pd.concat(
    [test_df.drop(columns=categorical_cols), test_df_enc],
    axis=1,
)

# Same readable labels for the encoded columns as in the training frame.
cols = test_df.columns.tolist()
cols[-5:] = ["Female", "Male", "Embarked-C", "Embarked-Q", "Embarked-S"]
test_df.columns = cols

# All remaining columns are used as features; scale them with the scaler that
# was fitted on the training features.
X_test = np.array(test_df)
X_test = sc.transform(X_test)

We now make predictions on our test data.

In [38]:
# Predict, then shape both the predictions and the passenger ids as column
# vectors so they can be placed side by side.
pred = np.reshape(best_svm.predict(X_test), (-1, 1))
pass_id = np.reshape(np.array(test_df.index), (-1, 1))

# Two-column table (PassengerId, Survived), indexed by PassengerId so the id is
# written as the first CSV column.
res = pd.DataFrame(
    np.concatenate((pass_id, pred), axis=1),
    columns=["PassengerId", "Survived"],
).set_index("PassengerId")

res.to_csv(r"outputSVM.csv")

We now download our submission file.

In [41]:
# Render a link to the submission file written by the previous cell.
# FileLink is imported with the other imports at the top of the notebook.
FileLink(r"outputSVM.csv")

This gives us an accuracy of 79.4% on test data.