In [1]:
import pandas as pd

In [2]:
# Quick look at the raw training file: load it and list its column names.
df = pd.read_csv('data/train.csv')
print(df.columns.tolist())

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


# Model

Steps:
1) Import the training and test data
2) Scale the feature set
3) Train the model
4) Predict on the test set and save the submission file

In [3]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [4]:
# Load both data splits. Per the column listings printed in this notebook,
# only the training file carries the 'Survived' label column.
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [5]:
# List the test-set column names for comparison with the training file.
print(df_test.columns.tolist())

['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [6]:
def scale_data(scaler, df_train, df_test, ls_features_to_scale):
    """Scale the selected feature columns of the train and test frames.

    The scaler is fitted on the training features only and the fitted
    scaler is then applied to both splits, so no statistics from the test
    set are used when fitting.

    Parameters
    ----------
    scaler : object exposing ``fit(X)`` and ``transform(X)``
        Fitted in place by this call.
    df_train, df_test : pandas.DataFrame
        Frames that both contain every column in ``ls_features_to_scale``.
    ls_features_to_scale : list of str
        Column names to select and scale.

    Returns
    -------
    dict
        ``{'train_data': ..., 'test_data': ...}`` holding whatever
        ``scaler.transform`` returns for each split.
    """
    # Select the columns for both splits up front so a missing column
    # fails before the scaler is touched.
    features_by_split = {
        'train_data': df_train[ls_features_to_scale],
        'test_data': df_test[ls_features_to_scale],
    }

    # Fit on the training split only.
    scaler.fit(features_by_split['train_data'])

    return {
        split_name: scaler.transform(split_features)
        for split_name, split_features in features_by_split.items()
    }



In [7]:
def kaggle_format(predictions, df_test):
    """Pair each prediction with the PassengerId of the matching test row.

    Predictions are matched to rows of ``df_test`` by position (first
    prediction with first row, and so on), independent of the frame's index
    labels. This keeps the pairing correct when ``df_test`` was filtered,
    shuffled, or loaded with a non-default index.

    Parameters
    ----------
    predictions : sequence
        One predicted label per test row, in the row order of ``df_test``.
    df_test : pandas.DataFrame
        Test frame containing a ``'PassengerId'`` column.

    Returns
    -------
    list of [passenger_id, prediction]
        One two-element list per prediction. If fewer predictions than rows
        are supplied, only the leading rows are paired.

    Raises
    ------
    IndexError
        If there are more predictions than rows in ``df_test``.
    """
    passenger_ids = df_test['PassengerId'].values

    # zip() would silently drop surplus predictions, so check explicitly.
    if len(predictions) > len(passenger_ids):
        raise IndexError(
            'got {} predictions for only {} test rows'.format(
                len(predictions), len(passenger_ids)
            )
        )

    return [
        [passenger_id, prediction]
        for passenger_id, prediction in zip(passenger_ids, predictions)
    ]

In [8]:
def save_kaggle_file(ls_formatted_predictions, filename_prefix):
    """Write prediction rows to ``data/<filename_prefix>_submission.csv``.

    Parameters
    ----------
    ls_formatted_predictions : list of [passenger_id, prediction]
        Rows to write; they become the ``PassengerId`` and ``Survived``
        columns of the CSV.
    filename_prefix : str
        Prefix inserted into the output file name.

    The file is written as UTF-8 without the DataFrame index. The ``data``
    directory is not created here and must already exist.
    """
    destination = 'data/{}_submission.csv'.format(filename_prefix)
    submission = pd.DataFrame(
        ls_formatted_predictions,
        columns=['PassengerId', 'Survived'],
    )
    submission.to_csv(destination, index=False, encoding='utf-8')

## SVM 
- The dataset is small, so an SVM is a good option.

In [9]:
# Scaler instance handed to scale_data, which fits it on the training features only.
scaler = StandardScaler()

In [10]:
# Feature columns fed to the model, and the label column to predict.
ls_features_to_include = ['Parch', 'SibSp', 'Pclass']
target = 'Survived'

# Labels come from the training frame; the test columns printed earlier have no 'Survived'.
df_train_target = df_train[target]

In [11]:
# Scale the chosen feature columns of both splits with the shared scaler.
dct_scaled_data = scale_data(
    scaler=scaler,
    df_train=df_train,
    df_test=df_test,
    ls_features_to_scale=ls_features_to_include,
)

# Unpack the two scaled feature sets returned under fixed dictionary keys.
df_train_scaled, df_test_scaled = (
    dct_scaled_data['train_data'],
    dct_scaled_data['test_data'],
)

In [12]:
# class_weight='balanced' asks scikit-learn to weight classes inversely to their
# frequency in the training labels; every other SVC setting is left at its default.
svc_classifier = SVC(class_weight='balanced')

In [13]:
# Train on the scaled training features; .values passes the labels as a plain array.
svc_classifier.fit(df_train_scaled, df_train_target.values)

In [14]:
# One predicted label per row of the scaled test features, in df_test row order.
predictions = svc_classifier.predict(df_test_scaled)

In [15]:
# Pair each prediction with its PassengerId, then write data/svm_submission.csv.
ls_formatted_predictions = kaggle_format(predictions=predictions, df_test=df_test)
save_kaggle_file(ls_formatted_predictions, filename_prefix='svm')