In [25]:
import pandas as pd
import copy

In [26]:
df = pd.read_csv('data/train.csv')
print(list(df))

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


# Model

Steps:
1) Import data
2) Scale feature set
3) Train Model

In [27]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [28]:
print(list(df_test))

['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [29]:
def scale_data(scaler, df_train, df_test, ls_features_to_scale):
    df_train_features = df_train[ls_features_to_scale]
    df_test_features = df_test[ls_features_to_scale]

    scaler.fit(df_train_features)

    train_features_scaled = scaler.transform(df_train_features)
    test_features_scaled = scaler.transform(df_test_features)

    return {'train_data': train_features_scaled, 'test_data': test_features_scaled}



In [30]:
def kaggle_format(predictions, df_test):
    ls_formatted_predictions = []
    for idx in range(0, len(predictions)):
        passenger_id = df_test.loc[df_test.index == idx, 'PassengerId'].values[0]
        ls_formatted_predictions.append([passenger_id, predictions[idx]])

    return ls_formatted_predictions

In [44]:
def one_hot_encode(df, ls_variables_to_encode):
    df_encoded = copy.copy(df)

    for var in ls_variables_to_encode:
        df_encoded = pd.get_dummies(df, columns=var, dtype=int, prefix=var)

    return df_encoded

In [45]:
def save_kaggle_file(ls_formatted_predictions, filename_prefix):
    save_path = 'data/{}_submission.csv'.format(filename_prefix)
    pd.DataFrame(ls_formatted_predictions, columns=['PassengerId', 'Survived']).to_csv(save_path, index=False, encoding='utf-8')

## SVM 
- Data is small so SVM is good option

In [46]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [47]:
scaler = StandardScaler()

In [39]:
ls_features_to_include = ['Parch', 'SibSp', 'Pclass']
target = 'Survived'

df_train_target = df_train[target]

In [11]:
dct_scaled_data = scale_data(scaler=scaler, df_train=df_train, df_test=df_test, ls_features_to_scale=ls_features_to_include)

df_train_scaled = dct_scaled_data['train_data']
df_test_scaled = dct_scaled_data['test_data']

In [12]:
svc_classifier = SVC(class_weight='balanced')

In [13]:
svc_classifier.fit(df_train_scaled, df_train_target.values)

In [14]:
predictions = svc_classifier.predict(df_test_scaled)

In [15]:
ls_formatted_predictions = kaggle_format(predictions=predictions, df_test=df_test)
save_kaggle_file(ls_formatted_predictions, filename_prefix='svm')

# Decision Tree

In [48]:
print(list(df_train))

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [49]:
df_train_encoded = one_hot_encode(df=df_train, ls_variables_to_encode=['Embarked'])
df_test_encoded = one_hot_encode(df=df_test, ls_variables_to_encode=['Embarked'])

TypeError: Input must be a list-like for parameter `columns`

In [50]:
df_train_encoded

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
886,0,0,1
887,0,0,1
888,0,0,1
889,1,0,0


In [18]:
from sklearn import tree

In [19]:
decision_tree_classifier = tree.DecisionTreeClassifier()

In [23]:
ls_features_to_drop = ['PassengerId', 'Name', 'Sex', 'Ticket']
df_train_features = df_train.drop(ls_features_to_drop, axis=1)
df_train_target = df_train[target]
df_test_features = df_test.drop(ls_features_to_drop, axis=1)

In [24]:
decision_tree_classifier = decision_tree_classifier.fit(df_train_features.values, df_train_target.values)

ValueError: could not convert string to float: 'C85'