In [134]:
import pandas as pd
import numpy as np

import seaborn as sns

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

In [115]:
# Read the labelled data, keep a handle on the label column, then discard the
# two columns that are not used anywhere below.
X_train = pd.read_csv('train.csv')
y_train = X_train['Survived']
X_train = X_train.drop(['Ticket', 'Cabin'], axis=1)

In [116]:
# Read test.csv; predictions for these rows are written to the submission
# file further down.
X_test = pd.read_csv('test.csv')

In [117]:
# Encode the categorical columns as integers and impute missing values.
#
# Every result is assigned back to its column. Calling an in-place method on
# `X_train[col]` is chained assignment: pandas 2.x emits a FutureWarning for it
# and, with Copy-on-Write enabled, the DataFrame itself is left unchanged.
sex_remap = {'male': 0, 'female': 1}
# A missing Sex is filled with 0, i.e. the same code as 'male'.
X_train['Sex'] = X_train['Sex'].replace(sex_remap).fillna(0)

embarked_remap = {'S': 0, 'C': 1, 'Q': 2}
# A missing Embarked is filled with 0, i.e. the same code as 'S'.
X_train['Embarked'] = X_train['Embarked'].replace(embarked_remap).fillna(0)

# Continuous columns are imputed with their median, count columns with 0.
X_train['Age'] = X_train['Age'].fillna(X_train['Age'].median())
X_train['SibSp'] = X_train['SibSp'].fillna(0)
X_train['Parch'] = X_train['Parch'].fillna(0)
X_train['Fare'] = X_train['Fare'].fillna(X_train['Fare'].median())

In [118]:
# HasFamily is 1 when the passenger has at least one sibling/spouse or
# parent/child aboard (SibSp > 0 or Parch > 0), otherwise 0.
X_train['HasFamily'] = np.logical_or(X_train['SibSp'] > 0, X_train['Parch'] > 0).astype(int)

# NameLength is the character count of the Name column; a missing length is
# replaced by the column median. The result is assigned back instead of using
# `inplace=True` on the selected column, which is chained assignment and does
# not update the frame under pandas Copy-on-Write.
X_train['NameLength'] = X_train['Name'].str.len()
X_train['NameLength'] = X_train['NameLength'].fillna(X_train['NameLength'].median())

In [119]:
# Columns fed to the models, in the order they appear in the feature matrix.
feature_columns = ['Pclass', 'Sex', 'Fare', 'HasFamily', 'NameLength']

# Plain arrays for scikit-learn: the feature matrix and the label vector.
X_features = X_train[feature_columns].values
target = X_train['Survived'].values

In [120]:
# Apply the same encoding and imputation steps to the test frame, reusing the
# `sex_remap` / `embarked_remap` dictionaries defined for the training data.
#
# Results are assigned back to the column: an in-place method called on
# `X_test[col]` is chained assignment, which warns in pandas 2.x and leaves the
# frame unchanged under Copy-on-Write.
X_test['Sex'] = X_test['Sex'].replace(sex_remap).fillna(0)
X_test['Embarked'] = X_test['Embarked'].replace(embarked_remap).fillna(0)

# NOTE(review): Age and Fare are imputed with the *test* medians here, whereas
# the training frame used its own medians. Consider reusing the training
# medians so both frames go through an identical transformation.
X_test['Age'] = X_test['Age'].fillna(X_test['Age'].median())
X_test['SibSp'] = X_test['SibSp'].fillna(0)
X_test['Parch'] = X_test['Parch'].fillna(0)
X_test['Fare'] = X_test['Fare'].fillna(X_test['Fare'].median())

In [121]:
# Same derived features as for the training frame.
# HasFamily is 1 when SibSp > 0 or Parch > 0, otherwise 0.
X_test['HasFamily'] = np.logical_or(X_test['SibSp'] > 0, X_test['Parch'] > 0).astype(int)

# NameLength is the character count of Name, with a missing length replaced by
# the column median. The result is assigned back rather than filled through
# `inplace=True` on the selected column (chained assignment, which does not
# update the frame under pandas Copy-on-Write).
X_test['NameLength'] = X_test['Name'].str.len()
X_test['NameLength'] = X_test['NameLength'].fillna(X_test['NameLength'].median())

In [122]:
# Same five columns, in the same order, as the training feature matrix.
test_features = X_test[
    ['Pclass', 'Sex', 'Fare', 'HasFamily', 'NameLength']
].values

In [123]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Three base models, each seeded with the same random_state.
log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(random_state=42)

# Hard voting: the ensemble is configured with voting='hard' over the three
# named base models.
base_estimators = [
    ('lr', log_clf),
    ('rf', rnd_clf),
    ('svc', svm_clf),
]
voting_clf = VotingClassifier(estimators=base_estimators, voting='hard')

# Left as the final expression so the fitted estimator is shown as cell output.
voting_clf.fit(X_features, target)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomFor...f',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False))],
         n_jobs=1, voting='hard', weights=None)

In [124]:
from sklearn.metrics import accuracy_score

# Refit every model on the full training matrix and keep each model's test-set
# predictions under its own key. Previously a single `y_pred` was overwritten on
# every iteration, so the submission silently depended on which classifier
# happened to be last in the tuple.
models_by_name = {
    'lr': log_clf,
    'rf': rnd_clf,
    'svc': svm_clf,
    'voting': voting_clf,
}
test_predictions = {}
for name, clf in models_by_name.items():
    clf.fit(X_features, target)
    test_predictions[name] = clf.predict(test_features)

# The submission uses the voting ensemble, which is what the original loop's
# final iteration left in `y_pred`.
y_pred = test_predictions['voting']

In [125]:
# Two-column submission frame: the passenger id plus the predicted label.
y_export = X_test[['PassengerId']].assign(Survived=y_pred)

# Written without the index so the file holds only the two columns above.
y_export.to_csv('titanic_sub.csv', index=False)

Splitting the training set into training and validation sets for visualization purposes

In [126]:
from sklearn.model_selection import train_test_split

# Hold out part of the labelled data for validation, with a fixed seed.
# NOTE: this rebinds X_train / y_train. From here on they are the split arrays,
# no longer the DataFrame / Series loaded from train.csv, so the preprocessing
# cells above cannot be re-run after this one without reloading the data.
holdout_split = train_test_split(X_features, target, random_state=42)
X_train, X_valid, y_train, y_valid = holdout_split

Bagging Ensembles

In [129]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of 500 seeded decision trees, configured with
# max_samples=100 and bootstrap=True, using all available cores.
base_tree = DecisionTreeClassifier(random_state=42)
bag_clf = BaggingClassifier(
    base_tree,
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)

# Fit on the training split, then predict the held-out validation rows.
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_valid)

In [130]:
from sklearn.metrics import accuracy_score

# Validation accuracy of the bagging ensemble's predictions.
bagging_accuracy = accuracy_score(y_valid, y_pred)
print(bagging_accuracy)

0.811659192825


In [131]:
# Baseline for comparison: one decision tree trained on the same split and
# scored on the same validation rows as the bagging ensemble.
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)

y_pred_tree = tree_clf.predict(X_valid)
tree_accuracy = accuracy_score(y_valid, y_pred_tree)
print(tree_accuracy)

0.789237668161
