In [1]:
import sys

import sklearn
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import (LogisticRegression, Ridge, Lasso)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

%matplotlib inline

warnings.filterwarnings('ignore')

pd.options.display.max_rows = 20

In [2]:
data = pd.read_csv('../data/train-titanic.csv', index_col='PassengerId')

In [3]:
data_trans = data.copy()

In [4]:
def name_len(df):
    df['NameLen'] = df['Name'].str.len()

In [5]:
# name_len(data_trans)

In [6]:
def name_binning(df):
    df['ShortName'] = (df.NameLen <= 25).astype(float)
    df['LongName'] = (df.NameLen > 25).astype(float)

In [7]:
# name_binning(data_trans)

In [8]:
def add_title(df):
    df['Title'] = df['Name'].str.extract('([A-Za-z]+)\.', expand=False)

In [9]:
major_titles = ['Mr', 'Mrs', 'Miss', 'Master']

def edit_titles(df):
    df.loc[df.Title == 'Mlle', 'Title'] = 'Miss'
    df.loc[df.Title == 'Mme', 'Title']  = 'Mrs'
    df.loc[df.Title == 'Ms', 'Title']   = 'Miss'
    df.loc[~df.Title.isin(major_titles), 'Title'] = 'Unknown'
    
major_titles.append('Unknown')

In [10]:
add_title(data_trans)
edit_titles(data_trans)

In [11]:
data_trans.groupby('Title').count().sort_values('Age')

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Unknown,23,23,23,23,22,23,23,23,23,10,23
Master,40,40,40,40,36,40,40,40,40,7,40
Mrs,126,126,126,126,109,126,126,126,126,45,125
Miss,185,185,185,185,149,185,185,185,185,49,184
Mr,517,517,517,517,398,517,517,517,517,93,517


In [12]:
def fill_age(df):
    for title in major_titles:
        avg_age = df[df.Title == title]['Age'].mean()
        df.loc[(df.Title == title) & (df.Age.isnull()), 'Age'] = avg_age
        

In [13]:
fill_age(data_trans)

In [14]:
data_trans['Embarked'].fillna('Q', inplace=True);

In [15]:
data_trans.isnull().sum().sort_values()

Survived      0
Pclass        0
Name          0
Sex           0
Age           0
SibSp         0
Parch         0
Ticket        0
Fare          0
Embarked      0
Title         0
Cabin       687
dtype: int64

In [16]:
print(data_trans[data_trans.Pclass == 1]['Fare'].mean())
print(data_trans[data_trans.Pclass == 2]['Fare'].mean())
print(data_trans[data_trans.Pclass == 3]['Fare'].mean())

84.1546875
20.6621831522
13.6755501018


In [17]:
def adjust_tickets(df):
    tickets = df['Ticket'].unique()
    for ticket in tickets:
        ticket_rows = df[df['Ticket'] == ticket]
        same_tickes_num = len(ticket_rows)
        if (same_tickes_num > 1):
            fare = ticket_rows['Fare'].mean()
            new_fare = fare / same_tickes_num
            df.loc[df['Ticket'] == ticket, 'Fare'] = new_fare

In [18]:
adjust_tickets(data_trans)

In [19]:
print(data_trans[data_trans.Pclass == 1]['Fare'].mean())
print(data_trans[data_trans.Pclass == 2]['Fare'].mean())
print(data_trans[data_trans.Pclass == 3]['Fare'].mean())

43.6503472222
13.3225994565
8.08585692464


In [20]:
data_trans['FamilySize'] = data_trans['SibSp'] + data_trans['Parch'] + 1

In [21]:
def one_hot_title(df):
    for title in major_titles:
        df['Is' + title] = (df.Title == title).astype(float)

In [22]:
one_hot_title(data_trans)

In [23]:
def one_hot_embark(df):
    for city in ['S', 'C', 'Q']:
        df['Embarked' + city] = (df.Embarked == city).astype(float)

In [24]:
one_hot_embark(data_trans)

In [25]:
def one_hot_class(df):
    for cls in [1, 2, 3]:
        df['Class' + str(cls)] = (df.Pclass == cls).astype(float)

In [26]:
one_hot_class(data_trans)

In [27]:
def one_hot_sex(df):
    for sex in ['male', 'female']:
        df['Is' + sex.title()] = (df.Sex == sex).astype(float)

In [28]:
one_hot_sex(data_trans)

In [29]:
def one_hot_family(df):
    df['Alone'] = (df.FamilySize == 1).astype(float)
    df['SmallFamily'] = ((df.FamilySize >= 2) & (df.FamilySize < 5)).astype(float)
    df['BigFamily'] = (df.FamilySize >= 5).astype(float)

In [30]:
one_hot_family(data_trans)

In [31]:
def one_hot_age(df):
    df['Child'] = (df.Age <= 12).astype(float)
    df['YoundAdult'] = ((df.Age > 12) & (df.FamilySize <= 25)).astype(float)
    df['Adult'] = ((df.FamilySize > 25) & (df.FamilySize <= 50)).astype(float)
    df['Elderly'] = (df.Age > 50).astype(float)

In [32]:
def data_and_target(df):
    X = df
    X = X.drop('Survived',1)
    y = df['Survived']
    
    print('X shape: {}, y shape {}'.format(X.shape, y.shape))
    
    return (X, y)

In [33]:
one_hot_age(data_trans)

In [34]:
def one_hot_fare(df):
    df['FareCat1'] = (df.Fare <= 7.73).astype(float)
    df['FareCat2'] = ((df.Fare > 7.73) & (df.Fare <= 8.05)).astype(float)
    df['FareCat3'] = ((df.Fare > 8.05) & (df.Fare <= 11.72)).astype(float)
    df['FareCat4'] = ((df.Fare > 11.73) & (df.Fare <= 26.55)).astype(float)
    df['FareCat5'] = (df.Fare > 26.55).astype(float)

In [35]:
# one_hot_fare(data_trans)

In [36]:
data_trans.columns

Index(['Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Cabin', 'Embarked', 'Title', 'FamilySize', 'IsMr', 'IsMrs',
       'IsMiss', 'IsMaster', 'IsUnknown', 'EmbarkedS', 'EmbarkedC',
       'EmbarkedQ', 'Class1', 'Class2', 'Class3', 'IsMale', 'IsFemale',
       'Alone', 'SmallFamily', 'BigFamily', 'Child', 'YoundAdult', 'Adult',
       'Elderly'],
      dtype='object')

In [37]:
data_useful = data_trans.drop(['Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'Title', 'Age'],1)

In [38]:
(X, y) = data_and_target(data_useful)

X shape: (891, 22), y shape (891,)


In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=12345, stratify=y)

y_train.mean(), y_test.mean()

(0.38323353293413176, 0.38565022421524664)

In [40]:
model = LogisticRegression().fit(X_train, y_train)
print("train score:", model.score(X_train, y_train))
print("test score: ", model.score(X_test, y_test))

train score: 0.811377245509
test score:  0.878923766816


In [41]:
forest = RandomForestClassifier(max_depth=4, n_estimators=50, random_state = 0).fit(X_train, y_train)
print("train score:", forest.score(X_train, y_train))
print("test score: ", forest.score(X_test, y_test))

train score: 0.823353293413
test score:  0.874439461883


In [42]:
search = GridSearchCV(forest, {'n_estimators': [10, 30, 50, 70, 100, 200],
                              'max_depth': [2, 4, 6, 8, 10, 12, 15],
                              #'random_state': [0,1,123, 1234, 12345]
                              })
search.fit(X, y)

pd.DataFrame(search.cv_results_)[['rank_test_score', 'mean_test_score', 'params']].sort_values(by='rank_test_score').head(20)

Unnamed: 0,rank_test_score,mean_test_score,params
6,1,0.833895,"{'max_depth': 4, 'n_estimators': 10}"
8,2,0.832772,"{'max_depth': 4, 'n_estimators': 50}"
11,2,0.832772,"{'max_depth': 4, 'n_estimators': 200}"
9,2,0.832772,"{'max_depth': 4, 'n_estimators': 70}"
7,5,0.83165,"{'max_depth': 4, 'n_estimators': 30}"
10,5,0.83165,"{'max_depth': 4, 'n_estimators': 100}"
12,7,0.830527,"{'max_depth': 6, 'n_estimators': 10}"
14,8,0.823793,"{'max_depth': 6, 'n_estimators': 50}"
13,8,0.823793,"{'max_depth': 6, 'n_estimators': 30}"
17,10,0.817059,"{'max_depth': 6, 'n_estimators': 200}"


In [43]:
test = pd.read_csv('../data/test-titanic.csv', index_col=['PassengerId'])

add_title(test)
edit_titles(test)
fill_age(test)
test['Embarked'].fillna('Q', inplace=True)
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())
adjust_tickets(test)
one_hot_title(test)
one_hot_embark(test)
one_hot_class(test)
one_hot_sex(test)
one_hot_family(test)
one_hot_age(test)
# one_hot_fare(test)
# name_len(test)

test = test.drop(['Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'Title', 'Age'],1)

In [44]:
predictions = forest.predict(test)
frame = pd.DataFrame({
    'PassengerId': pd.read_csv('../data/test-titanic.csv').PassengerId,
    'Survived': predictions
})
frame = frame.set_index('PassengerId')
frame.to_csv('~/Desktop/pred1.csv')
frame.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1
