# [Titanic - download data from here](https://www.kaggle.com/c/titanic/data)


In [115]:
import pandas as pd
import numpy as np
np.random.seed(2018)

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn import svm,model_selection, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process

import xgboost as xgb

from collections import defaultdict

import matplotlib.pyplot as plt
%matplotlib inline

## Load data

In [116]:
df_train = pd.read_csv('../input/titanic/train.csv')
df_test = pd.read_csv('../input/titanic/test.csv')

# df_all = pd.concat([df_train, df_test])

## Data analysis

In [117]:
print(df_train.shape)
print(df_test.shape)

(891, 12)
(418, 11)


In [118]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [119]:
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [120]:
df_train.sample(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
438,439,0,1,"Fortune, Mr. Mark",male,64.0,1,4,19950,263.0,C23 C25 C27,S
283,284,1,3,"Dorking, Mr. Edward Arthur",male,19.0,0,0,A/5. 10482,8.05,,S


In [121]:
df_train.pivot_table(values=["Survived"], index=["Pclass","Sex"], aggfunc=np.mean)

Unnamed: 0_level_0,Unnamed: 1_level_0,Survived
Pclass,Sex,Unnamed: 2_level_1
1,female,0.968085
1,male,0.368852
2,female,0.921053
2,male,0.157407
3,female,0.5
3,male,0.135447


## First dummy sample - ASAP to kaggle

In [122]:
feats = [ 'Pclass' ]

X = df_train[ feats ].values
y = df_train['Survived'].values

In [123]:
model = ExtraTreesClassifier(n_estimators=100, max_depth=4)
model.fit(X, y)
y_pred = model.predict(X)

score = accuracy_score(y, y_pred)

In [124]:
X = df_test[ feats ].values

predictions = df_test['Survived'] = model.predict(X)

PassengerId = df_test['PassengerId']

submission = pd.DataFrame({ 'PassengerId': PassengerId, 'Survived': predictions })
submission.to_csv("../output/titanic/extra_tree_1_feature.csv", index=False)

print("Our features: %s" % feats)
print("Local Score %.2f" % score)
print("Kaggle Score %.2f" % 0.65)

Our features: ['Pclass']
Local Score 0.68
Kaggle Score 0.65


## Second dummy sample - ASAP to kaggle

In [125]:
feats = df_train.select_dtypes(include=[np.int64]).columns.values
black_list = ['PassengerId', 'Survived'] 

feats = [feat for feat in feats if feat not in black_list]

In [126]:
X = df_train[ feats ].values
y = df_train['Survived'].values

model = ExtraTreesClassifier(n_estimators=100, max_depth=4)
model.fit(X, y)
y_pred = model.predict(X)

score = accuracy_score(y, y_pred)

In [127]:
X = df_test[ feats ].values

predictions = df_test['Survived'] = model.predict(X)

PassengerId = df_test['PassengerId']

submission = pd.DataFrame({ 'PassengerId': PassengerId, 'Survived': predictions })
submission.to_csv("../output/titanic/extra_tree_3_features.csv", index=False)

print("Our features: %s" % feats)
print("Local Score %.2f" % score)
print("Kaggle Score %.2f" % 0.68)

Our features: ['Pclass', 'SibSp', 'Parch']
Local Score 0.71
Kaggle Score 0.68


## Create few helpers

In [128]:
black_list = ['PassengerId', 'Survived']
PassengerId = df_test['PassengerId']


In [129]:
def get_feats(df):
    feats = df.select_dtypes(include=[np.int64]).columns.values
    black_list = ['PassengerId', 'Survived']
    
    return [feat for feat in feats if feat not in black_list]

def train_and_predict():
    feats = get_feats( df_train.select_dtypes(include=[np.int, np.float]).head() )
    print('Our features: %s ' % feats)

    X = df_train[ feats ].values
    y = df_train['Survived'].values

    model = RandomForestClassifier(max_depth=3, n_estimators=100, random_state=1)

    model.fit(X, y)
    y_pred = model.predict(X)
    
    print('Local Score %.2f' %accuracy_score(y, y_pred))
    
    return model
        

def predict_and_submit(model, feats, file_name):
    predictions = df_test[ 'Survived' ] = model.predict( df_test[ feats ].values )

    submission = pd.DataFrame({ 'PassengerId': PassengerId, 'Survived': predictions })
    submission.to_csv( '../output/titanic/' + file_name + '.csv', index=False )

def feature_engineering(df):
    df['name_length'] = df['Name'].apply(len)

    return df

def train_predict_submit(file_name):
    feature_engineering(df_train)
    feature_engineering(df_test)
    
    predict_and_submit( train_and_predict() , get_feats(df_test) , file_name)

In [130]:
train_predict_submit('titanic_73')
print("Kaggle Score %.2f" % 0.69)

Our features: ['Pclass', 'SibSp', 'Parch', 'name_length'] 
Local Score 0.74
Kaggle Score 0.69


In [132]:
def feature_engineering(df):
    df['sex_cat'] = pd.factorize( df['Sex'] )[0]
    df['embarked_cat'] = pd.factorize( df['Embarked'].fillna('C') )[0]
    df['title'] = df['Name'].map(lambda x: x.split(',')[1].split('.')[0].strip())
    rare_titles = ['Mlle','the Countess','Mme', 'Ms', 'Lady', 'Countess', 'Capt','Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'] 
    df['title'].values[df['title'].isin( rare_titles )] = 'Rare'
    df['title_cat'] = df['title'].factorize()[0]
    

In [134]:
train_predict_submit('titanic_82')
print("Kaggle Score %.2f" % 0.77)

Our features: ['Pclass', 'SibSp', 'Parch', 'name_length', 'sex_cat', 'embarked_cat', 'title_cat'] 
Local Score 0.82
Kaggle Score 0.77


## Feature Engineering

## Family

In [47]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title,title_norm,title_cat,title_norm_cat,age_mean,age_median
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,Mr,0,0,32.36809,30.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,Mrs,1,1,35.898148,35.0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,Miss,2,2,21.773973,21.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,Mrs,1,1,35.898148,35.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,Mr,0,0,32.36809,30.0


In [48]:
train['family_size']   = train['SibSp'] + train['Parch'] + 1
train['family_single'] = train['family_size'] == 1
train['family_small']  = (train['family_size'] > 1) & (train['family_size'] < 5)
train['family_big']  = train['family_size'] > 4

train['child'] = train['age_mean'] < 18
train['adult'] = train['age_mean'] >= 18

def is_mother(row):
    if row['Sex'] == 'male': return False
    if row['title_norm_cat'] == 'miss': return False
    if row['age_mean'] < 18: return False
    
    return True

train['is_mother'] = train.apply(is_mother, axis=1)
print("is mother:", train['is_mother'].value_counts())

X, y = get_X_y(['Pclass', 'title_norm_cat', 'SibSp', 'Parch', 'Fare', 'age_mean', 'age_median', 'family_size', 'family_single', 'family_small', 'family_big'])
cross_val_score(DecisionTreeClassifier(max_depth=4, random_state=2018), X, y, scoring='accuracy', cv=3).mean()

is mother: False    577
True     314
Name: is_mother, dtype: int64


0.82379349046015715

In [23]:



## Features
#     df['embarked_cat'] = pd.factorize( df['Embarked'] )[0]
#     df['title'] = df['Name'].map(lambda x: x.split(',')[1].split('.')[0].strip())
#     rare_titles = ['Mlle','the Countess','Mme', 'Ms', 'Lady', 'Countess', 'Capt','Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'] 
#     df['title'].values[df['title'].isin( rare_titles )] = 'Rare'
#     df['title_cat'] = df['title'].factorize()[0]
    
    
    