## background 
- Only about 32% of the passengers survived.
- Women, children, and upper-class passengers were more likely to survive.
- There were not enough lifeboats.

## load utils

In [None]:
import sys
sys.path.append('../')
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from util import *
from onehot import LabelBinarizerEx
from pipeline import FeaturePipeline, DataFramePipeline
from binning import Binner
from title import TitleExtractor
from cabin import HasCabin
from ensemble import EnsembleStackClassifierEx
from addcols import AddColumns
from impute import GroupImputer
from alone import IsAlone

## load train data

In [None]:
import pandas as pd
# Load the labelled training set from the notebook's working directory.
passenger_train=pd.read_csv('train.csv')
# Names of the label column and of the row-identifier column.
target_col='Survived'
id_col='PassengerId'
# Keep the label as its own Series; it is passed as the target to the training cells further down.
target=passenger_train[target_col]
total_num=len(passenger_train)  # number of training rows

## explore the data

In [None]:
# Explore on a copy: the cells below add and overwrite columns on `passengers`,
# while `passenger_train` is what the feature pipeline is later fitted on.
passengers=passenger_train.copy()
# passengers.head()

In [None]:
# passenger_train.info()

### describe number features

In [None]:
# passenger_train.describe()

### describe category features

In [None]:
# passenger_train.describe(include=['O'])

### correlations

In [None]:
# passenger_train.corr()

### pivot features

In [None]:
# passengers[['Pclass','Survived']].groupby('Pclass').mean().sort_values('Survived',ascending=False)

Passengers in class 1 were more likely to survive.

In [None]:
# passengers[['Sex','Survived']].groupby('Sex').mean().sort_values('Survived',ascending=False)

Female passengers survived at a higher rate than male passengers.

In [None]:
# passengers[['SibSp','Survived']].groupby('SibSp').mean().sort_values('Survived',ascending=False)

It seems that passengers with one or two siblings/spouses aboard (SibSp) were more likely to survive.

In [None]:
# passengers[['Parch','Survived']].groupby('Parch').mean().sort_values('Survived',ascending=False)

In [None]:
# passenger_train[['Embarked','Survived']].groupby('Embarked').mean().sort_values('Survived',ascending=False)

## feature engineering

### extract title from name

In [None]:
# passenger_train2=passenger_train.copy()
# passenger_train2['Title'] = passenger_train2['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
# pd.crosstab(passenger_train2['Title'], passenger_train2['Sex'])

### family size

In [None]:
# FamilySize is the sum of the SibSp and Parch columns (the passenger is not added in).
passengers['FamilySize']=passengers['SibSp']+passengers['Parch']
# passengers[['FamilySize','Survived']].groupby('FamilySize').mean().sort_values('Survived',ascending=False)

In [None]:
# Bin FamilySize with a single cut point at 4 and store the bin as BigFamily.
# NOTE(review): which side of the cut a FamilySize of exactly 4 lands on depends on Binner — confirm.
b=Binner(strategy=[4])
passengers['BigFamily']=b.transform(passengers[['FamilySize']].values)
# passengers[['BigFamily','Survived']].groupby('BigFamily').mean().sort_values('Survived',ascending=False)

Derive an `IsAlone` flag from `FamilySize`.

In [None]:
# Bin FamilySize with a cut point at 1, then invert the result (1 - bin) to form IsAlone.
# NOTE(review): presumably the bin is 0 when FamilySize is 0, making IsAlone 1 — confirm against Binner.
b=Binner(strategy=[1])
passengers['IsAlone']=1-b.transform(passengers[['FamilySize']].values)
# passengers[['IsAlone','Survived']].groupby('IsAlone').mean().sort_values('Survived',ascending=False)

### binning age and fare

In [None]:
# ages=passenger_train['Age'].quantile(np.linspace(0.1,1,7))
# ages

In [None]:
# fares=passenger_train['Fare'].quantile(np.linspace(0.1,1,5))
# fares

In [None]:
# Replace the raw Age column with its bin, using cut points 14, 35 and 50.
# Caution: this overwrites passengers['Age'], so re-running the cell feeds the
# already-binned values back into the Binner.
# NOTE(review): how Binner treats missing ages is not visible here — confirm.
b=Binner([14.,35.,50.])
passengers['Age']=b.transform(passengers[['Age']].values)

In [None]:
# passengers[['Age','Survived']].groupby('Age').mean().sort_values('Survived',ascending=False)

In [None]:
# Replace the raw Fare column with its bin, using cut points 8, 14, 31 and 66.
# Caution: this overwrites passengers['Fare'], so re-running the cell feeds the
# already-binned values back into the Binner.
b=Binner([8.,14.,31.,66.])
passengers['Fare']=b.transform(passengers[['Fare']].values)
# passengers[['Fare','Survived']].groupby('Fare').mean().sort_values('Survived',ascending=False)

### visualize

In [None]:
# %matplotlib inline
# import matplotlib.pyplot as plt
# import seaborn as sns
# g = sns.FacetGrid(passenger_train, col='Survived')
# g.map(plt.hist, 'Age', bins=20)
# plt.show()

In [None]:
# g = sns.FacetGrid(passenger_train, col='Survived')
# g.map(plt.hist, 'Pclass', bins=20)
# plt.show()

In [None]:
# s=StandardScaler()
# s.fit_transform([0,1,2,3,4,6])

### feature pipelines

In [None]:
# Show the raw column names that the feature pipelines below select from.
passenger_train.columns

In [None]:
# Feature-engineering pipeline for the raw passenger frame.
# NOTE(review): FeaturePipeline appears to take (input column(s), output column name,
# transformer pipeline); the one-hot steps pass an empty output name, presumably because
# the encoder names its own columns (e.g. Embarked_Q, Title_RareOrNone are dropped from
# the feature set later in this notebook) — confirm in pipeline.py / onehot.py.
# NOTE(review): `Pipeline` is not imported explicitly in this notebook; presumably it
# arrives through `from util import *` — confirm.
# NOTE(review): the steps look order-dependent (Title is derived before it is encoded,
# FamilySize is created before BigFamily/IsAlone and only scaled afterwards) — this
# assumes DataFramePipeline applies them sequentially; confirm.
full_pipeline=DataFramePipeline([
    # Standardise passenger class in place.
    FeaturePipeline('Pclass','Pclass',Pipeline([('scale',StandardScaler())])),
    # Derive Title from Name, then one-hot encode Title and Sex.
    FeaturePipeline('Name','Title',Pipeline([('title',TitleExtractor())])),
    FeaturePipeline('Title','',Pipeline([('onehot',LabelBinarizerEx(['Title']))])),
    FeaturePipeline('Sex','',Pipeline([('onehot',LabelBinarizerEx(['Sex']))])),
    # Age_band: impute via GroupImputer over Pclass/Sex/Age, bin at 14/35/50, then standardise.
    FeaturePipeline(['Pclass','Sex','Age'],'Age_band',Pipeline([('impute',GroupImputer(['Pclass','Sex','Age'])),
                                                                ('binning',Binner([14.,35.,50.])),
                                                                ('scale',StandardScaler()),
                                              ])),
    # FamilySize from SibSp and Parch, followed by the two features derived from it.
    FeaturePipeline(['SibSp','Parch'],'FamilySize',Pipeline([('addcols',AddColumns())])),
    FeaturePipeline('FamilySize','BigFamily',Pipeline([('binning',Binner([4]))])),
    FeaturePipeline('FamilySize','IsAlone',Pipeline([('alone',IsAlone())])),
    # Standardise the count features in place.
    FeaturePipeline('SibSp','SibSp',Pipeline([('scale',StandardScaler())])),
    FeaturePipeline('Parch','Parch',Pipeline([('scale',StandardScaler())])),
    FeaturePipeline('FamilySize','FamilySize',Pipeline([('scale',StandardScaler())])),
    # Fare_band: bin at 8/14/31/66, then standardise.
    FeaturePipeline('Fare','Fare_band',Pipeline([('binning',Binner([8.,14.,31.,66.])),
                                                 ('scale',StandardScaler()),])),
    # Cabin_has comes from HasCabin; Embarked is one-hot encoded.
    FeaturePipeline('Cabin','Cabin_has',Pipeline([('has',HasCabin())])),
    FeaturePipeline('Embarked','',Pipeline([('onehot',LabelBinarizerEx(['Embarked']))])),
])

# Fit every step on the training frame and keep the transformed result.
prepared_passenger_train=full_pipeline.fit_transform(passenger_train)
# prepared_passenger_train.head()

In [None]:
# Drop the label, the identifier and the raw/intermediate columns so that only
# the model inputs remain.
non_feature_cols=[
    'Survived','PassengerId',
    'Fare','Name','Ticket','Cabin','Embarked','Title','Sex','Age',
]
train_data=prepared_passenger_train.drop(non_feature_cols,axis=1)
# train_data.describe()

## train

In [None]:
from sklearn.linear_model import LogisticRegression,SGDClassifier,LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Baseline classifiers compared (and later tuned and stacked) in this notebook.
# Every estimator gets a fixed random_state so repeated runs give the same scores.
# SVC needs one too: with probability=True it shuffles data internally when
# calibrating its probability estimates.
base_clfs=[
      LogisticRegression(random_state=0),
      SVC(probability=True,random_state=0),
      RandomForestClassifier(random_state=0),
      GradientBoostingClassifier(random_state=0),
#       AdaBoostClassifier(),
#       ExtraTreesClassifier(n_jobs=-1),
#       XGBClassifier(),
]
from sklearn.model_selection import cross_val_score

%matplotlib inline
import matplotlib.pyplot as plt
from pylab import *

def trainModels(train_data, target, clfs=None):
    """Cross-validate each classifier, plot the mean accuracies and print a ranking.

    Parameters
    ----------
    train_data : feature matrix handed to ``cross_val_score``.
    target : labels aligned with ``train_data``.
    clfs : optional list of estimators to evaluate; defaults to the
        notebook-level ``base_clfs``.

    Shows a bar chart with one bar per classifier and prints the
    ``(label, mean accuracy)`` pairs sorted best first. Returns ``None``.
    """
    if clfs is None:
        clfs=base_clfs
    scores=[cross_val_score(clf,train_data,target,scoring='accuracy',cv=5).mean() for clf in clfs]

    # The first three letters of the class name are used as the axis tick labels.
    labels=[c.__class__.__name__[:3] for c in clfs]
    positions=list(range(len(clfs)))
    # Colours are passed as an explicit list: a multi-letter string such as 'rgb'
    # is not a valid colour spec in recent matplotlib releases.
    plt.bar(positions,scores,tick_label=labels,color=['r','g','b'])
    plt.show()
    print(sorted(zip(labels,scores),key=lambda x:x[1],reverse=True))
    
# Baseline: 5-fold cross-validated accuracy of the untuned classifiers on the engineered features.
trainModels(train_data,target)

In [None]:
from aml.auto_model_machine import AutoClassifier

# Run the project's AutoClassifier on the same features and labels.
# NOTE(review): fit_score() presumably reports the score obtained during fit — confirm in aml.auto_model_machine.
ac=AutoClassifier()
ac.fit(train_data,target)
ac.fit_score()

### drop some features

In [None]:
# Reduced feature set: the same engineered frame without the columns listed here.
pruned_cols=[
    'Embarked_Q','FamilySize','Title_RareOrNone',
    'Age_band','SibSp','Parch',
]
train_data2=train_data.drop(pruned_cols,axis=1)
# trainModels(train_data2,target)

## tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grids, one per classifier. tuneModels pairs them with
# base_clfs by position, so the order here must follow the order of base_clfs.
param_grid_set=[
    # 0: LogisticRegression
    dict(C=[0.01,0.1,0.5,1.]),
    # 1: SVC
    dict(
        C=[1.,10.],
        kernel=['rbf','poly'],
        gamma=[0.01,0.1,1.],
        coef0=[1.,10.],
    ),
    # 2: RandomForestClassifier
    dict(
        n_estimators=[50,100,200,300],
        max_depth=[5,10,15],
    ),
    # 3: GradientBoostingClassifier
    dict(
        learning_rate=[0.01,0.1,1.0],
        n_estimators=[100,200,300],
        max_depth=[3,5,8],
    ),
    # Disabled grids (presumably for the commented-out AdaBoost / ExtraTrees entries in base_clfs):
    # {'learning_rate':[0.01,0.1,1.0],'n_estimators':[100,200,300]},
    # {'n_estimators':[50,100,200,300],'max_depth':[5,10,15]},
    # 4: not paired with any active classifier (presumably meant for XGBClassifier)
    dict(
        learning_rate=[0.01,0.1,1.0],
        n_estimators=[100,200,300],
        max_depth=[5,10,15],
        gamma=[0.01,0.1,0.5],
    ),
]

def tuneModels(train_data,target,clfs=None,param_grids=None):
    """Grid-search each classifier against the parameter grid at the same position.

    Parameters
    ----------
    train_data : feature matrix handed to ``GridSearchCV.fit``.
    target : labels aligned with ``train_data``.
    clfs : optional list of estimators; defaults to the notebook-level ``base_clfs``.
    param_grids : optional list of parameter grids, one per estimator and in the
        same order; defaults to the notebook-level ``param_grid_set``. Extra
        trailing grids are ignored.

    Returns
    -------
    list of ``(best_estimator, best_score)`` tuples in classifier order. The same
    pairs, sorted best first, are also printed.

    Raises
    ------
    ValueError
        If there are more classifiers than parameter grids.
    """
    if clfs is None:
        clfs=base_clfs
    if param_grids is None:
        param_grids=param_grid_set
    # Check before any (slow) search starts rather than failing on a missing grid half-way through.
    if len(clfs)>len(param_grids):
        raise ValueError('tuneModels needs one parameter grid per classifier: '
                         'got %d classifiers but only %d grids'%(len(clfs),len(param_grids)))

    results=[]
    for clf,grid in zip(clfs,param_grids):
        gs=GridSearchCV(estimator=clf,param_grid=grid,scoring='accuracy',n_jobs=-1,verbose=1,cv=5)
        gs.fit(train_data,target)
        results.append((gs.best_estimator_,gs.best_score_))
    print(sorted(results,key=lambda x:x[1],reverse=True))
    return results


In [None]:
# results=tuneModels(train_data,target)
# best_clfs=[r[0] for r in results]

In [None]:
# tuneModels(train_data2,target)

## ensembling

### voting

In [None]:
# from sklearn.ensemble import VotingClassifier
# from sklearn.model_selection import cross_val_score

# voter=VotingClassifier(estimators=[(c.__class__.__name__, c) for c in clfs], voting='soft',n_jobs=-1)
# scores=cross_val_score(voter,prepared_passenger_train,target,cv=10,n_jobs=-1,scoring='accuracy')
# scores.mean()

### stacking

In [None]:
# Classifiers used for stacking: the first four entries of base_clfs.
clfs=base_clfs[:4]

In [None]:
from brew.base import Ensemble
from brew.stacking import EnsembleStackClassifier,EnsembleStack
import sklearn

# Two-layer stack: layer 1 holds every classifier in clfs, layer 2 holds a
# clone of clfs[0] (same parameters, not the same object).
layer_1 = Ensemble(clfs)
layer_2 = Ensemble([sklearn.clone(clfs[0])])

# cv evaluates to len(clfs[:3]), i.e. 3 whenever clfs has at least three entries.
# NOTE(review): unclear why the fold count is tied to the classifier list — confirm intent.
stack = EnsembleStack(cv=len(clfs[:3]))

stack.add_layer(layer_1)
stack.add_layer(layer_2)

# sclf = EnsembleStackClassifierEx(stack)
# sclf.fit(train_data.values[:700],target[:700])
# sclf.score(train_data.values[700:],target[700:])

## test it

In [None]:
# passenger_test=pd.read_csv('test.csv')
# test_id=passenger_test[id_col]
# prepared_passenger_test=full_pipeline.transform(passenger_test)
# prepared_passenger_test.drop(['PassengerId','Fare','Name','Ticket','Cabin','Embarked','Title','Sex','Age'],axis=1,inplace=True)
# prepared_passenger_test.head()

In [None]:
# clf=sclf
# # clf.fit(prepared_passenger_train,target)
# survived=clf.predict(prepared_passenger_test.values)
# test_Survived = pd.Series(survived, name="Survived").astype(int)
# results = pd.concat([test_id,test_Survived],axis=1)
# results.to_csv("result.csv",index=False)

In [None]:
# help(AdaBoostClassifier)