## background 
- 32% survival
- women,children,upper class more likely survived
- not enough boats

## load utils

In [None]:
import sys
sys.path.append('../')
from util import *
from onehot import LabelBinarizerEx
from pipeline import FeaturePipeline, DataFramePipeline
from binning import Binner
from title import TitleExtractor
from cabin import HasCabin
from ensemble import EnsembleStackClassifierEx
from addcols import AddColumns
from impute import GroupImputer
from alone import IsAlone

## load train data

In [None]:
import pandas as pd
# Column names referenced throughout the notebook.
id_col='PassengerId'
target_col='Survived'

# Raw training set; the label column is also kept on its own as `target`.
passenger_train=pd.read_csv('train.csv')
target=passenger_train[target_col]
total_num=passenger_train.shape[0]

## explore the data

In [None]:
# Explore on a copy so the column edits made below never touch the raw frame.
passengers=passenger_train.copy(deep=True)
passengers.head(5)

In [None]:
# passenger_train.info()

### describe number features

In [None]:
# passenger_train.describe()

### describe category features

In [None]:
# passenger_train.describe(include=['O'])

### correlations

In [None]:
# passenger_train.corr()

### pivot features

In [None]:
# Mean of Survived per Pclass value, highest first.
passengers.groupby('Pclass')[['Survived']].mean().sort_values(by='Survived',ascending=False)

First-class passengers have the highest survival rate.

In [None]:
# Mean of Survived per Sex value, highest first.
passengers.groupby('Sex')[['Survived']].mean().sort_values(by='Survived',ascending=False)

Women survived at a higher rate than men.

In [None]:
# Mean of Survived per SibSp value, highest first.
passengers.groupby('SibSp')[['Survived']].mean().sort_values(by='Survived',ascending=False)

It seems that passengers with one or two siblings/spouses aboard survived more often.

In [None]:
# Mean of Survived per Parch value, highest first.
passengers.groupby('Parch')[['Survived']].mean().sort_values(by='Survived',ascending=False)

In [None]:
# Mean of Survived per Embarked value, highest first.
# Reads the exploration copy `passengers`, like the other pivot cells in this
# section, instead of the raw `passenger_train` frame.
passengers[['Embarked','Survived']].groupby('Embarked').mean().sort_values('Survived',ascending=False)

## feature engineering

### make new features

family size

In [None]:
# FamilySize is the sum of the SibSp and Parch counts.
passengers['FamilySize']=passengers['SibSp'].add(passengers['Parch'])
passengers.groupby('FamilySize')[['Survived']].mean().sort_values(by='Survived',ascending=False)

In [None]:
# BigFamily = FamilySize run through a Binner with a single cut point at 4
# (the binning rule itself lives in the project's `binning` module).
b=Binner(strategy=[4])
family_size_values=passengers[['FamilySize']].values
passengers['BigFamily']=b.transform(family_size_values)
passengers.groupby('BigFamily')[['Survived']].mean().sort_values(by='Survived',ascending=False)

is alone

In [None]:
# IsAlone = 1 - (FamilySize run through a Binner with a single cut point at 1).
b=Binner(strategy=[1])
family_bin=b.transform(passengers[['FamilySize']].values)
passengers['IsAlone']=1-family_bin
passengers.groupby('IsAlone')[['Survived']].mean().sort_values(by='Survived',ascending=False)

### binning age and fare

In [None]:
# Replace Age in the exploration copy with its bin, using cut points 14, 35 and 50.
# The input is read from the untouched `passenger_train` frame (which `passengers`
# was copied from) rather than from `passengers['Age']` itself, so re-running this
# cell does not bin values that were already binned.
b=Binner([14.,35.,50.])
passengers['Age']=b.transform(passenger_train[['Age']].values)

In [None]:
# Mean of Survived per Age value in `passengers`, highest first.
passengers.groupby('Age')[['Survived']].mean().sort_values(by='Survived',ascending=False)

In [None]:
# Replace Fare in the exploration copy with its bin, using cut points 8, 14, 31 and 66.
# The input is read from the untouched `passenger_train` frame (which `passengers`
# was copied from) rather than from `passengers['Fare']` itself, so re-running this
# cell does not bin values that were already binned.
b=Binner([8.,14.,31.,66.])
passengers['Fare']=b.transform(passenger_train[['Fare']].values)
passengers[['Fare','Survived']].groupby('Fare').mean().sort_values('Survived',ascending=False)

### visualize

In [None]:
# %matplotlib inline
# import matplotlib.pyplot as plt
# import seaborn as sns
# g = sns.FacetGrid(passenger_train, col='Survived')
# g.map(plt.hist, 'Age', bins=20)
# plt.show()

In [None]:
# g = sns.FacetGrid(passenger_train, col='Survived')
# g.map(plt.hist, 'Pclass', bins=20)
# plt.show()

### drop useless cols
Ticket has too many duplicate values, so it is dropped.

In [None]:
# Columns kept out of the feature matrix: the row id, the label, and Ticket.
drop_cols=[id_col,target_col]
drop_cols.append('Ticket')

### feature pipelines

In [None]:
# Display the raw column names for reference while wiring up the feature steps.
passenger_train.columns

In [None]:
# One FeaturePipeline per derived feature: (input column(s), output column, steps).
# NOTE(review): the steps are presumably applied in list order -- the 'Title' one-hot
# step reads the column written by the 'Name' step, and 'IsAlone' reads 'FamilySize'.
feature_steps=[
    # Name -> Title, then one-hot encode Title.
    FeaturePipeline('Name','Title',Pipeline([('title',TitleExtractor())])),
    FeaturePipeline('Title','',Pipeline([('onehot',LabelBinarizerEx(['Title']))])),
    # One-hot encode Sex.
    FeaturePipeline('Sex','',Pipeline([('onehot',LabelBinarizerEx(['Sex']))])),
    # Impute Age from (Pclass, Sex, Age), then bin it.
    FeaturePipeline(
        ['Pclass','Sex','Age'],
        'Age_band',
        Pipeline([
            ('impute',GroupImputer(['Pclass','Sex','Age'])),
            ('binning',Binner([14.,35.,50.])),
        ]),
    ),
    # SibSp + Parch -> FamilySize -> IsAlone.
    FeaturePipeline(['SibSp','Parch'],'FamilySize',Pipeline([('addcols',AddColumns())])),
    FeaturePipeline('FamilySize','IsAlone',Pipeline([('alone',IsAlone())])),
    # Bin Fare.
    FeaturePipeline('Fare','Fare_band',Pipeline([('binning',Binner([8.,14.,31.,66.]))])),
    # Cabin -> Cabin_has.
    FeaturePipeline('Cabin','Cabin_has',Pipeline([('has',HasCabin())])),
    # One-hot encode Embarked.
    FeaturePipeline('Embarked','',Pipeline([('onehot',LabelBinarizerEx(['Embarked']))])),
]
full_pipeline=DataFramePipeline(feature_steps)

prepared_passenger_train=full_pipeline.fit_transform(passenger_train)
prepared_passenger_train.head()

In [None]:
# Summarise the transformed frame (columns, dtypes, non-null counts).
prepared_passenger_train.info()

In [None]:
from sklearn.preprocessing import StandardScaler
try:
    # scikit-learn >= 0.20 provides SimpleImputer; the old preprocessing.Imputer
    # was removed in 0.22, so prefer the new class and fall back on old installs.
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

def featuring(df, num_cols=(), cat_cols=(), bin_cols=()):
    """Build one sub-pipeline per column, combine them, and fit-transform ``df``.

    Parameters
    ----------
    df : the frame handed to ``DfPipeline.fit_transform``.
    num_cols : iterable of column names; each column is selected and
        median-imputed.
    cat_cols : iterable whose items are either a column name (selected, then
        encoded with ``LabelBinarizerEx``) or an already-built
        ``(name, pipeline)`` pair, which is passed through unchanged.
    bin_cols : iterable of ``(column name, binning step)`` pairs; each column
        is selected, median-imputed and then handed to the given step.

    Returns
    -------
    (prepared_df, full_pipeline) : the transformed data together with the
        ``DfPipeline`` that produced it.
    """
    num_pipelines=[(c,Pipeline([
        ('select',DataFrameSelecter([c])),
        ('fill',Imputer(strategy='median')),
        # Scaling is intentionally disabled:
        # ('scale',StandardScaler()),
    ])) for c in num_cols]

    cat_pipelines=[]
    for c in cat_cols:
        if isinstance(c,str):
            # Plain column name: build the default select + encode pipeline.
            cat_pipelines.append((c, Pipeline([
                ('select',DataFrameSelecter([c])),
                ('encode',LabelBinarizerEx([c])),
            ])))
        else:
            # Caller supplied a ready-made (name, pipeline) pair.
            cat_pipelines.append(c)

    bin_pipelines=[(name, Pipeline([
        ('select',DataFrameSelecter([name])),
        ('fill',Imputer(strategy='median')),
        ('bin',binner),
    ])) for name,binner in bin_cols]

    full_pipeline=DfPipeline(num_pipelines+cat_pipelines+bin_pipelines)

    prepared_df=full_pipeline.fit_transform(df)

    return prepared_df,full_pipeline

## train

In [None]:
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score

%matplotlib inline
import matplotlib.pyplot as plt
from pylab import *

def trainRawModels(train_data, target):
    """Cross-validate a fixed set of default-configured classifiers and chart the result.

    Each classifier is scored with 10-fold cross-validated accuracy on
    ``train_data`` / ``target``. The mean scores are drawn as a bar chart
    (y-axis limited to 0.5-1.0) and printed as ``(label, score)`` pairs sorted
    from best to worst, where the label is the first three letters of the
    classifier's class name.

    Returns the list of classifier instances that were scored.
    """
    clfs=[LogisticRegression(n_jobs=-1),
          SVC(probability=True),
          RandomForestClassifier(n_jobs=-1),
          GradientBoostingClassifier(),
          AdaBoostClassifier(),
          ExtraTreesClassifier(n_jobs=-1),
          XGBClassifier()]
    scores=[cross_val_score(clf,train_data,target,scoring='accuracy',cv=10,n_jobs=-1).mean() for clf in clfs]

    labels=[c.__class__.__name__[:3] for c in clfs]
    X=np.arange(len(clfs))
    # Pass the colours as an explicit list: the bare string 'rgb' is not a valid
    # colour spec in current Matplotlib, whereas a list is cycled over the bars.
    bar(X,scores,tick_label=labels,color=['r','g','b'])
    ylim(0.5,1.0)
    show()
    print(sorted(zip(labels,scores),key=lambda x:x[1],reverse=True))
    return clfs

## fill missing ages according to class and sex

In [None]:
# Row labels of the passengers whose Age is missing.
missing_index=passenger_train[passenger_train['Age'].isnull()].index

# Median Age per (Pclass, Sex) group. 'Age' is selected *before* aggregating so
# the median is not computed over every other column, which raises on the text
# columns in recent pandas versions.
class_ages=passenger_train.groupby(['Pclass','Sex'])['Age'].median()
class_ages
# passenger_train.loc[missing_index,'Age']
# class_ages[tuple([passenger_train.loc[5,'Pclass'],passenger_train.loc[5,'Sex']])]
# for i in missing_ages.index:
#     passenger_train.loc[i,'Age'] = class_ages[passenger_train.iloc[i]['Pclass'],passenger_train.iloc[i]['Sex']]
# passenger_train['Age'].count()
# help(class_ages)

In [None]:
# Check what kind of object the grouped median is.
type(class_ages)

In [None]:
# Look up the grouped median Age for Pclass 1, Sex 'male'.
class_ages[1,'male']

## binarize category features

In [None]:
# Split the columns by dtype: describe() summarises only the numeric columns by default.
# NOTE(review): `summary` and `num_cols` were not defined by any earlier cell, so this
# cell raised NameError on a fresh kernel. They are reconstructed here; the id and
# target columns are left out of `num_cols` so the label cannot leak into the
# features -- confirm this matches the original intent.
summary=passenger_train.describe()
num_cols=[c for c in summary.columns if c not in (id_col,target_col)]

# Walk the frame's own column order instead of a set difference, so the feature
# order is the same on every run.
excluded_text_cols=('Name','Ticket','Cabin')
cat_cols=[c for c in passenger_train.columns
          if c not in summary.columns and c not in excluded_text_cols]

prepared_passenger_train,full_pipeline = featuring(passenger_train,num_cols,cat_cols)
prepared_passenger_train.head()

In [None]:
# trainRawModels(prepared_passenger_train,target)

## discretize num features

In [None]:
# ages=passenger_train['Age'].quantile(np.linspace(0.1,1,7))
# ages

In [None]:
# fares=passenger_train['Fare'].quantile(np.linspace(0.1,1,5))
# fares

In [None]:
# Columns to discretize, each paired with the Binner holding its cut points.
bin_cols=[('Age',Binner([14.,35.,50.])),
          ('Fare',Binner([8.,14.,31.,66.])),
         ]
# Binned columns must not also go through the plain numeric path. Filtering
# (rather than calling list.remove inside a comprehension) keeps the cell
# re-runnable: a second run no longer raises ValueError for a name that was
# already removed.
binned_names={name for name,_ in bin_cols}
num_cols=[c for c in num_cols if c not in binned_names]

prepared_passenger_train,full_pipeline = featuring(passenger_train,num_cols,cat_cols,bin_cols)
prepared_passenger_train.head()

In [None]:
# trainRawModels(prepared_passenger_train,target)

### extract new features
extract title from name

In [None]:
# passenger_train2=passenger_train.copy()
# passenger_train2['Title'] = passenger_train2['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
# pd.crosstab(passenger_train2['Title'], passenger_train2['Sex'])

In [None]:
# Name -> TitleExtractor -> one-hot encoding, registered as a ready-made
# (name, pipeline) pair that featuring() passes through unchanged.
l=LabelBinarizerEx(['Title'])
title_step=('Name',Pipeline([
        ('select',DataFrameSelecter(['Name'])),
        ('extract',TitleExtractor()),
        ('encode',l),
    ]))
# Drop any 'Name' step left by a previous run of this cell before adding the new
# one, so executing the cell twice does not register the step twice.
cat_cols=[c for c in cat_cols if isinstance(c,str) or c[0]!='Name']
cat_cols.append(title_step)

prepared_passenger_train,full_pipeline = featuring(passenger_train,num_cols,cat_cols,bin_cols)
prepared_passenger_train.head()

In [None]:
# trainRawModels(prepared_passenger_train,target)

## make new features
Combine SibSp and Parch to derive IsAlone.

In [None]:
# passenger_train2= passenger_train.copy()
# passenger_train2['FamilySize']=passenger_train2['SibSp']+passenger_train2['Parch']+1
# passenger_train2['IsAlone'] = (passenger_train2['FamilySize'] == 1).astype(int)
# passenger_train2.groupby('IsAlone').mean()['Survived']

In [None]:
# passenger_train['FamilySize']=passenger_train['SibSp']+passenger_train['Parch']+1
# passenger_train['IsAlone'] = (passenger_train['FamilySize'] == 1).astype(int)

# prepared_passenger_train = featuring(passenger_train,['Pclass','IsAlone'],['Sex','Embarked'],bin_cols)
# prepared_passenger_train.head()

In [None]:
# trainRawModels(prepared_passenger_train,target)

### add feature HasCabin

In [None]:
# Cabin -> HasCabin, registered as a ready-made (name, pipeline) pair that
# featuring() passes through unchanged.
cabin_step=('Cabin',Pipeline([
        ('select',DataFrameSelecter(['Cabin'])),
        ('extract',HasCabin()),
    ]))
# Drop any 'Cabin' step left by a previous run of this cell before adding the new
# one, so executing the cell twice does not register the step twice.
cat_cols=[c for c in cat_cols if isinstance(c,str) or c[0]!='Cabin']
cat_cols.append(cabin_step)

prepared_passenger_train,full_pipeline = featuring(passenger_train,num_cols,cat_cols,bin_cols)
prepared_passenger_train.head()

In [None]:
# clfs=trainRawModels(prepared_passenger_train,target)

### add feature class*age

In [None]:
# prepared_passenger_train['Pclass*Age']=prepared_passenger_train['Pclass']*prepared_passenger_train['Age']
# prepared_passenger_train.head()

In [None]:
# trainRawModels(prepared_passenger_train,target)

In [None]:
# prepared_passenger_train=prepared_passenger_train.drop(['Pclass','Age'],axis=1)
# prepared_passenger_train.head()

In [None]:
# trainRawModels(prepared_passenger_train,target)

In [None]:
# prepared_passenger_train = featuring(passenger_train,num_cols,cat_cols,bin_cols)
# prepared_passenger_train.head()

In [None]:
# trainRawModels(prepared_passenger_train,target)

In [None]:
# prepared_passenger_train.drop('Sex_female',axis=1,inplace=True)

In [None]:
# trainRawModels(prepared_passenger_train,target)

In [None]:
# prepared_passenger_train_surv=prepared_passenger_train.copy()
# prepared_passenger_train_surv['Survived']=target
# prepared_passenger_train_surv.corr()

In [None]:
# passenger_train.corr()

In [None]:
# from sklearn.model_selection import GridSearchCV

# pg={'max_depth':[2,3,4,5,6],'learning_rate':[0.001,0.01,0.05,0.1,0.5],'n_estimators':[50,100,200,300,500]}
# g=GridSearchCV(XGBClassifier(),param_grid=pg,scoring='accuracy',cv=10,n_jobs=-1)
# g.fit(prepared_passenger_train,target)
# print(g.best_score_, g.best_estimator_)

In [None]:
# xgb=XGBClassifier(
#     #learning_rate = 0.02,
#  n_estimators= 2000,
#  max_depth= 4,
#  min_child_weight= 2,
#  #gamma=1,
#  gamma=0.9,                        
#  subsample=0.8,
#  colsample_bytree=0.8,
#  objective= 'binary:logistic',
#  nthread= -1,
#  scale_pos_weight=1)
# scores=cross_val_score(xgb,prepared_passenger_train,target,cv=10,n_jobs=-1,scoring='accuracy')
# scores.mean()

## tuning

In [None]:
# Candidate models for tuning; only SVC is currently enabled.
# Disabled candidates, in their original order:
#   LogisticRegression(n_jobs=-1)
#   RandomForestClassifier(n_jobs=-1)
#   GradientBoostingClassifier()
#   AdaBoostClassifier()
#   ExtraTreesClassifier(n_jobs=-1)
#   XGBClassifier()
clfs=[SVC(probability=True)]

In [None]:
from sklearn.model_selection import GridSearchCV

# One parameter grid per entry of `clfs`, matched by position in the
# (currently disabled) search loop below.
param_grid_set=[
#      {'C':[0.01,0.1,0.5,1.]},
    {
        'C':[0.01,0.1,0.5,1.],
        'kernel':['rbf','poly','sigmoid'],
        'gamma':[0.01,0.1,0.5],
        'degree':[2,3,4],
        'coef0':[0.01,0.1,1.,10.],
    },
#                 {'n_estimators':[50,100,200,300],'max_depth':[5,10,15]},
#                 {'learning_rate':[0.01,0.1,1.0],'n_estimators':[100,200,300],'max_depth':[3,5,8]},
#                 {'learning_rate':[0.01,0.1,1.0],'n_estimators':[100,200,300]},
#                 {'n_estimators':[50,100,200,300],'max_depth':[5,10,15]},
#                 {'learning_rate':[0.01,0.1,1.0],'n_estimators':[100,200,300],'max_depth':[5,10,15],'gamma':[0.01,0.1,0.5]},
]

# Fitted searches and their (best estimator, best score) pairs.
gs,results=[],[]
# for i in range(len(clfs)):
#     gs.append(GridSearchCV(estimator=clfs[i],param_grid=param_grid_set[i],scoring='accuracy',n_jobs=-1,verbose=1))
#     gs[i].fit(prepared_passenger_train,target)
#     results.append((gs[i].best_estimator_,gs[i].best_score_))
sorted(results,key=lambda pair:pair[1],reverse=True)


In [None]:
# Swap in the tuned estimators, but only when a grid search actually ran:
# with `gs` empty the existing `clfs` list is kept instead of being replaced by [].
tuned_clfs=[g.best_estimator_ for g in gs]
if tuned_clfs:
    clfs=tuned_clfs

In [None]:
# help(SVC)

## ensembling

### voting

In [None]:
# from sklearn.ensemble import VotingClassifier
# from sklearn.model_selection import cross_val_score

# voter=VotingClassifier(estimators=[(c.__class__.__name__, c) for c in clfs], voting='soft',n_jobs=-1)
# scores=cross_val_score(voter,prepared_passenger_train,target,cv=10,n_jobs=-1,scoring='accuracy')
# scores.mean()

### stacking

In [None]:
# from brew.base import Ensemble
# from brew.stacking import EnsembleStackClassifier,EnsembleStack
# import sklearn

# layer_1 = Ensemble(clfs)
# layer_2 = Ensemble([sklearn.clone(clfs[0])])

# stack = EnsembleStack(cv=len(clfs))

# stack.add_layer(layer_1)
# stack.add_layer(layer_2)

# sclf = EnsembleStackClassifierEx(stack)
# from sklearn.model_selection import train_test_split
# # X_train,y_train,X_test,y_test=train_test_split(prepared_passenger_train.values,target,test_size=0.3,random_state=0)
# # print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)
# sclf.fit(prepared_passenger_train.values[:600],target[:600])
# sclf.score(prepared_passenger_train.values[600:],target[600:])
# scores=cross_val_score(sclf,prepared_passenger_train.values,target,cv=5,n_jobs=-1)
# scores.mean()

# sclf.fit(prepared_passenger_train.values[:600],target[:600])

## test it

In [None]:
# passenger_test=pd.read_csv('test.csv')
# test_id=passenger_test[id_col]
# cols=drop_cols.copy()
# cols.remove(target_col)
# passenger_test.drop(cols,axis=1,inplace=True)

In [None]:
# prepared_passenger_test=full_pipeline.transform(passenger_test)
# prepared_passenger_test.head()

In [None]:
# clf=gs[0].best_estimator_
# # clf.fit(prepared_passenger_train,target)
# survived=clf.predict(prepared_passenger_test.values)
# test_Survived = pd.Series(survived, name="Survived")
# results = pd.concat([test_id,test_Survived],axis=1)
# results.to_csv("result.csv",index=False)

In [None]:
# help(AdaBoostClassifier)