## background 
- about 32% of passengers survived
- women, children, and upper-class passengers were more likely to survive
- there were not enough lifeboats

## load train data

In [1]:
import pandas as pd

# Column names reused by later cells.
id_col='PassengerId'
target_col='Survived'

# Training frame, its label column, and its row count.
passenger_train=pd.read_csv('train.csv')
target=passenger_train[target_col]
total_num=passenger_train.shape[0]

## explore the data

In [2]:
# passenger_train.head()

In [3]:
# passenger_train.info()

### describe numeric features

In [4]:
# passenger_train.describe()

### describe categorical features

In [5]:
# passenger_train.describe(include=['O'])

### correlations

In [6]:
# passenger_train.corr()

### pivot features

In [7]:
# passenger_train[['Pclass','Survived']].groupby('Pclass').mean().sort_values('Survived',ascending=False)

In [8]:
# passenger_train[['Sex','Survived']].groupby('Sex').mean().sort_values('Survived',ascending=False)

### visualize

In [9]:
# %matplotlib inline
# import matplotlib.pyplot as plt
# import seaborn as sns
# g = sns.FacetGrid(passenger_train, col='Survived')
# g.map(plt.hist, 'Age', bins=20)
# plt.show()

In [10]:
# g = sns.FacetGrid(passenger_train, col='Survived')
# g.map(plt.hist, 'Pclass', bins=20)
# plt.show()

### drop useless cols
Ticket has too many duplicate values to be a useful feature, so it is dropped together with the id and target columns.

In [11]:
# Columns kept out of the feature set: the row id, the label itself, and Ticket.
drop_cols=[id_col,target_col,'Ticket',]

## load utils

In [12]:
import sys
sys.path.append('../')
from util import *
from label_binary import LabelBinarizerEx
from df_pipeline import DfPipeline
from binning import Binner
from title import TitleExtractor
from cabin import HasCabin
from ensemble import EnsembleStackClassifierEx



## feature engineering

In [13]:
from sklearn.preprocessing import Imputer,StandardScaler

def featuring(df, num_cols=[], cat_cols=[], bin_cols=[]):
    """Build one sub-pipeline per feature, fit them all on ``df`` and return the result.

    Parameters
    ----------
    df : input frame handed to ``DfPipeline.fit_transform``.
    num_cols : column names; each column is selected and median-imputed.
    cat_cols : each item is either a column name (selected, then encoded with
        ``LabelBinarizerEx``) or a ready-made ``(name, pipeline)`` pair, which
        is used as given.
    bin_cols : ``(column name, binner)`` pairs; each column is selected,
        median-imputed and then passed to the supplied binner step.

    Returns
    -------
    tuple
        ``(prepared_df, full_pipeline)``: the output of ``fit_transform`` and
        the ``DfPipeline`` it was called on, so the same pipeline can later be
        applied to other data with ``transform``.
    """
    steps=[]

    # Numeric columns: select, then fill gaps with the column median.
    for col in num_cols:
        steps.append((col,Pipeline([
            ('select',DataFrameSelecter([col])),
            ('fill',Imputer(strategy='median')),
            # ('scale',StandardScaler()),
        ])))

    # Categorical columns: plain names get the default select+encode pipeline;
    # anything that is not a string is taken to be a prebuilt (name, pipeline) pair.
    for entry in cat_cols:
        if isinstance(entry,str):
            entry=(entry,Pipeline([
                ('select',DataFrameSelecter([entry])),
                ('encode',LabelBinarizerEx([entry])),
            ]))
        steps.append(entry)

    # Binned columns: select, impute, then apply the caller's binner.
    for entry in bin_cols:
        steps.append((entry[0],Pipeline([
            ('select',DataFrameSelecter([entry[0]])),
            ('fill',Imputer(strategy='median')),
            ('bin',entry[1]),
        ])))

    full_pipeline=DfPipeline(steps)
    prepared_df=full_pipeline.fit_transform(df)

    return prepared_df,full_pipeline

### train with raw models

In [14]:
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score

%matplotlib inline
import matplotlib.pyplot as plt
from pylab import *

def trainRawModels(train_data, target):
    """Cross-validate a fixed set of untuned classifiers and chart their accuracy.

    Each classifier is scored with 10-fold cross-validated accuracy on
    ``train_data``/``target``. The mean scores are drawn as a bar chart and
    printed as ``(label, score)`` pairs, best first.

    Parameters
    ----------
    train_data : feature matrix passed to ``cross_val_score``.
    target : labels passed to ``cross_val_score``.

    Returns
    -------
    list
        The classifier instances that were evaluated, in the order they
        appear on the chart.
    """
    clfs=[LogisticRegression(n_jobs=-1),
          SVC(probability=True),
          RandomForestClassifier(n_jobs=-1),
          GradientBoostingClassifier(),
          AdaBoostClassifier(),
          ExtraTreesClassifier(n_jobs=-1),
          XGBClassifier()]
    scores=[cross_val_score(clf,train_data,target,scoring='accuracy',cv=10,n_jobs=-1).mean() for clf in clfs]

    # Three-letter class-name prefixes keep the tick labels short.
    labels=[c.__class__.__name__[:3] for c in clfs]
    X=np.arange(len(clfs))
    # Explicit colour list: passing the string 'rgb' as a sequence of
    # single-letter colours is deprecated in matplotlib and later rejected.
    # The list is cycled across the bars in the same way.
    bar(X,scores,tick_label=labels,color=['r','g','b'])
    ylim(0.5,1.0)
    show()
    print(sorted(zip(labels,scores),key=lambda x:x[1],reverse=True))
    return clfs

## fill missing ages according to class and sex

In [15]:
# Rows whose Age is missing, captured before the fill (kept for inspection).
missing_ages=passenger_train[passenger_train['Age'].isnull()]
# Median age of each (Pclass, Sex) group. 'Age' is selected before aggregating
# so the median is never attempted on the non-numeric columns.
class_ages=passenger_train.groupby(['Pclass','Sex'])['Age'].median()
# Broadcast each row's group median and use it only where Age is missing.
# This is label-aligned throughout, instead of looking rows up by position
# with index labels.
group_median_age=passenger_train.groupby(['Pclass','Sex'])['Age'].transform('median')
passenger_train['Age']=passenger_train['Age'].fillna(group_median_age)
passenger_train['Age'].count()

891

## just use num features

In [16]:
# describe() summarises the numeric columns by default, so its column index
# is used as the list of numeric features, minus the columns to drop.
summary=passenger_train.describe()
num_cols=[name for name in summary.columns if name not in drop_cols]

prepared_passenger_train,full_pipeline = featuring(passenger_train,num_cols)
prepared_passenger_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3.0,22.0,1.0,0.0,7.25
1,1.0,38.0,1.0,0.0,71.2833
2,3.0,26.0,0.0,0.0,7.925
3,1.0,35.0,1.0,0.0,53.1
4,3.0,35.0,0.0,0.0,8.05


In [17]:
# trainRawModels(prepared_passenger_train,target)

## binarize category features

In [18]:
# Categorical features: every column describe() did not summarise, except the
# free-text ones. The list follows the frame's own column order, so the encoded
# feature layout is the same on every run (a set difference has no defined order).
cat_cols=[name for name in passenger_train.columns
          if name not in summary.columns and name not in ('Name','Ticket','Cabin')]

prepared_passenger_train,full_pipeline = featuring(passenger_train,num_cols,cat_cols)
prepared_passenger_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,3.0,22.0,1.0,0.0,7.25,0.0,1.0,0.0,0.0,1.0
1,1.0,38.0,1.0,0.0,71.2833,1.0,0.0,1.0,0.0,0.0
2,3.0,26.0,0.0,0.0,7.925,1.0,0.0,0.0,0.0,1.0
3,1.0,35.0,1.0,0.0,53.1,1.0,0.0,0.0,0.0,1.0
4,3.0,35.0,0.0,0.0,8.05,0.0,1.0,0.0,0.0,1.0


In [19]:
# trainRawModels(prepared_passenger_train,target)

## discretize num features

In [20]:
# ages=passenger_train['Age'].quantile(np.linspace(0.1,1,7))
# ages

In [21]:
# fares=passenger_train['Fare'].quantile(np.linspace(0.1,1,5))
# fares

In [22]:
# (column, binner) pairs; the lists passed to Binner are the cut points.
bin_cols=[('Age',Binner([14.,35.,50.])),
          ('Fare',Binner([8.,14.,31.,66.])),
         ]
# A binned column must not also go through the plain numeric pipeline.
# Filtering (rather than list.remove) keeps the cell re-runnable: a second run
# finds the names already gone instead of raising ValueError.
binned_names=[pair[0] for pair in bin_cols]
num_cols=[name for name in num_cols if name not in binned_names]

prepared_passenger_train,full_pipeline = featuring(passenger_train,num_cols,cat_cols,bin_cols)
prepared_passenger_train.head()

Unnamed: 0,Pclass,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Age,Fare
0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,2.0,4.0
2,3.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
3,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,3.0
4,3.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,1.0


In [23]:
# trainRawModels(prepared_passenger_train,target)

### extract new features
extract title from name

In [24]:
# passenger_train2=passenger_train.copy()
# passenger_train2['Title'] = passenger_train2['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
# pd.crosstab(passenger_train2['Title'], passenger_train2['Sex'])

In [25]:
l=LabelBinarizerEx(['Title'])
# Drop any 'Name' pipeline left over from an earlier run of this cell, so
# re-running it does not register the title feature twice.
cat_cols=[c for c in cat_cols if not (isinstance(c,tuple) and c[0]=='Name')]
# Custom pipeline: take Name, extract the title from it, then encode the title.
cat_cols.append(('Name',Pipeline([
        ('select',DataFrameSelecter(['Name'])),
        ('extract',TitleExtractor()),
        ('encode',l),
    ])))

prepared_passenger_train,full_pipeline = featuring(passenger_train,num_cols,cat_cols,bin_cols)
prepared_passenger_train.head()

Unnamed: 0,Pclass,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_RareOrNone,Age,Fare
0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,4.0
2,3.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,3.0
4,3.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0


In [26]:
# trainRawModels(prepared_passenger_train,target)

## make new features
Combine SibSp + Parch (+1 for the passenger) into FamilySize, then derive IsAlone from it.

In [27]:
# passenger_train2= passenger_train.copy()
# passenger_train2['FamilySize']=passenger_train2['SibSp']+passenger_train2['Parch']+1
# passenger_train2['IsAlone'] = (passenger_train2['FamilySize'] == 1).astype(int)
# passenger_train2.groupby('IsAlone').mean()['Survived']

In [28]:
# passenger_train['FamilySize']=passenger_train['SibSp']+passenger_train['Parch']+1
# passenger_train['IsAlone'] = (passenger_train['FamilySize'] == 1).astype(int)

# prepared_passenger_train = featuring(passenger_train,['Pclass','IsAlone'],['Sex','Embarked'],bin_cols)
# prepared_passenger_train.head()

In [29]:
# trainRawModels(prepared_passenger_train,target)

### add feature HasCabin

In [30]:
# Drop any 'Cabin' pipeline left over from an earlier run of this cell, so
# re-running it does not register the feature twice.
cat_cols=[c for c in cat_cols if not (isinstance(c,tuple) and c[0]=='Cabin')]
# Custom pipeline: take Cabin and pass it through the HasCabin step.
cat_cols.append(('Cabin',Pipeline([
        ('select',DataFrameSelecter(['Cabin'])),
        ('extract',HasCabin()),
    ])))

prepared_passenger_train,full_pipeline = featuring(passenger_train,num_cols,cat_cols,bin_cols)
prepared_passenger_train.head()

Unnamed: 0,Pclass,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_RareOrNone,Cabin,Age,Fare
0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,4.0
2,3.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,3.0
4,3.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0


In [31]:
# clfs=trainRawModels(prepared_passenger_train,target)

### add feature class*age

In [32]:
# prepared_passenger_train['Pclass*Age']=prepared_passenger_train['Pclass']*prepared_passenger_train['Age']
# prepared_passenger_train.head()

In [33]:
# trainRawModels(prepared_passenger_train,target)

In [34]:
# prepared_passenger_train=prepared_passenger_train.drop(['Pclass','Age'],axis=1)
# prepared_passenger_train.head()

In [35]:
# trainRawModels(prepared_passenger_train,target)

In [36]:
# prepared_passenger_train = featuring(passenger_train,num_cols,cat_cols,bin_cols)
# prepared_passenger_train.head()

In [37]:
# trainRawModels(prepared_passenger_train,target)

In [38]:
# prepared_passenger_train.drop('Sex_female',axis=1,inplace=True)

In [39]:
# trainRawModels(prepared_passenger_train,target)

In [40]:
# prepared_passenger_train_surv=prepared_passenger_train.copy()
# prepared_passenger_train_surv['Survived']=target
# prepared_passenger_train_surv.corr()

In [41]:
# passenger_train.corr()

In [42]:
# from sklearn.model_selection import GridSearchCV

# pg={'max_depth':[2,3,4,5,6],'learning_rate':[0.001,0.01,0.05,0.1,0.5],'n_estimators':[50,100,200,300,500]}
# g=GridSearchCV(XGBClassifier(),param_grid=pg,scoring='accuracy',cv=10,n_jobs=-1)
# g.fit(prepared_passenger_train,target)
# print(g.best_score_, g.best_estimator_)

In [43]:
# xgb=XGBClassifier(
#     #learning_rate = 0.02,
#  n_estimators= 2000,
#  max_depth= 4,
#  min_child_weight= 2,
#  #gamma=1,
#  gamma=0.9,                        
#  subsample=0.8,
#  colsample_bytree=0.8,
#  objective= 'binary:logistic',
#  nthread= -1,
#  scale_pos_weight=1)
# scores=cross_val_score(xgb,prepared_passenger_train,target,cv=10,n_jobs=-1,scoring='accuracy')
# scores.mean()

## tuning

In [44]:
# Estimators to tune. Entries are matched by position with param_grid_set in the
# grid-search cell, so enabling a line here means enabling the grid at the same
# position there. Only SVC is active at the moment.
clfs=[
#       LogisticRegression(n_jobs=-1),
      SVC(probability=True),
#       RandomForestClassifier(n_jobs=-1),
#       GradientBoostingClassifier(),
#       AdaBoostClassifier(), 
#       ExtraTreesClassifier(n_jobs=-1), 
#       XGBClassifier()
]

In [54]:
from sklearn.model_selection import GridSearchCV

# One parameter grid per entry of clfs, in the same order.
param_grid_set=[
#      {'C':[0.01,0.1,0.5,1.]},
                {'C':[0.01,0.1,0.5,1.],'kernel':['rbf','poly','sigmoid'],'gamma':[0.01,0.1,0.5],'degree':[2,3,4],'coef0':[0.01,0.1,1.,10.]},
#                 {'n_estimators':[50,100,200,300],'max_depth':[5,10,15]},
#                 {'learning_rate':[0.01,0.1,1.0],'n_estimators':[100,200,300],'max_depth':[3,5,8]},
#                 {'learning_rate':[0.01,0.1,1.0],'n_estimators':[100,200,300]},
#                 {'n_estimators':[50,100,200,300],'max_depth':[5,10,15]},
#                 {'learning_rate':[0.01,0.1,1.0],'n_estimators':[100,200,300],'max_depth':[5,10,15],'gamma':[0.01,0.1,0.5]},
               ]

# Fail before any search starts if the two lists have drifted apart. Pairing by
# position would otherwise ignore surplus grids or stop part-way on a missing one.
if len(clfs)!=len(param_grid_set):
    raise ValueError('clfs has %d estimators but param_grid_set has %d grids'
                     % (len(clfs),len(param_grid_set)))

gs=[]
results=[]
for estimator,grid in zip(clfs,param_grid_set):
    search=GridSearchCV(estimator=estimator,param_grid=grid,scoring='accuracy',n_jobs=-1,verbose=1)
    search.fit(prepared_passenger_train,target)
    gs.append(search)
    results.append((search.best_estimator_,search.best_score_))
# Best-scoring (estimator, score) pair first.
sorted(results,key=lambda x:x[1],reverse=True)


Fitting 3 folds for each of 432 candidates, totalling 1296 fits


[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 637 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done 1125 tasks      | elapsed:   47.6s
[Parallel(n_jobs=-1)]: Done 1289 out of 1296 | elapsed:  1.0min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 1296 out of 1296 | elapsed:  1.1min finished


[(SVC(C=0.5, cache_size=200, class_weight=None, coef0=1.0,
    decision_function_shape=None, degree=3, gamma=0.1, kernel='poly',
    max_iter=-1, probability=True, random_state=None, shrinking=True,
    tol=0.001, verbose=False), 0.83277216610549942)]

In [46]:
# Replace the untuned estimators with the winner of each grid search.
clfs=[search.best_estimator_ for search in gs]

In [53]:
# Interactive lookup: prints SVC's class documentation. No later cell uses its output.
help(SVC)

Help on class SVC in module sklearn.svm.classes:

class SVC(sklearn.svm.base.BaseSVC)
 |  C-Support Vector Classification.
 |  
 |  The implementation is based on libsvm. The fit time complexity
 |  is more than quadratic with the number of samples which makes it hard
 |  to scale to dataset with more than a couple of 10000 samples.
 |  
 |  The multiclass support is handled according to a one-vs-one scheme.
 |  
 |  For details on the precise mathematical formulation of the provided
 |  kernel functions and how `gamma`, `coef0` and `degree` affect each
 |  other, see the corresponding section in the narrative documentation:
 |  :ref:`svm_kernels`.
 |  
 |  Read more in the :ref:`User Guide <svm_classification>`.
 |  
 |  Parameters
 |  ----------
 |  C : float, optional (default=1.0)
 |      Penalty parameter C of the error term.
 |  
 |  kernel : string, optional (default='rbf')
 |       Specifies the kernel type to be used in the algorithm.
 |       It must be one of 'linear', 'poly

## ensembling

### voting

In [47]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# Soft-voting ensemble over the tuned estimators, each registered under its
# class name, then scored with 10-fold cross-validated accuracy.
voter=VotingClassifier(
    estimators=[(type(est).__name__, est) for est in clfs],
    voting='soft',
    n_jobs=-1,
)
scores=cross_val_score(voter,prepared_passenger_train,target,scoring='accuracy',cv=10,n_jobs=-1)
scores.mean()

0.83052065599818403

### stacking

In [56]:
from brew.base import Ensemble
from brew.stacking import EnsembleStackClassifier,EnsembleStack
import sklearn

# Layer 1 holds the tuned estimators; layer 2 holds a fresh clone of the first one.
layer_1 = Ensemble(clfs)
layer_2 = Ensemble([sklearn.clone(clfs[0])])

# NOTE(review): cv is set to the number of estimators (1 with the current clfs),
# not to a chosen fold count — confirm this is what EnsembleStack expects.
stack = EnsembleStack(cv=len(clfs))

# Layers are added in order: layer_1 first, then layer_2.
stack.add_layer(layer_1)
stack.add_layer(layer_2)

sclf = EnsembleStackClassifierEx(stack)
from sklearn.model_selection import train_test_split
# NOTE(review): if the split below is re-enabled, train_test_split returns
# X_train, X_test, y_train, y_test — the unpacking order written here differs.
# X_train,y_train,X_test,y_test=train_test_split(prepared_passenger_train.values,target,test_size=0.3,random_state=0)
# print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)
# Hold-out check: fit on the first 600 rows, score on the remaining rows.
# The split is by row position, with no shuffling.
sclf.fit(prepared_passenger_train.values[:600],target[:600])
sclf.score(prepared_passenger_train.values[600:],target[600:])
# scores=cross_val_score(sclf,prepared_passenger_train.values,target,cv=5,n_jobs=-1)
# scores.mean()

# sclf.fit(prepared_passenger_train.values[:600],target[:600])

0.8384879725085911

## test it

In [49]:
passenger_test=pd.read_csv('test.csv')
# Keep the ids for the submission file before the id column is dropped.
test_id=passenger_test[id_col]
# Drop the same columns as for training, except the target, which is not
# removed from the test frame.
cols=[name for name in drop_cols if name!=target_col]
passenger_test=passenger_test.drop(cols,axis=1)

In [50]:
# Reuse the pipeline returned by the last featuring() call on the training data.
# Only transform is called here, so nothing is refit on the test set.
prepared_passenger_test=full_pipeline.transform(passenger_test)
prepared_passenger_test.head()

Unnamed: 0,Pclass,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_RareOrNone,Cabin,Age,Fare
0,3.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,3.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0
2,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,1.0
3,3.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
4,3.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0


In [55]:
# Predict with the best estimator from the first (currently the only) grid search.
clf=gs[0].best_estimator_
# clf.fit(prepared_passenger_train,target)
survived=clf.predict(prepared_passenger_test.values)
test_Survived = pd.Series(survived, name="Survived")
# Column-wise concat pairs ids and predictions by index label.
# NOTE(review): assumes test_id still carries the default 0..n-1 index from read_csv — confirm.
# NOTE: this rebinds `results`, which the grid-search cell used for a list of
# (estimator, score) tuples.
results = pd.concat([test_id,test_Survived],axis=1)
# Two-column submission file (id, prediction) without the index column.
results.to_csv("result.csv",index=False)

In [None]:
# Interactive lookup: prints AdaBoostClassifier's class documentation; it does not affect any result.
help(AdaBoostClassifier)