## background 
- 32% survival rate
- women, children, and upper-class passengers were more likely to survive
- not enough lifeboats

## load utils

In [None]:
import sys
sys.path.append('../')
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from util import *
from onehot import LabelBinarizerEx
from pipeline import FeaturePipeline, DataFramePipeline
from binning import Binner
from title import TitleExtractor
from cabin import HasCabin
from ensemble import EnsembleStackClassifierEx
from addcols import AddColumns
from impute import GroupImputer
from alone import IsAlone

## load train data

In [None]:
import pandas as pd
# Column names reused by the later cells.
id_col = 'PassengerId'
target_col = 'Survived'

# Read the training set, then keep its row count and label column at hand.
passenger_train = pd.read_csv('train.csv')
total_num = passenger_train.shape[0]
target = passenger_train[target_col]

### extract title from name

In [None]:
# Exploratory check of the titles embedded in the Name column (kept for reference):
# passenger_train2=passenger_train.copy()
# passenger_train2['Title'] = passenger_train2['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# pd.crosstab(passenger_train2['Title'], passenger_train2['Sex'])

### feature pipelines

In [None]:
# List the column labels of the loaded training frame; as the only expression
# in its notebook cell, the value is rendered as the cell output.
passenger_train.columns

In [None]:
def _scaled():
    """Return a fresh one-step pipeline wrapping a new StandardScaler."""
    return Pipeline([('scale', StandardScaler())])


def _onehot(column):
    """Return a fresh one-step pipeline wrapping LabelBinarizerEx([column])."""
    return Pipeline([('onehot', LabelBinarizerEx([column]))])


# NOTE(review): each FeaturePipeline appears to take (source column(s),
# result column name, transformer pipeline) -- inferred from 'Title' and
# 'FamilySize' being produced by one entry and consumed by later ones;
# confirm against pipeline.py. The entries are kept in their original order.
feature_steps = [
    FeaturePipeline('Pclass', 'Pclass', _scaled()),
    FeaturePipeline('Name', 'Title', Pipeline([('title', TitleExtractor())])),
    # NOTE(review): the empty result name on the one-hot entries presumably
    # lets the encoder name its own output columns -- verify in onehot.py.
    FeaturePipeline('Title', '', _onehot('Title')),
    FeaturePipeline('Sex', '', _onehot('Sex')),
    FeaturePipeline(
        ['Pclass', 'Sex', 'Age'],
        'Age_band',
        Pipeline([
            ('impute', GroupImputer(['Pclass', 'Sex', 'Age'])),
            ('binning', Binner([14., 35., 50.])),
            ('scale', StandardScaler()),
        ]),
    ),
    FeaturePipeline(
        ['SibSp', 'Parch'],
        'FamilySize',
        Pipeline([('addcols', AddColumns())]),
    ),
    FeaturePipeline('FamilySize', 'BigFamily', Pipeline([('binning', Binner([4]))])),
    FeaturePipeline('FamilySize', 'IsAlone', Pipeline([('alone', IsAlone())])),
    FeaturePipeline('SibSp', 'SibSp', _scaled()),
    FeaturePipeline('Parch', 'Parch', _scaled()),
    FeaturePipeline('FamilySize', 'FamilySize', _scaled()),
    FeaturePipeline(
        'Fare',
        'Fare_band',
        Pipeline([
            ('binning', Binner([8., 14., 31., 66.])),
            ('scale', StandardScaler()),
        ]),
    ),
    FeaturePipeline('Cabin', 'Cabin_has', Pipeline([('has', HasCabin())])),
    FeaturePipeline('Embarked', '', _onehot('Embarked')),
]
full_pipeline = DataFramePipeline(feature_steps)

# Fit every feature step on the training frame and keep the transformed result.
prepared_passenger_train = full_pipeline.fit_transform(passenger_train)
# prepared_passenger_train.head()

In [None]:
# Columns excluded from the model input (label, id and the other columns listed).
dropped_train_cols = [
    'Survived', 'PassengerId', 'Fare', 'Name', 'Ticket',
    'Cabin', 'Embarked', 'Title', 'Sex', 'Age',
]
train_data = prepared_passenger_train.drop(dropped_train_cols, axis=1)
# train_data.describe()

## train

In [None]:
from sklearn.linear_model import LogisticRegression,SGDClassifier,LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Estimators compared by trainModels(); both have random_state pinned to 42.
# The list order matters: a later cell picks an entry by index.
logistic_clf = LogisticRegression(random_state=42)
forest_clf = RandomForestClassifier(random_state=42)
base_clfs = [logistic_clf, forest_clf]

# Other candidates, currently disabled:
#   SVC(probability=True)
#   GradientBoostingClassifier(random_state=0)
#   AdaBoostClassifier()
#   ExtraTreesClassifier(n_jobs=-1)
#   XGBClassifier()
from sklearn.model_selection import cross_val_score

%matplotlib inline
import matplotlib.pyplot as plt
from pylab import *

def trainModels(train_data, target, clfs=None, cv=5):
    """Cross-validate classifiers, plot their mean accuracy and print a ranking.

    Args:
        train_data: feature matrix handed to ``cross_val_score``.
        target: labels handed to ``cross_val_score``.
        clfs: classifiers to evaluate; defaults to the module-level
            ``base_clfs`` list, which is what the original version always used.
        cv: cross-validation setting forwarded to ``cross_val_score``
            (default 5, the value the original hard-coded).

    Returns:
        None. A bar chart is shown and the ``(label, score)`` pairs are
        printed, highest score first.
    """
    if clfs is None:
        clfs = base_clfs

    scores = [
        cross_val_score(clf, train_data, target, scoring='accuracy', cv=cv).mean()
        for clf in clfs
    ]
    # Bars are labelled with the first three letters of each class name.
    labels = [clf.__class__.__name__[:3] for clf in clfs]

    # Colours are given as an explicit list: using the string 'rgb' as a colour
    # sequence was deprecated in Matplotlib 3.2 and newer releases reject it.
    # Calls go through the explicitly imported ``plt`` rather than names that
    # only exist because of ``from pylab import *``.
    plt.bar(range(len(clfs)), scores, tick_label=labels, color=['r', 'g', 'b'])
    plt.show()
    print(sorted(zip(labels, scores), key=lambda pair: pair[1], reverse=True))
    
# Cross-validate the classifiers in base_clfs on the prepared training features.
trainModels(train_data,target)

In [None]:
# Fit the second entry of base_clfs on the whole training set (no hold-out split).
base_clfs[1].fit(train_data,target)

In [None]:
from aml.auto_model_machine import BinaryClassifier

# NOTE(review): BinaryClassifier comes from the project's aml package; how it
# selects or trains models is not visible in this notebook.
bc=BinaryClassifier()
# The features are passed as train_data.values, i.e. without column labels.
bc.fit(train_data.values,target)


In [None]:
# bc.fit_one(4,train_data.values,target)

In [None]:
# Load the test set and remember its ids for the output file.
passenger_test = pd.read_csv('test.csv')
test_id = passenger_test[id_col]

# Reuse the already-fitted pipeline: transform only, no refit on test data.
prepared_passenger_test = full_pipeline.transform(passenger_test)

# Same columns as dropped for training, minus 'Survived', which this list omits.
dropped_test_cols = [
    'PassengerId', 'Fare', 'Name', 'Ticket', 'Cabin',
    'Embarked', 'Title', 'Sex', 'Age',
]
prepared_passenger_test.drop(dropped_test_cols, axis=1, inplace=True)


In [None]:
# Model used for the output file; the commented line selects base_clfs[1] instead.
# clf = base_clfs[1]
clf = bc

# Predict on the bare value array and store the labels as an integer Series.
survived = clf.predict(prepared_passenger_test.values)
test_Survived = pd.Series(survived, name="Survived")
test_Survived = test_Survived.astype(int)

# Two-column frame (ids, predictions), written without the index column.
results = pd.concat([test_id, test_Survived], axis='columns')
results.to_csv("result.csv", index=False)