## background 
- 32% of passengers survived
- women, children, and upper-class passengers were more likely to survive
- there were not enough lifeboats

## load utils

In [None]:
import sys
sys.path.append('../')
from sklearn.preprocessing import StandardScaler,MinMaxScaler,Imputer
from util import *
from onehot import LabelBinarizerEx
from pipeline import FeaturePipeline, DataFramePipeline
from binning import Binner
from title import TitleExtractor
from cabin import HasCabin
from ensemble import EnsembleStackClassifierEx
from addcols import AddColumns
from impute import GroupImputer,MixImputer
from alone import IsAlone
from scipy.stats import boxcox
import pandas as pd



## load train and test data

In [None]:
# Load the raw train and test CSVs (paths are relative to the working directory).
passenger_train=pd.read_csv('raw_data/train.csv')
passenger_test=pd.read_csv('raw_data/test.csv')
# Names of the label column and of the per-row identifier column.
target_col='Survived'
id_col='PassengerId'
# Labels of the training rows, kept as a separate Series.
target=passenger_train[target_col]
# Number of training rows.
total_num=len(passenger_train)

In [None]:
passenger_train.head()

In [None]:
passenger_train.describe()

In [None]:
passenger_test.describe()

In [None]:
passenger_train.describe(include=['O'])

In [None]:
passenger_test.describe(include=['O'])

In [None]:
# Stack train (without the label column) on top of test so the feature
# transforms that follow are applied to both sets at once; the id column is dropped.
raw_df=pd.concat([passenger_train.drop(target_col,axis=1),passenger_test]).drop(id_col,axis=1)
# concat keeps each frame's own row labels, so relabel the rows 0..n-1 to make them unique.
raw_df.index=np.arange(len(raw_df))
# Column set of the combined frame before any engineered columns are added.
org_cols=raw_df.columns

# Impute missing ages with the mean age of the passenger's (Pclass, Sex) group.
# 'Age' is selected *before* aggregating: calling .mean() on the whole grouped
# frame also tries to average the text columns (Name, Ticket, ...), which
# raises a TypeError on pandas >= 2.0.
age_group=raw_df.groupby(['Pclass','Sex'])['Age'].mean()
# Row labels whose Age is missing (kept for inspection; computed before the fill).
missing_index=raw_df[raw_df.Age.isnull()].index
# Vectorised fill: transform('mean') broadcasts each group's mean back onto its
# own rows, and fillna only touches the rows where Age is missing.
raw_df['Age']=raw_df['Age'].fillna(raw_df.groupby(['Pclass','Sex'])['Age'].transform('mean'))
    
# Fill missing fares with the column mean (NaNs are skipped when computing it).
# Done with pandas directly: sklearn's Imputer class was removed in
# scikit-learn 0.22, and fillna keeps the column a 1-D Series instead of
# assigning an (n, 1) array to it.
raw_df['Fare']=raw_df['Fare'].fillna(raw_df['Fare'].mean())

# FamilySize is SibSp + Parch; the passenger themself is not added to the count.
raw_df['FamilySize']=raw_df.SibSp+raw_df.Parch

# 1 when FamilySize is 0, otherwise 0.
raw_df['IsAlone']=(raw_df.FamilySize==0).astype('int')

def countCabin(cabin):
    """Return how many whitespace-separated cabin codes ``cabin`` contains.

    Args:
        cabin: The raw ``Cabin`` value, e.g. ``"C23 C25 C27"``, or a missing
            marker (``np.nan``, any other float NaN, or ``None``).

    Returns:
        The number of cabin codes, or 0 when the value is not a string.
    """
    # Test the type rather than identity with np.nan: a NaN produced elsewhere
    # (float('nan'), a NumPy scalar) or None is not the np.nan object and
    # would otherwise reach .split() and raise AttributeError.
    if not isinstance(cabin, str):
        return 0
    return len(cabin.split())

# Number of cabin codes listed for each passenger, as computed by countCabin.
raw_df['Cabins']=raw_df.Cabin.apply(countCabin)

# Replace each listed numeric column with a Box-Cox-transformed, standardised
# '<col>_Norm' column.
for c in ['Pclass','Age','Fare','SibSp','Parch','FamilySize','Cabins']:
    new_c=c+'_Norm'
    raw_df[new_c]=raw_df[c]
    # Box-Cox requires strictly positive input, so columns whose minimum is
    # <= 0 are shifted up until the minimum becomes 0.1.
    if raw_df[new_c].min()<=0.:
        raw_df[new_c]=raw_df[new_c]+abs(raw_df[new_c].min())+0.1
    # boxcox returns (transformed values, fitted lambda); the lambda is discarded.
    tranformed,_=boxcox(raw_df[new_c])
    # StandardScaler expects a 2-D array, hence the reshape to a single column.
    raw_df[new_c]=StandardScaler().fit_transform(tranformed.reshape(-1,1))
    # The raw column is removed once its normalised version exists.
    raw_df.drop(c,axis=1,inplace=True)
import re

# def extractTitle(name):
#     m = re.search(' \w+\\.',name)
#     if m:
#         return m.group()[1:-1]
#     else:
#         return np.nan
    
def extractTitle(name):
    """Extract a common honorific from a passenger name.

    The title is the first run of word characters that follows a space and is
    terminated by a period, e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``.

    Args:
        name: The passenger's full name string.

    Returns:
        ``'Mr'``, ``'Miss'``, ``'Mrs'`` or ``'Master'`` when one of them is
        found; ``np.nan`` for any other title or when no title matches.
    """
    # Raw string: the previous non-raw literal relied on the invalid escape
    # '\w', which emits a DeprecationWarning (a SyntaxWarning on Python 3.12+).
    m = re.search(r' \w+\.', name)
    if m:
        # Strip the leading space and the trailing period from the match.
        t = m.group()[1:-1]
        if t in ('Mr', 'Miss', 'Mrs', 'Master'):
            return t
    return np.nan
    
raw_df['Title']=raw_df.Name.apply(extractTitle)

# def extractTicketNumber(ticket):
#     try:
#         return float(ticket)
#     except:
#         splits=ticket.split()
#         if len(splits)>1:
#             return float(splits[1])
#         else:
#             return np.nan
        
# raw_df['Ticket_Number']=raw_df.Ticket.apply(extractTicketNumber)

# def extractTicketLocation(ticket):
#     m = re.search('\w+ ',ticket)
#     if m:
#         return m.group()[0]
#     else:
#         return np.nan

# raw_df['Ticket_Location']=raw_df.Ticket.apply(extractTicketLocation)

def isTicketNum(ticket):
    """Return 1 if ``ticket`` can be converted with ``int()``, else 0.

    Args:
        ticket: The raw ticket value, normally a string such as ``"113803"``
            or ``"A/5 21171"``; missing values (``None``/NaN) are accepted.

    Returns:
        1 when ``int(ticket)`` succeeds, 0 when it fails.
    """
    try:
        int(ticket)
    # Only the errors int() raises for unconvertible input are caught:
    # ValueError (non-numeric text, NaN), TypeError (None and other
    # unsupported types) and OverflowError (infinity).
    except (TypeError, ValueError, OverflowError):
        return 0
    return 1
    
# raw_df['IsTicketNum']=raw_df.Ticket.apply(isTicketNum)

# 1 when a Cabin value is present, 0 when it is missing.
raw_df['HasCabin']=raw_df.Cabin.notnull().astype('int')

# Imputed copy of Embarked produced by the project's MixImputer
# (presumably fills the missing ports -- confirm against impute.py).
raw_df['Embarked_Imp']=MixImputer().fit_transform(raw_df[['Embarked']])

# One-hot encode the categorical columns.
onehot_cols=['Sex','Title','Embarked_Imp']
raw_df=pd.get_dummies(raw_df,columns=onehot_cols)

# Drop the raw columns that are not fed to the models.
unused_cols=['Name','Ticket','Cabin','Embarked']
pre_df=raw_df.drop(unused_cols,axis='columns')
pre_df.columns

In [None]:
# Split the combined frame back apart: the first len(passenger_train) rows
# are the training rows, the remainder are the test rows.
n_train=len(passenger_train)
train_data=pre_df.iloc[:n_train]
test_data=pre_df.iloc[n_train:]
# Sanity check displayed as the cell output.
len(train_data)==n_train

In [None]:
train_data.describe()

In [None]:
# Copy of the training features with the label attached, used only to
# inspect how each feature correlates with survival.
tmp=train_data.assign(Survived=target)
survival_corr=tmp.corr()
survival_corr['Survived'].sort_values(ascending=False)

## train

In [None]:
# Dict handed to the hill-climbing search in the next cell
# (presumably filled with its results -- confirm against aml).
hill_result={}
# importlib.reload replaces imp.reload: the imp module is deprecated and was
# removed in Python 3.12.
from importlib import reload
from aml import auto_model_machine as aml

In [None]:
# NOTE(review): TASK_NAME is also used as the name of the submission CSV; the
# label says 'lr-rf' although 'gbt' is searched as well -- confirm intended.
TASK_NAME='hill-lr-rf'
logger,handlers=initLogging(TASK_NAME)

try:
    # Re-import the module so edits to it are picked up without restarting the kernel.
    aml=reload(aml)
    bc=aml.BinaryClassifier.hillclimbing(train_data,target,('lr','rf','gbt'),hill_result,logger=logger,cv=5)
finally:
    # Reset logging even when the search raises, so a failed run does not
    # leave the handlers from initLogging attached.
    resetLogging(logger,handlers)

In [None]:
resetLogging(logger,handlers)

In [None]:
bc.select(5)

In [None]:
from sklearn.linear_model import LogisticRegression,SGDClassifier,LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Classifiers that trainModels cross-validates and plots. Active entries are
# the object returned by the hill-climbing search and the model it returns for
# index 0; the commented-out entries are alternatives kept for experimentation.
base_clfs=[
    bc,
    bc.get_nth_model(0),
#     bc.get_nth_model(1),
#     SVC(probability=True,random_state=42,C=10.0,gamma=0.01,coef0=1.,degree=3,kernel='poly'),
#     GradientBoostingClassifier(learning_rate=0.01,max_depth=5,max_features=None, n_estimators=100)
#     LogisticRegression(random_state=42),
#       SVC(probability=True,random_state=42),
#       RandomForestClassifier(random_state=42),
#       GradientBoostingClassifier(random_state=42),
#       AdaBoostClassifier(), 
#       ExtraTreesClassifier(n_jobs=-1), 
#       XGBClassifier(),
]
from sklearn.model_selection import cross_val_score

%matplotlib inline
import matplotlib.pyplot as plt
from pylab import *

def trainModels(train_data, target):
    """Cross-validate every classifier in ``base_clfs`` and plot the results.

    Each classifier is scored with 4-fold cross-validated accuracy. The mean
    scores are drawn as a bar chart and printed in descending order.

    Args:
        train_data: Feature matrix passed to ``cross_val_score``.
        target: Labels aligned with ``train_data``.

    Returns:
        None. The chart is shown and the sorted ``(label, score)`` pairs are
        printed.
    """
    scores=[cross_val_score(clf,train_data,target,scoring='accuracy',cv=4).mean() for clf in base_clfs]

    # Bars are labelled with the first three letters of each class name.
    labels=[c.__class__.__name__[:3] for c in base_clfs]
    X=np.arange(len(base_clfs))
    # Explicit list of colours: passing the string 'rgb' as a colour sequence
    # is no longer accepted by current Matplotlib releases.
    bar(X,scores,tick_label=labels,color=['r','g','b'])
    show()
    print(sorted(zip(labels,scores),key=lambda x:x[1],reverse=True))
    
trainModels(train_data,target)

In [None]:
bc.get_nth_model(0)
# base_clfs[2]

In [None]:
# Commented-out lines below are alternative models tried earlier.
# TASK_NAME='gbt-001-3-None-1000-08'
# clf=GradientBoostingClassifier(learning_rate=0.01,max_depth=5,)
# clf.fit(train_data,target)
# bc.select(0)
# Model used for the submission: the object returned by the hill-climbing search.
clf=bc
# clf=base_clfs[2]
# clf=bc.get_nth_model(0)
# clf=GradientBoostingClassifier(learning_rate=0.1,max_depth=3,max_features=None,n_estimators=500,subsample=0.8)
# clf=GradientBoostingClassifier(n_estimators=500)
# Fit on all training rows.
clf.fit(train_data,target)


In [None]:
import os

# bc.fit_one(2,train_data.values,target)
# Ids of the test passengers, paired with the predictions below.
test_id=passenger_test[id_col]
# clf=base_clfs[0]
survived=clf.predict(test_data)
test_Survived = pd.Series(survived, name="Survived").astype(int)
# Both Series carry the default 0..n-1 index, so the column-wise concat pairs
# each id with its prediction.
results = pd.concat([test_id,test_Survived],axis=1)
# Create the output directory first: to_csv does not create missing parent
# directories and would otherwise fail after the model has been trained.
os.makedirs("output",exist_ok=True)
results.to_csv("output/%s.csv"%TASK_NAME,index=False)