## load train data

In [None]:
import pandas as pd
# Column names used throughout the notebook.
id_col='PassengerId'
target_col='Survived'

# Read the training set, then keep the label column and the row count
# aside before any feature columns are removed from the frame.
passenger_train=pd.read_csv('train.csv')
total_num=len(passenger_train.index)
target=passenger_train[target_col]

## drop mostly-missing, id, name/ticket and target cols

In [None]:
# Non-null count per column; a column is discarded when fewer than half of
# the rows have a value for it.
c=passenger_train.count()
mostly_missing=c*2<len(passenger_train)
drop_cols=c[mostly_missing].index.tolist()
# The id column, the target column, 'Name' and 'Ticket' are removed from
# the feature frame as well.
drop_cols.extend([id_col,target_col,'Name','Ticket'])
passenger_train.drop(drop_cols,axis='columns',inplace=True)

## explore the data

In [None]:
%matplotlib inline
from matplotlib import pyplot

# Histogram of the known ages (rows with a missing Age are left out).
age_values=passenger_train['Age'].dropna()
pyplot.hist(age_values)
pyplot.show()

In [None]:
%matplotlib inline
from matplotlib import pyplot

# Histogram of fares strictly below 200; rows at or above that value are
# excluded from the plot.
fare_is_below_200=passenger_train['Fare']<200
fares_below_200=passenger_train[fare_is_below_200]['Fare'].dropna()
pyplot.hist(fares_below_200)
pyplot.show()

## feature engineering

In [None]:
import sys
# Put the parent directory on the module search path so the modules imported
# below can be resolved (presumably they live there — TODO confirm).
sys.path.append('../')
# NOTE(review): the following cells use `np`, `Pipeline` and
# `DataFrameSelecter` without importing them in any visible cell; presumably
# they arrive through this star import — confirm and import them explicitly.
from util import *
from label_binary import LabelBinarizerEx
from df_pipeline import DfPipeline

In [None]:
# Toy frame showing that describe() summarises only the numeric column of a
# mixed-dtype frame (the following cell relies on that to find the numeric
# columns). It is bound to its own name: assigning it to `passenger_train`
# would replace the real training data with these three rows, so the
# pipeline and the cross-validation below would run on the toy frame.
describe_demo=pd.DataFrame({'id':[1,2,np.nan],'sex':['male','female',np.nan]})
describe_demo.describe()

In [None]:
from sklearn.preprocessing import StandardScaler
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22;
# `SimpleImputer` is its replacement. Fall back to the old class so the
# notebook still runs on older installations.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# describe() on a mixed-dtype frame reports only the numeric columns, so its
# column index is used as the list of numeric features.
summary=passenger_train.describe()
num_cols=list(summary.columns)
# Every remaining column is treated as categorical. The frame's own column
# order is kept (a set difference has no defined order, which made the
# order of the encoded features vary between runs).
cat_cols=[c for c in passenger_train.columns if c not in num_cols]

# Numeric columns: select the column, fill gaps with the median, standardise.
num_pipelines=[(c,Pipeline([
    ('select',DataFrameSelecter([c])),
    ('fill',Imputer(strategy='median')),
    ('scale',StandardScaler()),
])) for c in num_cols]

# Categorical columns: select the column, then encode it with the
# project's LabelBinarizerEx.
cat_pipelines=[(c, Pipeline([
    ('select',DataFrameSelecter([c])),
    ('encode',LabelBinarizerEx()),
])) for c in cat_cols ]

# One sub-pipeline per column, combined by the project's DfPipeline.
full_pipeline=DfPipeline(num_pipelines+cat_pipelines)

prepared_passenger_train=full_pipeline.fit_transform(passenger_train)

prepared_passenger_train.head()


## model selection

### run the models with default hyper-parameters

In [None]:
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

clfs=[LogisticRegression(),SGDClassifier(), KNeighborsClassifier(), SVC(),GaussianNB(),MLPClassifier(),
      RandomForestClassifier(),GradientBoostingClassifier(),XGBClassifier()]
scores=[cross_val_score(clf,prepared_passenger_train,target,scoring='accuracy',cv=10,n_jobs=-1).mean() for clf in clfs]
%matplotlib inline
import matplotlib.pyplot as plt
from pylab import *

labels=[c.__class__.__name__[:3] for c in clfs]
X=np.arange(len(clfs))
bar(X,scores,tick_label=labels,color='rgb')
ylim(0.5,1.0)
show()
sorted(zip(labels,scores),key=lambda x:x[1],reverse=True)[:3]

## prepare the test set

In [None]:
# Load the test set and keep its id column aside before it is dropped.
passenger_test=pd.read_csv('test.csv')
test_id=passenger_test[id_col]
# The target column is taken out of the drop list before it is applied to
# the test frame. The membership check keeps the cell re-runnable: an
# unguarded remove() raises ValueError on the second execution.
if target_col in drop_cols:
    drop_cols.remove(target_col)
passenger_test.drop(drop_cols,axis=1,inplace=True)
# Reuse the pipeline fitted on the training data (transform only, no
# refit). The fitted object is `full_pipeline`; `full_pl` is not defined
# anywhere in the notebook.
prepared_passenger_test=full_pipeline.transform(passenger_test)