## load train data

In [None]:
import pandas as pd
# Column names referenced by the later cells.
id_col='PassengerId'
target_col='Survived'

# Read the training set and keep the label column aside before any columns are dropped.
passenger_train=pd.read_csv('train.csv')
target=passenger_train[target_col]
total_num=passenger_train.shape[0]

## drop mostly-empty cols, plus the id, target, Name and Ticket cols

In [None]:
# Non-null count per column; a column is flagged when fewer than half of its rows are filled.
c=passenger_train.count()
is_sparse=c*2<len(passenger_train)
drop_cols=c[is_sparse].index.tolist()
# The id, the label and the 'Name' / 'Ticket' columns are removed as well.
drop_cols.extend([id_col,target_col,'Name','Ticket'])
passenger_train.drop(drop_cols,axis='columns',inplace=True)

## explore the data

In [None]:
%matplotlib inline
from matplotlib import pyplot

# Draw a 20-bin histogram grid for the frame's columns on a 20x15 figure, then render it.
passenger_train.hist(bins=20,figsize=(20,15))
pyplot.show()

In [None]:
import sys
sys.path.append('../')
from util import *
from label_binary import LabelBinarizerEx
from df_pipeline import DfPipeline
from binning import Binner

## feature engineering

In [None]:
from sklearn.preprocessing import Imputer,StandardScaler

def featuring(df, num_cols, cat_cols, bin_cols=None):
    """Build one preprocessing pipeline per column, fit them on ``df`` and return the result.

    Parameters
    ----------
    df
        Input frame handed to ``DfPipeline.fit_transform``.
    num_cols
        Column names that are median-imputed and then standardized.
    cat_cols
        Column names that are encoded with ``LabelBinarizerEx``.
    bin_cols
        Optional iterable of ``(column_name, transformer)`` pairs. Each column is
        median-imputed and then passed to its own transformer. ``None`` (the default)
        means that no column is binned, so callers that only have numeric and
        categorical columns can omit the argument.

    Returns
    -------
    Whatever ``DfPipeline.fit_transform`` returns for ``df``.
    """
    # NOTE(review): ``Imputer`` was removed from sklearn.preprocessing in scikit-learn 0.22;
    # newer environments need ``sklearn.impute.SimpleImputer`` imported under this name.
    if bin_cols is None:
        bin_cols=[]

    num_pipelines=[(c,Pipeline([
        ('select',DataFrameSelecter([c])),
        ('fill',Imputer(strategy='median')),
        ('scale',StandardScaler()),
    ])) for c in num_cols]

    cat_pipelines=[(c, Pipeline([
        ('select',DataFrameSelecter([c])),
        ('encode',LabelBinarizerEx()),
    ])) for c in cat_cols ]

    # Each entry is indexed as (column_name, transformer): c[0] names the column and
    # c[1] is the final pipeline step.
    bin_pipelines=[(c[0], Pipeline([
        ('select',DataFrameSelecter([c[0]])),
        ('fill',Imputer(strategy='median')),
        ('bin',c[1]),
    ])) for c in bin_cols ]

    full_pipeline=DfPipeline(num_pipelines+cat_pipelines+bin_pipelines)

    prepared_df=full_pipeline.fit_transform(df)

    return prepared_df

### train with raw models

In [None]:
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import cross_val_score

%matplotlib inline
import matplotlib.pyplot as plt
from pylab import *

def trainRawModels(train_data, target):
    """Cross-validate a fixed list of untuned classifiers and report their mean accuracy.

    Parameters
    ----------
    train_data
        Feature matrix passed to ``cross_val_score`` as ``X``.
    target
        Labels passed to ``cross_val_score`` as ``y``.

    Side effects
    ------------
    Shows a bar chart with one bar per classifier (y axis limited to 0.5-1.0) and
    prints the three best ``(label, score)`` pairs. Nothing is returned.
    """
    clfs=[LogisticRegression(),SGDClassifier(), KNeighborsClassifier(), SVC(),GaussianNB(),MLPClassifier(),
          RandomForestClassifier(),GradientBoostingClassifier(),XGBClassifier()]
    # Score the data handed in by the caller instead of a notebook-level global, so the
    # function evaluates exactly what it was asked to evaluate.
    scores=[cross_val_score(clf,train_data,target,scoring='accuracy',cv=10,n_jobs=-1).mean() for clf in clfs]

    # The first three letters of the class name serve as the tick label.
    labels=[clf.__class__.__name__[:3] for clf in clfs]
    X=np.arange(len(clfs))
    # An explicit list of single-letter colours replaces the packed string 'rgb'.
    bar(X,scores,tick_label=labels,color=['r','g','b'])
    ylim(0.5,1.0)
    show()
    print(sorted(zip(labels,scores),key=lambda x:x[1],reverse=True)[:3])

## use only the numeric features

In [None]:
# describe() reports the numeric columns; only those are used as features here.
summary=passenger_train.describe()

# bin_cols is passed explicitly as an empty list: nothing is binned at this stage.
prepared_passenger_train = featuring(passenger_train,summary.columns,[],[])
prepared_passenger_train.head()

In [None]:
# Run trainRawModels on the numeric-only features prepared in the previous cell.
trainRawModels(prepared_passenger_train,target)

## binarize category features

In [None]:
summary=passenger_train.describe()
# Every column that describe() does not report is treated as categorical. Iterating over
# the frame's columns (instead of a set difference) keeps the feature order stable
# between runs.
described_names=set(summary.columns)
cat_cols=[c for c in passenger_train.columns if c not in described_names]
# bin_cols is passed explicitly as an empty list: nothing is binned at this stage.
prepared_passenger_train = featuring(passenger_train,summary.columns,cat_cols,[])
prepared_passenger_train.head()

In [None]:
# Run trainRawModels on the numeric + categorical features prepared in the previous cell.
trainRawModels(prepared_passenger_train,target)

### binarize Pclass 

In [None]:
summary=passenger_train.describe()
num_cols=list(summary.columns)
# Pclass is taken out of the numeric group so that it is encoded as a category instead.
num_cols.remove('Pclass')

# Work on a copy so the string conversion does not alter passenger_train itself.
passenger_train2=passenger_train.copy()
passenger_train2['Pclass']=passenger_train2['Pclass'].astype(str)

# Everything that is not numeric is categorical; iterating over the frame's columns
# (instead of a set difference) keeps the feature order stable between runs.
cat_cols=[c for c in passenger_train2.columns if c not in num_cols]

# bin_cols is passed explicitly as an empty list: nothing is binned at this stage.
prepared_passenger_train=featuring(passenger_train2,num_cols,cat_cols,[])

prepared_passenger_train.head()


In [None]:
# Run trainRawModels on the features prepared with Pclass handled as a category.
trainRawModels(prepared_passenger_train,target)

## discretize num features

In [None]:
# Print the frame summary (columns, non-null counts, dtypes) before choosing what to bin.
passenger_train.info()

In [None]:
# Histogram of ceil(log10(Fare + 10)), i.e. Fare bucketed by order of magnitude.
# With the +10 offset a fare of 0 maps to log10(10) = 1 rather than to -inf.
pyplot.hist(np.ceil(np.log10(passenger_train['Fare']+10)))
pyplot.show()

In [None]:
# Age values at 7 evenly spaced quantile levels between 0.1 and 1.0; displayed as the cell output.
ages=passenger_train['Age'].quantile(np.linspace(0.1,1,7))
ages

In [None]:
# Fare values at 5 evenly spaced quantile levels between 0.1 and 1.0; displayed as the cell output.
fares=passenger_train['Fare'].quantile(np.linspace(0.1,1,5))
fares

In [None]:
# (column_name, transformer) pairs handed to featuring() as bin_cols.
# NOTE(review): the numbers given to Binner are presumably bin edges - confirm in binning.py.
bin_cols=[('Age',Binner([14.,35.,50.])),
          ('Fare',Binner([8.,14.,31.,66.])),
         ]

summary=passenger_train.describe()
described_names=list(summary.columns)
binned_names={pair[0] for pair in bin_cols}

# Categorical = every column describe() does not report. The binned columns are reported
# by describe(), so they do not end up here. Iterating over the frame's columns keeps
# the feature order stable between runs.
cat_cols=[c for c in passenger_train.columns if c not in described_names]
# Numeric = the described columns minus the ones that get binned. A plain filter is used
# instead of a list comprehension that only exists for its remove() side effects.
num_cols=[c for c in described_names if c not in binned_names]

prepared_passenger_train = featuring(passenger_train,num_cols,cat_cols,bin_cols)
prepared_passenger_train.head()

In [None]:
# Run trainRawModels on the features prepared with Age and Fare binned.
trainRawModels(prepared_passenger_train,target)

## test it

In [None]:
passenger_test=pd.read_csv('test.csv')
# Keep the ids aside before the id column is dropped.
test_id=passenger_test[id_col]
# The target column is left out of the test-time drop list (presumably test.csv has no
# such column - confirm). A filtered copy is used instead of drop_cols.remove(), so
# drop_cols is not mutated and this cell can be re-run without a ValueError.
test_drop_cols=[c for c in drop_cols if c!=target_col]
passenger_test.drop(test_drop_cols,axis=1,inplace=True)
# NOTE(review): full_pl is not defined anywhere in the visible notebook. The pipeline
# fitted inside featuring() is not returned, so this line raises NameError unless
# full_pl is created elsewhere - confirm and expose the fitted pipeline.
prepared_passenger_test=full_pl.transform(passenger_test)