## load train data

In [None]:
import pandas as pd
# Names of the label column and the row-identifier column in the CSV files.
target_col='Survived'
id_col='PassengerId'

# Read the training set, record its row count and keep the labels aside.
passenger_train=pd.read_csv('train.csv')
total_num=passenger_train.shape[0]
target=passenger_train[target_col]

## drop mostly-missing cols, id/target cols and free-text cols (Name, Ticket)

In [None]:
# Columns whose non-null count is less than half of the rows.
non_null_counts=passenger_train.count()
mostly_missing=non_null_counts[non_null_counts*2<len(passenger_train)]

# Also remove the identifier, the label and the two free-text columns.
drop_cols=list(mostly_missing.index)+[id_col,target_col,'Name','Ticket']
passenger_train.drop(drop_cols,axis=1,inplace=True)

## explore the data

In [None]:
%matplotlib inline
from matplotlib import pyplot

# Histogram of the known ages (rows with a missing Age are left out).
known_ages=passenger_train['Age'].dropna()
pyplot.hist(known_ages)
pyplot.show()

In [None]:
%matplotlib inline
from matplotlib import pyplot

# Histogram of fares below 200, so the few very large fares do not squash the plot.
fares_under_200=passenger_train.loc[passenger_train['Fare']<200,'Fare']
pyplot.hist(fares_under_200.dropna())
pyplot.show()

## feature engineering

In [None]:
import sys
sys.path.append('../')
from util import *
from label_binary import LabelBinarizerEx
from df_pipeline import DfPipeline

In [None]:
# Tiny scratch frame showing that describe() only reports the numeric column.
# It lives under its own name: binding it to `passenger_train` would replace
# the real training data that every later cell depends on.
describe_demo=pd.DataFrame({'id':[1,2,np.nan],'sex':['male','female',np.nan]})
describe_demo.describe()

In [None]:
from sklearn.preprocessing import StandardScaler
try:
    # scikit-learn >= 0.20 provides the imputer as sklearn.impute.SimpleImputer;
    # sklearn.preprocessing.Imputer was removed in 0.22.
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    # Older scikit-learn: fall back to the original class.
    from sklearn.preprocessing import Imputer

# describe() summarises only the numeric columns by default, so its columns
# serve as the list of numeric features.
summary=passenger_train.describe()

# One pipeline per numeric column: select it, fill gaps with the median, scale.
num_pipelines=[(c,Pipeline([
    ('select',DataFrameSelecter([c])),
    ('fill',Imputer(strategy='median')),
    ('scale',StandardScaler()),
])) for c in summary.columns]

# Every column describe() skipped is treated as categorical. The list is built
# in frame column order (not via a set difference) so the resulting feature
# order is identical on every run.
cat_cols=[c for c in passenger_train.columns if c not in summary.columns]
cat_pipelines=[(c, Pipeline([
    ('select',DataFrameSelecter([c])),
    ('encode',LabelBinarizerEx()),
])) for c in cat_cols ]

full_pipeline=DfPipeline(num_pipelines+cat_pipelines)

prepared_passenger_train=full_pipeline.fit_transform(passenger_train)

prepared_passenger_train.head()


In [None]:
# Load the test set and keep its ids for the submission file.
passenger_test=pd.read_csv('test.csv')
test_id=passenger_test[id_col]

# The test file has no label column, so drop everything in drop_cols except
# the target. A new list is built instead of calling drop_cols.remove(...),
# which mutated the shared list and raised ValueError when the cell was re-run.
test_drop_cols=[c for c in drop_cols if c!=target_col]
passenger_test=passenger_test.drop(test_drop_cols,axis=1)

# Reuse the pipeline fitted on the training data (it is named full_pipeline;
# `full_pl` was never defined).
prepared_passenger_test=full_pipeline.transform(passenger_test)

## gradient boosting tree

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

# Baseline: default gradient boosting, mean accuracy over 10-fold cross-validation.
gbc=GradientBoostingClassifier()
scores=cross_val_score(
    estimator=gbc,
    X=prepared_passenger_train,
    y=target,
    cv=10,
    scoring='accuracy',
)
scores.mean()


In [None]:
# Interactive reference only: prints the documentation of the gbc estimator.
help(gbc)

In [None]:
from sklearn.model_selection import validation_curve

# Train/validation accuracy of gbc for a single learning_rate value, 10-fold CV.
# NOTE: plotValidationCurve is defined in a later cell of this notebook
# (unless `from util import *` also provides it), so that cell must run first.
train_scores,test_scores=validation_curve(
    estimator=gbc,
    X=prepared_passenger_train,
    y=target,
    param_name='learning_rate',
    param_range=[0.1],
    cv=10,
    scoring='accuracy',
)
print(train_scores.mean(), test_scores.mean())
plotValidationCurve(train_scores,test_scores)

In [None]:
# `clf` only exists if the commented-out grid-search cell is enabled, so the
# prediction uses gbc instead. cross_val_score/validation_curve fit clones of
# the estimator, so gbc itself has to be fitted here before it can predict.
gbc.fit(prepared_passenger_train,target)
survived=gbc.predict(prepared_passenger_test)

# Write the submission: one row per test passenger id with the predicted label.
df=pd.DataFrame({id_col:test_id,target_col:survived})
df.to_csv('result.csv',index=False)

kaggle score:0.78469 rank: 3086

In [None]:
import time
# Current Unix timestamp in seconds, shown as the cell output.
time.time()

### tune params

In [None]:
# from sklearn.model_selection import GridSearchCV
# t1=time.time()
# param_grid={'max_depth':[2,3,5,8],'n_estimators':[50,100,150],'learning_rate':[0.1,0.01]}
# clf=GridSearchCV(GradientBoostingClassifier(),param_grid,scoring='accuracy',cv=10,n_jobs=4)
# clf.fit(prepared_passenger_train,target)
# print(clf.best_score_,clf.best_params_,time.time()-t1)

In [None]:
# survived=clf.predict(prepared_passenger_test)
# df=pd.DataFrame({id_col:test_id,target_col:survived})
# df.to_csv('result.csv',index=False)

kaggle score:0.78469 rank: 3073

## random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Baseline: default random forest, mean accuracy over 10-fold cross-validation.
rfc=RandomForestClassifier()
scores=cross_val_score(
    estimator=rfc,
    X=prepared_passenger_train,
    y=target,
    cv=10,
    scoring='accuracy',
)
scores.mean()



In [None]:
# Interactive reference only: prints the documentation of the rfc estimator.
help(rfc)

### tune params

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over forest size, features per split and tree depth,
# scored by accuracy with 10-fold CV; n_jobs=-1 uses all available processors.
params={
    'n_estimators':[10,30,50,100],
    'max_features':[4,6,8],
    'max_depth':[4,8,16],
}
gs=GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=params,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
gs.fit(prepared_passenger_train,target)

# From here on `rfc` is the best estimator found by the search.
rfc=gs.best_estimator_
print(gs.best_score_,gs.best_params_)

In [None]:
from sklearn.model_selection import validation_curve

# Train/validation accuracy of the tuned forest at n_estimators=50, 20-fold CV.
# NOTE: plotValidationCurve is defined in a later cell of this notebook
# (unless `from util import *` also provides it), so that cell must run first.
train_scores,test_scores=validation_curve(
    estimator=rfc,
    X=prepared_passenger_train,
    y=target,
    param_name='n_estimators',
    param_range=[50],
    cv=20,
    scoring='accuracy',
)
print(train_scores.mean(), test_scores.mean())
plotValidationCurve(train_scores,test_scores)

In [None]:
# Rank the features of the tuned forest from most to least important.
# The importances are kept attached to their column names in a Series:
# np.sort(list(zip(...)), 0) turned the pairs into a string array and sorted
# the two columns independently, which scrambled the importance/name pairing.
feature_scores=pd.Series(rfc.feature_importances_,index=prepared_passenger_train.columns)
feature_scores.sort_values(ascending=False)

In [None]:
# Predict the test labels with rfc and write the submission file:
# one row per test passenger id with its predicted label, no index column.
survived=rfc.predict(prepared_passenger_test)
df=pd.DataFrame({id_col:test_id,target_col:survived})
df.to_csv('result.csv',index=False)

kaggle score:0.79904 rank: 1381

## SVM

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Polynomial-kernel SVM; probability=True so it can later take part in soft voting.
# Alternative tried earlier: clf=LinearSVC(C=1,loss='hinge')
svc=SVC(kernel='poly',probability=True)

scores=cross_val_score(
    estimator=svc,
    X=prepared_passenger_train,
    y=target,
    cv=10,
    scoring='accuracy',
)
scores.mean()

## draw validation curve

In [None]:
from sklearn.model_selection import validation_curve

# ROC-AUC of svc at C=1 over 20 folds, plotted per fold.
# NOTE: plotValidationCurve is defined in the next cell of this notebook
# (unless `from util import *` also provides it), so that cell must run first.
train_scores,test_scores=validation_curve(
    estimator=svc,
    X=prepared_passenger_train,
    y=target,
    param_name='C',
    param_range=[1],
    cv=20,
    scoring='roc_auc',
)
plotValidationCurve(train_scores,test_scores)

In [None]:
%matplotlib inline
def plotValidationCurve(train_scores,test_scores):
    """Plot the first row of train and test scores against a 1-based position.

    Only index 0 of each argument is drawn; the x axis runs from 1 to the
    length of ``train_scores[0]``. The figure is shown and nothing is returned.
    """
    from matplotlib import pyplot

    positions=list(range(1,len(train_scores[0])+1))
    for all_scores,caption in ((train_scores,'train scores'),(test_scores,'test scores')):
        pyplot.plot(positions,all_scores[0],label=caption)
    pyplot.legend()
    pyplot.show()

In [None]:
import matplotlib.pyplot as plt

# Sweep gamma of a default SVC over five log-spaced values from 1e-6 to 1e-1
# and collect per-fold train and validation accuracy (10-fold CV).
param_range = np.logspace(-6, -1, 5)
train_scores, test_scores = validation_curve(
    SVC(), prepared_passenger_train, target, param_name="gamma", param_range=param_range,
    cv=10, scoring="accuracy", n_jobs=1)

# Mean and spread across the folds (axis 1) for each gamma value.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with SVM")
# Raw string: "\g" is not a valid escape sequence, so the non-raw literal
# triggered an invalid-escape warning on recent Python versions.
plt.xlabel(r"$\gamma$")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
# Draw each curve on a log x axis with a +/- one standard deviation band.
for curve_mean, curve_std, curve_label, curve_color in (
        (train_scores_mean, train_scores_std, "Training score", "darkorange"),
        (test_scores_mean, test_scores_std, "Cross-validation score", "navy")):
    plt.semilogx(param_range, curve_mean, label=curve_label,
                 color=curve_color, lw=lw)
    plt.fill_between(param_range, curve_mean - curve_std,
                     curve_mean + curve_std, alpha=0.2,
                     color=curve_color, lw=lw)
plt.legend(loc="best")
plt.show()

## use ensemble methods

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# Soft-voting ensemble of the three models, scored by mean 10-fold accuracy.
members=[('gbc',gbc),('rfc',rfc),('svc',svc)]
vote=VotingClassifier(estimators=members,voting='soft')
scores=cross_val_score(
    estimator=vote,
    X=prepared_passenger_train,
    y=target,
    cv=10,
    scoring='accuracy',
)
scores.mean()

## stacking

In [None]:
from brew.base import Ensemble, EnsembleClassifier
from brew.stacking.stacker import EnsembleStack, EnsembleStackClassifier
from brew.combination.combiner import Combiner

# Creating Ensemble
# Groups gbc, rfc and svc into one brew ensemble; Combiner('mean') presumably
# averages the member outputs - TODO confirm against the brew documentation.
ensemble = Ensemble([gbc, rfc, svc])
eclf = EnsembleClassifier(ensemble=ensemble, combiner=Combiner('mean'))

# from sklearn.model_selection import cross_val_score
# scores=cross_val_score(eclf,prepared_passenger_train,target,scoring='accuracy',cv=10)
# NOTE: the score below is computed on the same data the ensemble was fitted
# on, so it is a training score rather than a held-out estimate.
eclf.fit(prepared_passenger_train,target)
eclf.score(prepared_passenger_train,target)

In [None]:
# Predict with the averaged ensemble, cast every label to a plain int and
# write the submission file.
survived=list(map(int,eclf.predict(prepared_passenger_test)))
df=pd.DataFrame({id_col:test_id,target_col:survived})
df.to_csv('result.csv',index=False)

not better

In [None]:
# `sklearn` is never bound as a name in this notebook (only submodule members
# are imported), so clone is imported explicitly instead of sklearn.clone.
from sklearn.base import clone

# Creating Stacking
# First layer: the three base models. Second layer: a fresh, unfitted copy of gbc.
layer_1 = Ensemble([gbc, rfc, svc])
layer_2 = Ensemble([clone(gbc)])

stack = EnsembleStack(cv=3)

stack.add_layer(layer_1)
stack.add_layer(layer_2)

sclf = EnsembleStackClassifier(stack)
# The prepared training frame is named prepared_passenger_train;
# `prepared_passenger_train_data` was never defined.
sclf.fit(prepared_passenger_train,target)

# Predict, cast every label to a plain int and write the submission file.
survived=sclf.predict(prepared_passenger_test)
survived=[int(s) for s in survived]
df=pd.DataFrame({id_col:test_id,target_col:survived})
df.to_csv('result.csv',index=False)

not better

## xgboost

In [None]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# Baseline: default XGBoost classifier, mean accuracy over 10-fold cross-validation.
xgb=XGBClassifier()
scores=cross_val_score(
    estimator=xgb,
    X=prepared_passenger_train,
    y=target,
    cv=10,
    scoring='accuracy',
)
scores.mean()