In [None]:
import time
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from sklearn.preprocessing import Imputer
%matplotlib inline

import sys
sys.path.append('../')
from pipeline import *
from onehot import *
from util import *
from null import *
from ordinal import *
from impute import *
from ensemble import *

import multiprocessing
# Keep one core free for the notebook/OS, but never go below one worker:
# on a single-core machine cpu_count()-1 is 0, and joblib (which backs
# scikit-learn's n_jobs) refuses n_jobs=0.
jobs=max(1,multiprocessing.cpu_count()-1)

In [None]:
# Read the raw training and test tables (relative paths).
train_data,test_data=[pd.read_csv(path) for path in ('raw_data/train.csv','raw_data/test.csv')]
# Last expression of the cell: preview the first training rows.
train_data.head()

In [None]:
# Summarise the object-dtype columns of the test set (count / unique / top / freq).
test_data.describe(include=['O'])

In [None]:
# Summarise the object-dtype columns of the training set (count / unique / top / freq).
train_data.describe(include=['O'])

In [None]:
# Day multipliers looked up by the first letter of an age unit (y/m/w/d);
# a month is approximated as 30 days and a year as 365 days.
AGE_DAYS={'y':365,'m':30,'w':7,'d':1}
                
def dict_f(x):
    """Split a '/'-separated, multi-word label into a flat list of word tokens.

    The label is first cut on '/', then every piece is cut on whitespace, so
    'Pit Bull/Labrador Retriever' becomes ['Pit', 'Bull', 'Labrador', 'Retriever'].
    Repeated words are kept, in their original order.

    Parameters
    ----------
    x : str
        Label to tokenise. Anything that is not a string (e.g. None or a
        float NaN standing in for a missing value) yields an empty list
        instead of raising AttributeError.

    Returns
    -------
    list of str
        The tokens in order of appearance; empty for '' or missing input.
    """
    if not isinstance(x,str):
        return []
    return [word for part in x.split('/') for word in part.split()]

# Work on copies so the raw frames stay untouched and this cell can be re-run.
train_p=train_data.copy()
test_p=test_data.copy()

# Date that 'InDays' is measured back from.
REFERENCE_DATE=np.datetime64('2016-02-22')

print(len(train_p.columns),len(test_p.columns))
for df in (train_p,test_p):
    # NOTE(review): a fresh MixImputer is fitted on each frame, so the test set is
    # imputed from its own values rather than from the training set - confirm this is intended.
    df['SexuponOutcome']=MixImputer().fit_transform(df[['SexuponOutcome']])
    df['AgeuponOutcome']=MixImputer().fit_transform(df[['AgeuponOutcome']])
    # Two-word values are split into Sex1 (second word) and Sex2 (first word);
    # a single-word value goes to Sex1 as-is and leaves Sex2 missing.
    df['Sex1']=df['SexuponOutcome'].apply(lambda x : x if len(x.split()) == 1 else x.split()[1])
    df['Sex2']=df['SexuponOutcome'].apply(lambda x : np.nan if len(x.split()) == 1 else x.split()[0])
    # '<number> <unit>' -> days; the unit is identified by its first letter.
    df['Age']=df['AgeuponOutcome'].apply(lambda s: int(s.split()[0])*AGE_DAYS[s.split()[1][0]])
    # Days between the record's timestamp and the reference date.
    df['InDays']=(REFERENCE_DATE-pd.DatetimeIndex(df['DateTime']).values)/np.timedelta64(1,'D')
    # Missing names count as length 0. Casting NaN straight to str would give the
    # literal 'nan' (length 3), indistinguishable from a real three-letter name.
    df['NameLen']=df['Name'].fillna('').astype('U').str.len()
    
def _token_columns(labels,prefix,vocabulary=None):
    """Build 0/1 token-indicator columns plus a token-count column for one text feature.

    Parameters
    ----------
    labels : pandas.Series
        Raw text values; each one is tokenised with ``dict_f``.
    prefix : str
        Feature name used to build the column names ('<prefix>_<token>' and '<prefix>_len').
    vocabulary : list of str, optional
        Tokens that receive an indicator column. When omitted it is collected
        from ``labels`` in order of first appearance. Pass the vocabulary
        returned for the training data to encode another frame with the same
        columns; tokens outside the vocabulary get no column.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The new columns (indexed like ``labels``) and the vocabulary used.
        '<prefix>_len' counts every token of the row, including repeats and
        tokens outside the vocabulary.
    """
    token_lists=[dict_f(label) for label in labels]
    if vocabulary is None:
        vocabulary,seen=[],set()
        for tokens in token_lists:
            for tok in tokens:
                if tok not in seen:
                    seen.add(tok)
                    vocabulary.append(tok)
    column_of={tok:j for j,tok in enumerate(vocabulary)}
    # Fill one dense matrix instead of writing cell by cell through DataFrame.loc.
    flags=np.zeros((len(token_lists),len(vocabulary)),dtype=np.int64)
    for row,tokens in enumerate(token_lists):
        for tok in tokens:
            j=column_of.get(tok)
            if j is not None:
                flags[row,j]=1
    block=pd.DataFrame(flags,index=labels.index,columns=[prefix+'_'+tok for tok in vocabulary])
    block[prefix+'_len']=[len(tokens) for tokens in token_lists]
    return block,vocabulary

TOKEN_FEATURES=('Breed','Color')

# Training frame: learn the token vocabulary of each feature and encode it.
t=time.time()
token_vocab={}
train_blocks=[]
for fe in TOKEN_FEATURES:
    encoded=_token_columns(train_p[fe],fe)
    train_blocks.append(encoded[0])
    token_vocab[fe]=encoded[1]
# Attach all new columns in one concat rather than inserting them one at a time.
train_p=pd.concat([train_p]+train_blocks,axis=1)

print('time: %ds'%int(time.time()-t))
t=time.time()

# Test frame: reuse the training vocabulary so both frames get the same
# columns in the same order; tokens seen only in the test data are ignored.
test_blocks=[_token_columns(test_p[fe],fe,token_vocab[fe])[0] for fe in TOKEN_FEATURES]
test_p=pd.concat([test_p]+test_blocks,axis=1)

print('time: %ds'%int(time.time()-t))            
print(len(train_p.columns),len(test_p.columns))

# Target classes, in the order handed to the Ordinar encoder.
OUTCOME_TYPES=['Adoption','Died','Euthanasia','Return_to_owner','Transfer']

# The target step must stay first: the test-time pipeline is built from pipelines[1:].
train_pipeline=DataFramePipeline(
    [
        FeaturePipeline('OutcomeType','OutcomeType',Pipeline([('onehot',Ordinar(OUTCOME_TYPES))])),
        FeaturePipeline('Name','HasName',Pipeline([('name',NotNull())])),
    ]
    # Same one-hot step for each categorical column.
    +[
        FeaturePipeline(col,'',Pipeline([('onehot',LabelBinarizerEx([col]))]))
        for col in ('AnimalType','Sex1','Sex2')
    ]
)

train_p=train_pipeline.fit_transform(train_p)
train_target=train_p['OutcomeType']

# Identifier, raw text, target and already-encoded source columns are not model inputs.
train_drop_columns=[
    'AnimalID','Name','DateTime','OutcomeType','OutcomeSubtype','AnimalType',
    'SexuponOutcome','AgeuponOutcome','Breed','Color','Sex1','Sex2',
]
train_pr=train_p.drop(train_drop_columns,axis=1)

In [None]:
from sklearn.linear_model import LogisticRegression,SGDClassifier,LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Candidate classifiers used by trainModels and tuneModels.
# tuneModels pairs base_clfs[i] with param_grid_set[i], so when an entry is
# (un)commented here the matching grid in param_grid_set must be toggled too.
base_clfs=[
#     LogisticRegression(n_jobs=jobs),
#     SVC(probability=True),
#     KNeighborsClassifier(n_jobs=jobs,n_neighbors=),
#     RandomForestClassifier(n_jobs=jobs),
#     GradientBoostingClassifier(),
#       AdaBoostClassifier(), 
#       ExtraTreesClassifier(n_jobs=-1), 
    XGBClassifier(n_jobs=jobs,random_state=0),
]

from sklearn.model_selection import cross_val_score

%matplotlib inline
import matplotlib.pyplot as plt
from pylab import *

def trainModels(train_data, target):
    """Cross-validate every classifier in ``base_clfs`` and chart the mean accuracies.

    Each estimator is scored with 3-fold cross-validation (accuracy). The mean
    scores are drawn as a bar chart, with each bar labelled by the first three
    letters of the estimator's class name, and the (label, score) pairs are
    printed from best to worst.

    Parameters
    ----------
    train_data : feature table passed to ``cross_val_score`` as X.
    target : labels passed to ``cross_val_score`` as y.

    Returns
    -------
    None
    """
    scores=[cross_val_score(clf,train_data,target,scoring='accuracy',cv=3,n_jobs=jobs,verbose=1).mean() for clf in base_clfs]

    labels=[c.__class__.__name__[:3] for c in base_clfs]
    X=np.arange(len(base_clfs))
    # Pass the colours as a list: a bare 'rgb' string is no longer accepted as a
    # sequence of single-letter colours by current matplotlib releases.
    plt.bar(X,scores,tick_label=labels,color=['r','g','b'])
    plt.show()
    print(sorted(zip(labels,scores),key=lambda x:x[1],reverse=True))
    
# trainModels(train_pr,train_target)

from sklearn.model_selection import GridSearchCV

# Hyper-parameter grids for GridSearchCV, one dict per classifier.
# tuneModels uses param_grid_set[i] for base_clfs[i], so the active
# (uncommented) entries here must line up with the active entries of base_clfs.
param_grid_set=[
#                 {'C':[0.01,0.1,0.5,1.]},
#                 {'C':[1.,10.,],'kernel':['rbf','poly'],'gamma':[0.01,0.1,1.],'coef0':[1.,10.,]},
#                 {'n_estimators':[100,500],'max_depth':[10,15]},
#                 {'learning_rate':[0.01,0.1,1.0],'n_estimators':[100,300],'max_depth':[5,8]},
#                 {'learning_rate':[0.01,0.1,1.0],'n_estimators':[100,200,300]},
#                 {'n_estimators':[50,100,200,300],'max_depth':[5,10,15]},
                {'learning_rate':[0.01,0.1,1.0],'n_estimators':[500,1000,2000],'gamma':[0.01,0.1,0.5]},
#     {'n_neighbors':[3,5,10]},
               ]

def tuneModels(train_data,target,cv=2):
    """Grid-search every classifier in ``base_clfs`` against its grid in ``param_grid_set``.

    Classifiers and grids are paired by position. Each pair is tuned with
    GridSearchCV (accuracy scoring) and the best estimator and score are kept.

    Parameters
    ----------
    train_data : feature table passed to ``GridSearchCV.fit`` as X.
    target : labels passed to ``GridSearchCV.fit`` as y.
    cv : int, optional
        Number of cross-validation folds (default 2, the value previously hard-coded).

    Returns
    -------
    list of (estimator, float)
        ``(best_estimator_, best_score_)`` per classifier, best score first.

    Raises
    ------
    ValueError
        If ``param_grid_set`` has fewer entries than ``base_clfs``. Checked up
        front so a missing grid is reported before any search has been run.
    """
    if len(param_grid_set)<len(base_clfs):
        raise ValueError('param_grid_set has %d grid(s) but base_clfs has %d classifier(s)'
                         %(len(param_grid_set),len(base_clfs)))
    results=[]
    for clf,param_grid in zip(base_clfs,param_grid_set):
        gs=GridSearchCV(estimator=clf,param_grid=param_grid,scoring='accuracy',n_jobs=jobs,verbose=1,cv=cv)
        gs.fit(train_data,target)
        results.append((gs.best_estimator_,gs.best_score_))
    results.sort(key=lambda pair:pair[1],reverse=True)
    return results

# Run the grid search on the prepared training features; results holds
# (best_estimator, best_score) pairs sorted by score, best first.
results=tuneModels(train_pr,train_target)
print(results)
    

In [None]:
# t=time.time()
# best_clf=GradientBoostingClassifier()
# best_clf.fit(train_pr,train_target)
# int(time.time()-t),best_clf.score(train_pr,train_target)

In [None]:
# base_clfs=[
# #     LogisticRegression(random_state=0,n_jobs=jobs),
# #     SVC(probability=True),
# #     SGDClassifier(loss='log'),
# #     MLPClassifier(),
# #     KNeighborsClassifier(n_jobs=jobs),
#     RandomForestClassifier(random_state=0,n_jobs=jobs),
# #     GradientBoostingClassifier(),
# #     AdaBoostClassifier(), 
#     ExtraTreesClassifier(random_state=0, n_jobs=jobs), 
#     XGBClassifier(random_state=0,n_jobs=jobs),
# ]

# from brew.base import Ensemble
# from brew.stacking import EnsembleStackClassifier,EnsembleStack
# import sklearn

# clfs=base_clfs
# layer_1 = Ensemble(clfs)
# layer_2 = Ensemble([LogisticRegression(random_state=0,n_jobs=jobs)])

# stack = EnsembleStack(cv=len(clfs))

# stack.add_layer(layer_1)
# stack.add_layer(layer_2)

# sclf = EnsembleStackClassifierEx(stack)
# sclf.fit(train_pr.values,train_target.astype('int'))

In [None]:
# from brew.base import Ensemble, EnsembleClassifier
# from brew.combination.combiner import Combiner
# en=Ensemble(base_clfs)
# eclf = EnsembleClassifier(ensemble=en, combiner=Combiner('mean'))
# eclf.fit(train_pr.values,train_target.astype('int'))

In [None]:
t=time.time()
# Hyper-parameters are fixed by hand here.
# NOTE(review): presumably taken from the grid search above - confirm they match its best result.
# random_state is pinned to the same seed as the tuned XGBClassifier so reruns are reproducible.
xgb=XGBClassifier(n_estimators=1000,gamma=0.1,learning_rate=0.1,n_jobs=jobs,random_state=0)
xgb.fit(train_pr,train_target)
# Elapsed seconds and accuracy on the training data itself (not a held-out estimate).
int(time.time()-t),xgb.score(train_pr,train_target)

In [None]:
test_id=test_data['ID']
# Reuse the already-fitted feature steps, skipping the first pipeline entry
# (the OutcomeType step in the list given to train_pipeline).
test_pipeline=DataFramePipeline(train_pipeline.pipelines[1:])
test_pp=test_pipeline.transform(test_p)
test_pr=test_pp.drop(['ID','Name','DateTime','AnimalType','SexuponOutcome','AgeuponOutcome','Breed','Color','Sex1','Sex2'],axis=1)
# Present exactly the training features, in training order. A missing feature
# fails here with a KeyError naming it, instead of inside the model.
test_pr=test_pr[train_pr.columns]
predicts=xgb.predict_proba(test_pr)
# NOTE(review): assumes the predict_proba columns follow OUTCOME_TYPES order, i.e. that
# Ordinar encodes each outcome as its position in that list - confirm.
submission=pd.concat([test_data[['ID']],pd.DataFrame(predicts,columns=OUTCOME_TYPES)],axis=1)
submission.to_csv('output/result.csv',index=False)