In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from sklearn.preprocessing import Imputer
%matplotlib inline

import sys
sys.path.append('../')
from pipeline import *
from onehot import *
from util import *
from null import *
from ordinal import *
from impute import *

In [None]:
# Load the raw train and test CSV files; the cell output previews the first training rows.
train_data,test_data=(
    pd.read_csv('raw_data/train.csv'),
    pd.read_csv('raw_data/test.csv'),
)
train_data.head()

In [None]:
# Summary statistics restricted to the object-dtype columns ('O') of the test set.
test_data.describe(include=['O'])

In [None]:
# Summary statistics restricted to the object-dtype columns ('O') of the training set.
train_data.describe(include=['O'])

In [None]:
# List the distinct raw 'AgeuponOutcome' values; this column is later split into a
# number and a unit to build the numeric 'Age' feature.
train_data['AgeuponOutcome'].unique()

In [None]:
# Multiplier applied to the numeric part of 'AgeuponOutcome', keyed by the first
# letter of the unit word that follows it.
AGE_DAYS={'y':365,'m':30,'w':7,'d':1}

def _second_token_or_whole(text):
    """Return the second whitespace-separated token, or ``text`` itself when it has exactly one token."""
    tokens=text.split()
    if len(tokens)==1:
        return text
    return tokens[1]

def _first_token_or_nan(text):
    """Return the first whitespace-separated token, or NaN when ``text`` has exactly one token."""
    tokens=text.split()
    if len(tokens)==1:
        return np.nan
    return tokens[0]

def _age_to_days(text):
    """Convert a '<count> <unit>' string to ``count * AGE_DAYS[first letter of unit]``."""
    tokens=text.split()
    count=int(tokens[0])
    return count*AGE_DAYS[tokens[1][0]]

for df in (train_data,test_data):
    # NOTE(review): MixImputer comes from the project's impute module; presumably it
    # fills missing values so the string parsing below never sees NaN - confirm.
    for raw_column in ('SexuponOutcome','AgeuponOutcome'):
        df[raw_column]=MixImputer().fit_transform(df[[raw_column]])
    sex_values=df['SexuponOutcome']
    df['Sex1']=sex_values.apply(_second_token_or_whole)
    df['Sex2']=sex_values.apply(_first_token_or_nan)
    df['Age']=df['AgeuponOutcome'].apply(_age_to_days)

    

In [None]:
    
# Target classes. NOTE(review): Ordinar presumably encodes each label as its position
# in this list (the submission cell compares predictions against these positions) - confirm.
outcome_types=['Adoption','Died','Euthanasia','Return_to_owner','Transfer']

def _binarized_feature(column):
    """Build the FeaturePipeline that applies LabelBinarizerEx to a single column."""
    return FeaturePipeline(column,'',Pipeline([('onehot',LabelBinarizerEx([column]))]))

# The target step must stay first: the test-time pipeline is built from every step after it.
# 'Breed' and 'Color' are currently not binarized.
train_pipeline=DataFramePipeline(
    [
        FeaturePipeline('OutcomeType','OutcomeType',Pipeline([('onehot',Ordinar(outcome_types))])),
        FeaturePipeline('Name','HasName',Pipeline([('name',NotNull())])),
    ]
    +[_binarized_feature(column) for column in ('AnimalType','Sex1','Sex2')]
)

train_p=train_pipeline.fit_transform(train_data)
train_p.head()

In [None]:
# Split the transformed frame into the target column and the model inputs.
# Everything listed here (identifiers, raw text columns and the target) is removed
# from the feature matrix.
non_feature_columns=[
    'AnimalID','Name','DateTime','OutcomeType','OutcomeSubtype','AnimalType',
    'SexuponOutcome','AgeuponOutcome','Breed','Color','Sex1','Sex2',
]
train_target=train_p['OutcomeType']
train_pr=train_p.drop(non_feature_columns,axis=1)
train_pr.head()

In [None]:
from sklearn.linear_model import LogisticRegression,SGDClassifier,LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Candidate classifiers compared by trainModels and tuned by tuneModels.
# The order matters: tuneModels pairs base_clfs[i] with param_grid_set[i], so an entry
# enabled or disabled here needs the matching grid enabled or disabled at the same position.
base_clfs=[
#       LogisticRegression(n_jobs=-1),
#       SVC(probability=True),
      RandomForestClassifier(n_jobs=-1),
#       GradientBoostingClassifier(),
#       AdaBoostClassifier(), 
#       ExtraTreesClassifier(n_jobs=-1), 
      XGBClassifier(),
]

from sklearn.model_selection import cross_val_score

%matplotlib inline
import matplotlib.pyplot as plt
from pylab import *

def trainModels(train_data, target):
    """Cross-validate every classifier in ``base_clfs`` and chart the mean accuracies.

    Each classifier is scored with 5-fold cross-validation (``scoring='accuracy'``);
    the fold scores are averaged, drawn as a bar chart and printed as
    ``(label, score)`` pairs sorted from best to worst.

    Parameters
    ----------
    train_data : feature matrix passed to ``cross_val_score``.
    target : target values passed to ``cross_val_score``.

    Returns
    -------
    None. The function only plots and prints.
    """
    scores=[cross_val_score(clf,train_data,target,scoring='accuracy',cv=5,n_jobs=-1,verbose=1).mean() for clf in base_clfs]

    # Bars are labelled with the first three letters of each classifier's class name.
    labels=[c.__class__.__name__[:3] for c in base_clfs]
    X=np.arange(len(base_clfs))
    # Pass the colours as an explicit list: a bare 'rgb' string used as a colour
    # sequence is rejected by current matplotlib releases.
    plt.bar(X,scores,tick_label=labels,color=['r','g','b'])
    plt.ylim(0.5,1.0)
    plt.show()
    print(sorted(zip(labels,scores),key=lambda x:x[1],reverse=True))
    
# trainModels(train_pr,train_target)
    

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grids for GridSearchCV, one per classifier.
# tuneModels looks these up by position (param_grid_set[i] goes with base_clfs[i]), so the
# active (uncommented) entries must stay in the same order as the active entries of base_clfs.
param_grid_set=[
#                 {'C':[0.01,0.1,0.5,1.]},
#                 {'C':[1.,10.,],'kernel':['rbf','poly'],'gamma':[0.01,0.1,1.],'coef0':[1.,10.,]},
                {'n_estimators':[100,300,500],'max_depth':[5,10,15]},
#                 {'learning_rate':[0.01,0.1,1.0],'n_estimators':[100,200,300],'max_depth':[3,5,8]},
#                 {'learning_rate':[0.01,0.1,1.0],'n_estimators':[100,200,300]},
#                 {'n_estimators':[50,100,200,300],'max_depth':[5,10,15]},
                {'learning_rate':[0.01,0.1,1.0],'n_estimators':[100,300,500],'max_depth':[5,10,15],'gamma':[0.01,0.1,0.5]},
               ]

def tuneModels(train_data,target):
    """Grid-search every classifier in ``base_clfs`` against its positional grid.

    The classifier at position ``k`` is tuned with ``param_grid_set[k]`` using
    5-fold cross-validated accuracy. The ``(best_estimator, best_score)`` pairs are
    printed sorted by score, best first.

    Returns
    -------
    list of ``(best_estimator_, best_score_)`` tuples, in ``base_clfs`` order.
    """
    tuned=[]
    for position,clf in enumerate(base_clfs):
        search=GridSearchCV(
            estimator=clf,
            param_grid=param_grid_set[position],
            scoring='accuracy',
            n_jobs=-1,
            verbose=1,
            cv=5,
        )
        search.fit(train_data,target)
        tuned.append((search.best_estimator_,search.best_score_))
    # Only the printed view is sorted; the returned list keeps the base_clfs order.
    ranking=sorted(tuned,key=lambda pair:pair[1],reverse=True)
    print(ranking)
    return tuned

# results=tuneModels(train_pr,train_target)
# best_clfs=[r[0] for r in results]

In [None]:
# Final model: a 500-tree random forest using all cores, fitted on the full training features.
forest_params={'n_estimators':500,'n_jobs':-1}
best_clf=RandomForestClassifier(**forest_params)
best_clf.fit(train_pr,train_target)

In [None]:
test_id=test_data['ID']
# Reuse the fitted feature steps but skip the first one, which encodes 'OutcomeType'
# (presumably absent from the test set - it is not among the test columns dropped below).
test_pipeline=DataFramePipeline(train_pipeline.pipelines[1:])
test_p=test_pipeline.transform(test_data)
test_pr=test_p.drop(['ID','Name','DateTime','AnimalType','SexuponOutcome','AgeuponOutcome','Breed','Color','Sex1','Sex2'],axis=1)
predicts=np.asarray(best_clf.predict(test_pr))
# Take an explicit copy so the new columns are written to an independent frame
# instead of a slice of test_data (avoids pandas' SettingWithCopyWarning).
submission=test_data[['ID']].copy()
# One 0/1 indicator column per outcome: 1 where the predicted class id equals the
# outcome's position in outcome_types.
for i,o in enumerate(outcome_types):
    submission[o]=(predicts==i).astype(int)
submission.to_csv('output/result.csv',index=False)