In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.metrics import accuracy_score, recall_score, precision_score, \
f1_score, roc_curve, auc, roc_auc_score, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from collections import OrderedDict, Counter
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

# Apply seaborn's plotting defaults notebook-wide, with fonts scaled by 1.2.
sns.set(font_scale=1.2)


In [2]:
# Read the pickled table stored under processing/ into df_hod.
# NOTE: pickle.load can execute arbitrary code - only open files you trust.
hod_pickle_path = 'processing/0806_1pm_df_hod.pkl'
with open(hod_pickle_path, 'rb') as hod_pickle:
    df_hod = pickle.load(hod_pickle)

In [3]:
# Build the training inputs from the whole of df_hod (no hold-out split here):
# `reordered` is the target; it and the two id columns are excluded from the
# feature matrix, and the remaining column labels are kept as feature_names.
target_and_id_cols = ['reordered', 'user_id', 'product_id']

X_train = df_hod.drop(columns=target_and_id_cols)
y_train = df_hod['reordered']
feature_names = X_train.columns

In [4]:
# Sanity check: the feature matrix and the target should have the same row count.
X_train.shape, y_train.shape

((7685556, 26), (7685556,))

In [5]:
# Hyper-parameters of the model that is trained on the full data set.
# NOTE(review): the fit cell that follows uses no early stopping, so exactly
# n_estimators trees are built; 339 is presumably the best iteration found by
# an earlier early-stopped run - confirm where the value comes from.
xgb_params = dict(
    n_estimators=339,
    max_depth=4,
    objective="binary:logistic",
    learning_rate=0.025,
    subsample=0.7,
    min_child_weight=1,
    colsample_bytree=0.6,
)
gbm = xgb.XGBClassifier(**xgb_params)

In [6]:
# Train on the full training set and keep the value returned by fit()
# (the fitted estimator, displayed in a later cell) as fit_model.
fit_model = gbm.fit(X_train, y_train)

In [7]:
# Persist the fitted classifier to xgb_model.pkl in the working directory.
model_pickle_path = 'xgb_model.pkl'
with open(model_pickle_path, 'wb') as model_file:
    pickle.dump(fit_model, model_file)

In [9]:
# Persist the feature column labels next to the model so that a consumer can
# rebuild the feature matrix with the same columns.
names_pickle_path = 'feature_names.pkl'
with open(names_pickle_path, 'wb') as names_file:
    pickle.dump(feature_names, names_file)

In [8]:
# Display the fitted estimator's repr to record the full parameter set used.
fit_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0, learning_rate=0.025,
       max_delta_step=0, max_depth=4, min_child_weight=1, missing=None,
       n_estimators=339, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.7)

In [2]:
# List the saved artefacts.
# NOTE(review): the dump cells above write to the current working directory,
# while this lists result/ - confirm the files were moved there (or that the
# notebook runs from inside result/). This cell's execution count is also out
# of sequence with the cells above, so it was apparently run separately.
!ls result

feature_names.pkl  xgb_model.pkl


In [None]:
def cv_xgb(df, cv=5, n_estimators=30000, max_depth=3, objective="binary:logistic", learning_rate=.1, subsample=1,
           min_child_weight=1, colsample_bytree=.8, v=False, random_state=None):
    """Cross-validate an XGBoost classifier with folds split by user.

    The unique ``user_id`` values (not the rows) are partitioned into ``cv``
    shuffled folds, so all rows of a user fall entirely on the training side
    or entirely on the validation side of a fold. For every fold a fresh
    ``xgb.XGBClassifier`` is fit with early stopping on validation AUC and
    then scored on the validation rows: recall, precision and F1 of the
    default ``predict`` output, ROC AUC, and the highest F1 reachable over
    the thresholds of the precision-recall curve. F1 on the training rows is
    recorded as an overfitting check.

    NOTE: the validation fold is also the early-stopping set, so the
    validation scores are somewhat optimistic.
    NOTE(review): ``eval_metric``/``early_stopping_rounds`` passed to
    ``fit`` and ``ntree_limit``/``best_ntree_limit`` follow the older
    xgboost sklearn API this notebook was run with - confirm before
    upgrading xgboost.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``product_id`` and the target column
        ``reordered``; all remaining columns are used as features.
    cv : int
        Number of folds.
    n_estimators, max_depth, objective, learning_rate, subsample,
    min_child_weight, colsample_bytree
        Forwarded unchanged to ``xgb.XGBClassifier``. ``n_estimators`` acts
        as an upper bound because training stops after 50 rounds without
        improvement of the validation AUC.
    v : bool
        Verbose mode: draw ROC, precision-recall and F1-vs-threshold curves
        per fold and print per-fold best F1 plus mean/std recall and
        precision.
    random_state : int or None
        Seed for the fold shuffle. ``None`` (default) keeps the previous
        unseeded behaviour; pass an int for reproducible folds.

    Returns
    -------
    tuple
        ``(gbm, feature_names, mean_highest_f1)``: the classifier fitted on
        the last fold, the feature column labels, and the mean over folds of
        the best threshold-tuned F1.
    """
    non_feature_cols = ['reordered', 'user_id', 'product_id']

    rec_list = []
    prec_list = []
    f1_list = []
    auc_list = []
    f1_train = []   # training-set F1, to see whether the model overfits
    highest_f1 = []

    if v:
        fig, ax = plt.subplots(1, 3, figsize=(15, 5))

    # The unique-user array is identical for every fold, so compute it once.
    users = df.user_id.unique()
    kf = KFold(n_splits=cv, shuffle=True, random_state=random_state)

    for fold, (user_train_ind, user_val_ind) in enumerate(kf.split(users), start=1):
        train_rows = df[df.user_id.isin(users[user_train_ind])]
        val_rows = df[df.user_id.isin(users[user_val_ind])]

        y_train = train_rows['reordered']
        X_train = train_rows.drop(columns=non_feature_cols)
        feature_names = X_train.columns
        y_val = val_rows['reordered']
        X_val = val_rows.drop(columns=non_feature_cols)

        gbm = xgb.XGBClassifier(
            n_estimators=n_estimators,   # upper bound; early stopping decides
            max_depth=max_depth,
            objective=objective,
            learning_rate=learning_rate,
            subsample=subsample,
            min_child_weight=min_child_weight,
            colsample_bytree=colsample_bytree,
        )
        # The last entry of eval_set (the validation fold) drives early stopping.
        gbm.fit(
            X_train, y_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            eval_metric='auc',
            early_stopping_rounds=50,
            verbose=False,
        )
        y_pred = gbm.predict(X_val, ntree_limit=gbm.best_ntree_limit)
        y_pred_train = gbm.predict(X_train, ntree_limit=gbm.best_ntree_limit)

        rec_list.append(recall_score(y_val, y_pred))
        prec_list.append(precision_score(y_val, y_pred))
        f1_list.append(f1_score(y_val, y_pred))
        f1_train.append(f1_score(y_train, y_pred_train))

        y_pred_prob = gbm.predict_proba(X_val, ntree_limit=gbm.best_ntree_limit)[:, 1]
        fpr, tpr, _ = roc_curve(y_val, y_pred_prob, pos_label=1)
        auc_list.append(auc(fpr, tpr))

        # precision_recall_curve returns (precision, recall, thresholds) in
        # that order; unpacking them the other way round swaps the plot axes
        # and the printed recall/precision values.
        prec, rec, thres = precision_recall_curve(y_val, y_pred_prob, pos_label=1)

        # F1 at every threshold where it is defined (both terms non-zero).
        # zip stops at the shorter thresholds array, dropping the curve's
        # final point, which has no threshold.
        f1s = [(2 * r * p / (r + p), t, r, p)
               for r, p, t in zip(rec, prec, thres) if r and p]
        # No usable threshold (e.g. the fold has no true positives): report
        # an F1 of 0 instead of failing on an empty list.
        best = max(f1s) if f1s else (0.0, float('nan'), 0.0, 0.0)
        highest_f1.append(best[0])

        if v:
            ax[0].plot(fpr, tpr, label='ROC Fold %d' % fold)
            ax[1].plot(rec, prec, label='Fold %d' % fold)
            print('Fold %d highest F1: %.3f at threshold p= %.3f. Recall is %.3f, precision is %.3f'
                  % (fold, *best))
            if f1s:
                f1_values, f1_thresholds = list(zip(*f1s))[:2]
                ax[2].plot(f1_thresholds, f1_values, label='Fold %d' % fold)

    if v:
        # Every panel carries labelled lines, so each one gets its own legend.
        for axis in ax:
            axis.legend()
        ax[0].set_xlabel('FPR')
        ax[0].set_ylabel('TPR')
        ax[0].set_title('ROC Curve')
        ax[1].set_xlabel('Recall')
        ax[1].set_ylabel('Precision')
        ax[1].set_title('Precision-Recall Curve')
        ax[2].set_xlabel('P threshold')
        ax[2].set_ylabel('F1 score')
        ax[2].set_title('F1 Score vs. Positive Class Decision Probability Threshold')

        print('mean recall: %.3f, std recall: %.3f' % (np.mean(rec_list), np.std(rec_list)))
        print('mean precision: %.3f, std precision: %.3f' % (np.mean(prec_list), np.std(prec_list)))

    mean_highest_f1 = np.mean(highest_f1)
    print('mean highest_f1: %.3f, std highest_f1: %.3f' % (mean_highest_f1, np.std(highest_f1)))
    print('mean F1: %.3f, std F1: %.3f' % (np.mean(f1_list), np.std(f1_list)))
    print('mean AUC: %.3f, std AUC: %.3f' % (np.mean(auc_list), np.std(auc_list)))
    print('mean train F1: %.3f, std train F1: %.3f' % (np.mean(f1_train), np.std(f1_train)))
    return gbm, feature_names, mean_highest_f1