Grid Search

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, classification_report, roc_auc_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

### Load training set

In [2]:
train_data = pd.read_csv('data/modelling/train_data_14_features.csv')
train_data

Unnamed: 0,user_id,product_id,up_product_cnt,up_reorder_ratio,up_mean_cart_position,up_mean_days_between_orders,up_mean_order_dow,up_mean_order_hod,u_total_orders,u_mean_products,u_mean_days_between_orders,u_mean_order_dow,p_mean_cart_position,p_order_count,p_reorder_ratio,p_mean_days_between_orders,reordered
0,1,196,10,0.900000,1.400000,17.600000,2.500000,10.300000,10,5.900000,17.272727,2.636364,3.721774,35791,0.776480,11.195803,1.0
1,1,10258,9,0.888889,3.333333,19.555556,2.555556,10.555556,10,5.900000,17.272727,2.636364,4.277492,1946,0.713772,11.099692,1.0
2,1,10326,1,0.000000,5.000000,28.000000,4.000000,15.000000,10,5.900000,17.272727,2.636364,4.191097,5526,0.652009,11.177705,0.0
3,1,12427,10,0.900000,3.300000,17.600000,2.500000,10.300000,10,5.900000,17.272727,2.636364,4.760037,6476,0.740735,9.955837,0.0
4,1,13032,3,0.666667,6.333333,21.666667,2.666667,8.000000,10,5.900000,17.272727,2.636364,5.622767,3751,0.657158,10.616636,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8474656,206209,43961,3,0.666667,8.000000,23.333333,1.666667,13.000000,13,9.923077,17.142857,3.000000,9.194723,55371,0.630583,10.353831,0.0
8474657,206209,44325,1,0.000000,8.000000,9.000000,2.000000,13.000000,13,9.923077,17.142857,3.000000,10.109900,3485,0.401148,11.569010,0.0
8474658,206209,48370,1,0.000000,8.000000,30.000000,4.000000,10.000000,13,9.923077,17.142857,3.000000,8.344942,3934,0.699288,10.382308,0.0
8474659,206209,48697,1,0.000000,6.000000,9.000000,2.000000,13.000000,13,9.923077,17.142857,3.000000,8.763058,9783,0.357661,10.430645,0.0


### Establish Features for the model

In [None]:
# Set feature and target variables
X = train_data.drop(['reordered', 
                     'p_mean_days_between_orders', 
                     'up_mean_cart_position', 
                     'up_mean_order_hod', 'up_mean_order_dow'], axis=1)
y = train_data.reordered

# Create the train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

### Search parameters

In [6]:
# Create the parameter grid: gbm_param_grid
gbm_param_grid = {
    'n_estimators': [50, 100, 200],
    'eta': [0.05, 0.1, 0.3],
    'max_depth': [2, 3, 8]
}

# Instantiate the classifier
xgb_cl = xgb.XGBClassifier()

# Perform grid search: grid_mse
grid_prc = GridSearchCV(estimator=xgb_cl, param_grid=gbm_param_grid, scoring='f1', cv=3, verbose=1)


# Fit grid_mse to the data
grid_prc.fit(X_train,y_train)

# Print the best parameters and lowest RMSE
print("Best parameters found: ", grid_prc.best_params_)
print("Best score found: ", np.sqrt(np.abs(grid_prc.best_score_)))
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters found:  {'eta': 0.3, 'max_depth': 8, 'n_estimators': 200}
Best score found:  0.4689287969978886



Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters found:  {'eta': 0.3, 'max_depth': 8, 'n_estimators': 200}
Best score found:  0.4689287969978886
