In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sys
import pickle
import time

## Read data and remove outliers

In [2]:
df = pd.read_csv("./data/df_slow.csv")
summary = df["query_time_ns"]\
    .describe(percentiles=[0.10, 0.20, 0.30, 0.40, 0.50, 0.60, 0.70, 0.80, 0.90, 0.95, 0.99, 1.00])
mask = (df["query_time_ns"]<summary["99%"])
sum(mask)

df = df[mask]

X = df["clean_query"]
y = df['query_time_ns']
print(df.shape, X.shape, y.shape)

(15375, 4) (15375,) (15375,)


# Feature Extraction TF-IDF 
TF-IDF works by penalizing the common words by assigning them lower 
weights while giving importance to words which are rare in the entire 
corpus but appear in good numbers in few documents.

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

For cross validation, here I keep the valid in the trains set, cv will automaticly seperate valid set for each iter

In [4]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.1)
# xvalid, xtest, yvalid, ytest = train_test_split(xvalid, yvalid, test_size=0.3)
xtrain.shape,  xtest.shape, ytrain.shape, ytest.shape

((13837,), (1538,), (13837,), (1538,))

### TfidfVectorizer have to fit only train data

In [6]:
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit(xtrain)

xtrain_tfidf = tfidf_vectorizer.transform(xtrain)
xtest_tfidf = tfidf_vectorizer.transform(xtest)

print("Number of features -tokens- in train set :", len(tfidf.get_feature_names()))

xtrain_tfidf.shape, xtest_tfidf.shape, ytrain.shape, ytest.shape

Number of features -tokens- in train set : 922


((13837, 922), (1538, 922), (13837,), (1538,))

In [43]:
import joblib

joblib_tfidf_vectorizer = "./models/tfidf_vectorizer.sav"
joblib.dump(tfidf_vectorizer, joblib_tfidf_vectorizer)

['./models/tfidf_vectorizer.sav']

Don't need standadization since "TfidfVectorizer combines all 
the options of CountVectorizer and TfidfTransformer in a single model

### Prepare a function to evaluate models

In [7]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

Create a function to evaluate each model. I will evaluate each model by size, prediction time, RMSE, MAE and r2 for train, validation and test sets.
The coefficient of determination: 1 is perfect prediction
R2  computes how much better the regression line fits the data 
than the mean line.
Another way to look at this formula is to compare the variance 
around the mean line to the variation around the regression line

In [8]:
def eval_model(model, x, y, data_type, prn = False):
    start = time.time()
    pred = model.predict(x)
    pred_time = np.round((time.time() - start),4)
    if prn:
        print(f"{data_type} - Root mean squared error     : {(mean_squared_error(y, pred, squared=False)):.2f}")
        print(f"{data_type} - Mean absolute error         : {(mean_absolute_error(y, pred)):.2f}")
        print(f"{data_type} - Coefficient of determination: {(r2_score(y, pred)):.2f}")
        print(f"{data_type} - Time elapsed                : {pred_time}\n")
    return np.round(mean_squared_error(y, pred, squared=False),1), \
            np.round(mean_absolute_error(y, pred),1), \
            np.round(r2_score(y, pred),2), \
            pred_time


In [9]:
df_performance = pd.read_csv("./data/df_performance.csv")
df_performance.set_index("Model", inplace=True)
df_performance

Unnamed: 0_level_0,Size,Train RMSE,Train MAE,Train R2,Time Elapsed,Valid RMSE,Valid MAE,Valid R2,Valid Elapsed,Test RMSE,Test MAE,Test R2,Test Elapsed
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
linreg,7480,114889.6,40494.6,0.98,0.002,131939.0,47322.9,0.97,0.0011,136721.9,50345.4,0.97,0.0006
svc_regr,7627,891923.7,425866.1,-0.29,0.001,887686.8,426950.8,-0.3,0.001,868634.7,421458.7,-0.31,0.0
rand_for_reg,24827326,68386.5,20902.1,0.99,0.1362,120462.3,36910.6,0.98,0.0531,127260.4,38965.4,0.97,0.0267
ada_boost_reg,12763,235785.3,150016.3,0.91,0.0104,241501.9,153238.2,0.9,0.001,237806.8,154167.0,0.9,0.0
grd_boost_reg,119711,109424.9,41811.4,0.98,0.0131,118453.6,44572.5,0.98,0.004,125485.8,46833.9,0.97,0.004
xgb_reg,218735,70951.6,25194.7,0.99,0.0076,122792.2,36972.1,0.98,0.004,132979.8,39993.4,0.97,0.001


### I picked 2 models to ensemble: XGBRegressor and RandomForestRegressor

In [10]:
from xgboost import XGBRegressor
xgb_reg = XGBRegressor()
xgb_reg.fit(xtrain_tfidf, ytrain)


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [11]:
from sklearn.ensemble import RandomForestRegressor
rand_forest_reg = RandomForestRegressor()
rand_forest_reg.fit(xtrain_tfidf, ytrain)


RandomForestRegressor()

In [12]:
def eval_ensamble_model(model1, model2, x, y, data_type, prn = False):
    start = time.time()
    pred1 = model1.predict(x)
    pred2 = model2.predict(x)
    pred = (pred1+pred2)/2
    pred_time = np.round((time.time() - start),4)
    if prn:
        print(f"{data_type} - Root mean squared error     : {(mean_squared_error(y, pred, squared=False)):.2f}")
        print(f"{data_type} - Mean absolute error         : {(mean_absolute_error(y, pred)):.2f}")
        print(f"{data_type} - Coefficient of determination: {(r2_score(y, pred)):.2f}")
        print(f"{data_type} - Time elapsed                : {pred_time}\n")
    return np.round(mean_squared_error(y, pred, squared=False),1), \
            np.round(mean_absolute_error(y, pred),1), \
            np.round(r2_score(y, pred),2), \
            pred_time

Ensemble does not improve the test r2

In [14]:
tr_rmse, tr_mae, tr_r2, tr_time_elapsed  = eval_ensamble_model(xgb_reg, rand_forest_reg,  xtrain_tfidf, ytrain, data_type="Train", prn = True)
test_rmse, test_mae, test_r2, test_time_elapsed  = eval_ensamble_model(xgb_reg, rand_forest_reg, xtest_tfidf, ytest, data_type="Test", prn = True)

Train - Root mean squared error     : 73120.51
Train - Mean absolute error         : 23735.29
Train - Coefficient of determination: 0.99
Train - Time elapsed                : 0.2188

Test - Root mean squared error     : 131031.85
Test - Mean absolute error         : 39456.78
Test - Coefficient of determination: 0.97
Test - Time elapsed                : 0.0484



### Grid search and Random Search on Cross Validation Sets for Best Parameters

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint


In [16]:
# XGBRegressor
params = {
 'max_depth':range(3,6,1),
 'min_child_weight':range(1,3,1)
}
xgb_reg_grid_searc = GridSearchCV(estimator = XGBRegressor( learning_rate =0.1, n_estimators=140, max_depth=5),
                                    param_grid = params, 
                                    scoring="r2",
                                    verbose=2
                                 )
# pprint(params)

In [17]:
xgb_reg_grid_searc.fit(xtrain_tfidf, ytrain)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ....................max_depth=3, min_child_weight=1; total time=   0.8s
[CV] END ....................max_depth=3, min_child_weight=1; total time=   0.8s
[CV] END ....................max_depth=3, min_child_weight=1; total time=   0.8s
[CV] END ....................max_depth=3, min_child_weight=1; total time=   0.8s
[CV] END ....................max_depth=3, min_child_weight=1; total time=   0.8s
[CV] END ....................max_depth=3, min_child_weight=2; total time=   0.8s
[CV] END ....................max_depth=3, min_child_weight=2; total time=   0.8s
[CV] END ....................max_depth=3, min_child_weight=2; total time=   0.8s
[CV] END ....................max_depth=3, min_child_weight=2; total time=   0.8s
[CV] END ....................max_depth=3, min_child_weight=2; total time=   0.8s
[CV] END ....................max_depth=4, min_child_weight=1; total time=   1.0s
[CV] END ....................max_depth=4, min_chi

GridSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=5, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=140, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parameters=None,
          

In [18]:
xgb_reg_grid_searc.best_params_

{'max_depth': 4, 'min_child_weight': 2}

Automaticly proceed to the model

In [25]:
max_depth = xgb_reg_grid_searc.best_params_["max_depth"]
min_child_weight = xgb_reg_grid_searc.best_params_["min_child_weight"]

In [27]:
xgb_model = XGBRegressor(max_depth = max_depth, min_child_weight = min_child_weight)
xgb_model.fit(xtrain_tfidf, ytrain)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=4,
             min_child_weight=2, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [28]:
import joblib

In [29]:
joblib_xgb_model = "./models/xgb_model.sav"
joblib.dump(xgb_model, joblib_xgb_model)

['./models/xgb_model.sav']

In [30]:
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

params = {'max_depth': max_depth,
          'min_samples_leaf': min_samples_leaf}

pprint(params)

{'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_leaf': [1, 2, 5, 10]}


In [32]:
# This time RandomizedSearchCV because there are too many params to search!
rand_forest_reg_grid_searc = RandomizedSearchCV(
                                estimator = RandomForestRegressor(), 
                                param_distributions = params, 
                                n_iter = 5, 
                                cv = 5, 
                                verbose=2)
rand_forest_reg_grid_searc.fit(xtrain_tfidf, ytrain)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END ...................max_depth=25, min_samples_leaf=2; total time=  20.8s
[CV] END ...................max_depth=25, min_samples_leaf=2; total time=  21.5s
[CV] END ...................max_depth=25, min_samples_leaf=2; total time=  21.6s
[CV] END ...................max_depth=25, min_samples_leaf=2; total time=  21.8s
[CV] END ...................max_depth=25, min_samples_leaf=2; total time=  21.7s
[CV] END ...................max_depth=15, min_samples_leaf=2; total time=  16.2s
[CV] END ...................max_depth=15, min_samples_leaf=2; total time=  17.3s
[CV] END ...................max_depth=15, min_samples_leaf=2; total time=  16.9s
[CV] END ...................max_depth=15, min_samples_leaf=2; total time=  16.3s
[CV] END ...................max_depth=15, min_samples_leaf=2; total time=  16.7s
[CV] END ...................max_depth=10, min_samples_leaf=5; total time=  10.7s
[CV] END ...................max_depth=10, min_sam

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=5,
                   param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],
                                        'min_samples_leaf': [1, 2, 5, 10]},
                   verbose=2)

In [33]:
rand_forest_reg_grid_searc.best_params_

{'min_samples_leaf': 2, 'max_depth': 15}

In [34]:
max_depth = rand_forest_reg_grid_searc.best_params_["max_depth"]
min_samples_leaf = rand_forest_reg_grid_searc.best_params_["min_samples_leaf"]

In [35]:
rand_forest_model = RandomForestRegressor(max_depth = max_depth, min_samples_leaf = min_samples_leaf)
rand_forest_model.fit(xtrain_tfidf, ytrain)

RandomForestRegressor(max_depth=15, min_samples_leaf=2)

In [36]:
joblib_rand_forest_model = "./models/rand_forest_model.sav"
joblib.dump(rand_forest_model, joblib_rand_forest_model)

['./models/rand_forest_model.sav']

### Ensemble Results with Optimized Parameters

In [37]:
tr_rmse, tr_mae, tr_r2, tr_time_elapsed  = eval_ensamble_model(xgb_model, rand_forest_model,  xtrain_tfidf, ytrain, data_type="Train", prn = True)
test_rmse, test_mae, test_r2, test_time_elapsed  = eval_ensamble_model(xgb_model, rand_forest_model, xtest_tfidf, ytest, data_type="Test", prn = True)

Train - Root mean squared error     : 84717.06
Train - Mean absolute error         : 28120.54
Train - Coefficient of determination: 0.99
Train - Time elapsed                : 0.148

Test - Root mean squared error     : 126089.19
Test - Mean absolute error         : 39514.49
Test - Coefficient of determination: 0.97
Test - Time elapsed                : 0.029



### Random Forest Regressor Results After Parameter Search : Single Model

In [40]:
model_name = "rand_forest_model"
p = pickle.dumps(eval(model_name))
model_size = sys.getsizeof(p)
print(f"{model_name} size                         : {model_size}")
tr_rmse, tr_mae, tr_r2, tr_time_elapsed  = eval_model(eval(model_name), xtrain_tfidf, ytrain, data_type="Train", prn=True)
test_rmse, test_mae, test_r2, test_time_elapsed  = eval_model(eval(model_name), xtest_tfidf, ytest, data_type="Test", prn=True)

rand_forest_model size                         : 8415139
Train - Root mean squared error     : 85607.89
Train - Mean absolute error         : 26903.64
Train - Coefficient of determination: 0.99
Train - Time elapsed                : 0.127

Test - Root mean squared error     : 127039.93
Test - Mean absolute error         : 40441.75
Test - Coefficient of determination: 0.97
Test - Time elapsed                : 0.022



### XGBRegressor Results After Parameter Search : Single Model

In [41]:
model_name = "xgb_model"
p = pickle.dumps(eval(model_name))
model_size = sys.getsizeof(p)
print(f"{model_name} size                         : {model_size}")
tr_rmse, tr_mae, tr_r2, tr_time_elapsed  = eval_model(eval(model_name), xtrain_tfidf, ytrain, data_type="Train", prn=True)
test_rmse, test_mae, test_r2, test_time_elapsed  = eval_model(eval(model_name), xtest_tfidf, ytest, data_type="Test", prn=True)

xgb_model size                         : 139305
Train - Root mean squared error     : 85736.23
Train - Mean absolute error         : 30218.58
Train - Coefficient of determination: 0.99
Train - Time elapsed                : 0.01

Test - Root mean squared error     : 129014.32
Test - Mean absolute error         : 39822.54
Test - Coefficient of determination: 0.97
Test - Time elapsed                : 0.004



## I will go with xgboost regressor. So I will use the entire data set to create a new model in a pipeline