In [1]:
### LIBRARIES AND FUNCTIONS ###

In [2]:
!pip install "numpy==1.21.6"
!pip install "pandas==1.3.5"
!pip install "xgboost==1.6.1"
!pip install "nltk==3.2.5"
!pip install "tensorflow==2.8.2"
!pip install "keras==2.8.0"
!pip install "keras-Preprocessing==1.1.2"
!pip install "scikit-learn==1.0.2"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import datetime
import regex as re
import random
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords as nltk_stopwords
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
def rmse(a, b):
    return mean_squared_error(a,b, squared = False)

In [5]:
### GET DATA ###

In [6]:
rdf = pd.read_csv('ask__2022-06-05__22-47-32.csv', escapechar='\\', delimiter= ',')
rdf.shape

(2511, 11)

In [7]:
data = rdf['Combine']
labels = rdf['Geometric_Mean']
labels_np = np.array(labels)

In [8]:
### DATA PROCESSING ###

In [9]:
# remove punctuation
data = data.apply(lambda x: re.sub(r'[^\w]', ' ', x))

In [10]:
# make lowercase
data = data.apply(lambda x: x.lower())

In [11]:
# remove stopwords
nltk.download('stopwords')
stopwords = nltk_stopwords.words('english')
data = data.apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
### TOKENIZATION ###

In [13]:
# we adjust the max tokenization length to the length of 99% of the data
# sort in ascending order with reverse = False 
lengths = sorted([len(elem) for elem in data], reverse = False)
index = int(len(lengths) * 0.99)
maxlen = lengths[index]

In [14]:
# we use the same tokenizer for training and test data
tokenizer = Tokenizer(oov_token = "<OOV>")
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
tokenized_data = pad_sequences(sequences, maxlen = maxlen, padding = 'post')

In [15]:
### TRAIN TEST SPLIT ###

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(tokenized_data, labels_np, test_size = 0.1)

In [17]:
### XGBOOST ###

In [18]:
xgb_reg = xgb.XGBRegressor()

In [19]:
hyperparameter_grid = {'learning_rate': [0.15, 0.155, 0.16],
                       'max_depth': [12, 13, 14],
                       'colsample_bytree': [0.20, 0.21, 0.22],
                       'gamma': [10],
                       'reg_lambda': [1, 1.1, 1.2],
                       'reg_alpha': [0],
                       'subsample': [1],
                       'min_child_weight': [1],
                       'n_estimators':[50]
                       }

In [20]:
rand_xgb_reg = GridSearchCV(estimator = xgb_reg, 
                            param_grid = hyperparameter_grid,
                            n_jobs = -1, 
                            cv = 5,
                            scoring = 'neg_root_mean_squared_error',
                            verbose = 1,
                            return_train_score = False)

In [21]:
grid_search_start = datetime.datetime.now()

In [22]:
rand_xgb_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    gamma=None, gpu_id=None, grow_policy=None,
                                    importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_bin=None,
                                    max_cat...
                                    n_jobs=None, num_parallel_tree=None,
                                    predictor=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None, ...),
             n_jobs=-1,
             

In [23]:
grid_search_end = datetime.datetime.now()
grid_search_duration = grid_search_end - grid_search_start

In [24]:
print(round(grid_search_duration.seconds/60,1), "minutes")

13.5 minutes


In [25]:
round(-rand_xgb_reg.best_score_,4)

8.4268

In [26]:
def best_hyperparameters(hyperparameter_grid):
    # this function prints the values from the hyperparameter search, for which hyperparameter_grid has more than one option
    var_hp = [x for x in hyperparameter_grid if len(hyperparameter_grid[x]) > 1]
    best_hyperparameters = {}
    for elem in rand_xgb_reg.best_params_:
        if elem in var_hp:
            best_hyperparameters[elem] = rand_xgb_reg.best_params_[elem]
    return best_hyperparameters

In [27]:
best_hyperparameters(hyperparameter_grid)

{'colsample_bytree': 0.2,
 'learning_rate': 0.155,
 'max_depth': 12,
 'reg_lambda': 1}

In [28]:
new_xgb_reg = xgb.XGBRegressor(colsample_bytree = rand_xgb_reg.best_params_['colsample_bytree'],
                               gamma = rand_xgb_reg.best_params_['gamma'],
                               learning_rate = rand_xgb_reg.best_params_['learning_rate'],
                               max_depth = rand_xgb_reg.best_params_['max_depth'],
                               min_child_weight = rand_xgb_reg.best_params_['min_child_weight'],
                               n_estimators = rand_xgb_reg.best_params_['n_estimators'],
                               reg_alpha = rand_xgb_reg.best_params_['reg_alpha'],
                               reg_lambda = rand_xgb_reg.best_params_['reg_lambda'],
                               subsample = rand_xgb_reg.best_params_['subsample'])

In [29]:
new_xgb_reg.fit(X_train, Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.2,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=10, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.155, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=12, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=50, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [30]:
predictions = new_xgb_reg.predict(X_test)
error = rmse(predictions, Y_test)
round(error,4)

7.9695

In [31]:
### RANDOMIZED SET RMSE ###

In [32]:
# we create a random distribution of data, to compare with the result of the model

In [33]:
size = Y_test.shape[0]
min = Y_test.min()
max = Y_test.max()
random_samples = np.random.uniform(low = min, high = max, size=(size,))

In [34]:
random_samples_error = rmse(random_samples,Y_test)
round(random_samples_error,2)

20.66