v1.0

The objective of this notebook is to:
- perform a hyper-parameter optimization/search


# Pre-checks

In [1]:
## Check for Google Drive Connectivity
try:
    from google.colab import drive
    drive.mount('/content/drive')
    google_env = True
except:
    print("Not a Goolge Drive Environment.")
    google_env = False

Not a Goolge Drive Environment.


In [2]:
import time
t_start = time.time()

In [3]:
# Use parameters to enable/disable the fastrun

FASTRUN = False
#FASTRUN = True

# Size of the fastrun dataframe
sample_fraction = 0.001
print("==="*40)
print("Fastrun enabled:", FASTRUN)
if FASTRUN: print("Sample size:",sample_fraction)
print("==="*40)


Fastrun enabled: False


# Setup Environment

## Install Modules

In [4]:
#!pip install watermark
#!pip install icecream

## Import Modules

In [5]:
# Base libraries
import os
from datetime import date

# Scientific libraries
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (12,8)

sns.set(rc={'figure.figsize':(12,8)})
sns.set(font_scale=0.8)

# Helper libraries
from watermark import watermark
from icecream import ic
import gc # garbage collection to optimize memory usage: use "gc.collect()"
import warnings
warnings.filterwarnings('ignore')

# Pandas options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Load magic commands
%load_ext watermark

## Global Parameters

In [6]:
seed = 42
cpu_count = os.cpu_count()
cpu_count = cpu_count-2  # to keep machine responsive when fitting the models
notebook_no = "09.01"
today = date.today()

## Global Helper Functions

In [7]:
import http.client
import urllib


def send_push(message):
    """Send push notifications to pushover service."""
    try:
        conn = http.client.HTTPSConnection("api.pushover.net:443")
        conn.request("POST", "/1/messages.json",
                     urllib.parse.urlencode({
                         "token": "ahs1q4mwpnxe3645zeaqzas69whq7a",  # ML Notifications Channel
                         "user": "u5vr1qkc9ghudg2ehuug153okeiz1d",
                         "message": message,
                     }), {"Content-type": "application/x-www-form-urlencoded"})

        conn.getresponse()

    except:
        print("There was a communication issue (pushover).")


In [8]:
def fast_run_sampling(df, sample_fraction):
    """Return a fraction of the dataset."""
    print("Labels before sampling:\n", df['label'].value_counts())
    df = df.groupby('label').sample(frac=sample_fraction, random_state=seed) 
    #df = df.sample(n=sample_size, random_state=seed).reset_index(drop=True)
    print("Labels after sampling:\n", df['label'].value_counts())
    return df
    

In [9]:
def calculate_duration(t_start, t_end):
    """Return run time in minutes."""
    total_runtime = t_end-t_start
    total_runtime_min = round((total_runtime/60), 2)
    print(str(total_runtime_min)+" minutes")

    return total_runtime_min


# 3.0 Load Data

In [10]:
# Load Data
if google_env:
    # Location for "shared with" people
    # create a shortcut of the shared folder in your Google Drive root folder
    ROOT_PATH = "/content/drive/MyDrive/SIADS696/Environment/"
else:
    ROOT_PATH = "../"
    print("Not a Google Drive Environment. Loading local files.")

PATH_DATA = "data/"
PATH_DATA_RAW = "data/raw/"
PATH_DATA_INT = "data/interim/"
PATH_DATA_PRO = "data/processed/"
PATH_DATA_MOD = "models/"
PATH_DATA_REP = "reports/"
PATH_DATA_FIG = "reports/figures/"
PATH_DATA_HTML = "reports/html/"

# Dictionary to store file names and their corresponding dataframes
files = {
    "train_features_clean_stats.parquet.gzip": "df_wiki_train_stats",
    "train_features_rs.parquet.gzip": "df_wiki_train_rs",
    "train_features_nltk.parquet.gzip": "df_wiki_train_nltk",
    "train_features_aoa.parquet.gzip": "df_wiki_train_aoa",
    "train_features_crb.parquet.gzip": "df_wiki_train_crb",
    "train_features_w2v.parquet.gzip": "df_wiki_train_w2v",
   # "train_mNB_predict.parquet.gzip": "df_wiki_train_MNB",  # data leakage risk
   # "test_mNB_predict.parquet.gzip": "df_wiki_test_MNB",    # data leakage risk
}

# Load data into corresponding dataframes
for file, df_name in files.items():
    if df_name not in globals():
        globals()[df_name] = pd.read_parquet(ROOT_PATH + PATH_DATA_INT + file)


Not a Google Drive Environment. Loading local files.


In [11]:
# Combine dataframes into one
df_train = pd.DataFrame()
df_train = pd.concat([df_train, df_wiki_train_stats], axis=1)
df_train = pd.concat([df_train, df_wiki_train_rs], axis=1)
df_train = pd.concat([df_train, df_wiki_train_nltk], axis=1)
df_train = pd.concat([df_train, df_wiki_train_aoa], axis=1)
df_train = pd.concat([df_train, df_wiki_train_crb], axis=1)
df_train = pd.concat([df_train, df_wiki_train_w2v], axis=1)

if FASTRUN:
    df_train = fast_run_sampling(df_train, sample_fraction)


In [12]:
df_train.shape

(416768, 155)

In [13]:
df_rfecv_selected_feature = pd.read_csv(ROOT_PATH+PATH_DATA_INT+"RFECV10-RF-selected_features-table_04.02.csv")
rfecv_selected_feature = sorted(df_rfecv_selected_feature['Selected Features'].to_list())
ic(len(rfecv_selected_feature));


ic| len(rfecv_selected_feature): 125


# Sensitivity Analysis

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from scipy.stats import randint

In [15]:
k=10

skfold = StratifiedKFold(n_splits=k)
scaler = MinMaxScaler((0, 1))
rf = RandomForestClassifier(random_state=seed, n_jobs=-1)

n_iter = 25


In [16]:
X = df_train[rfecv_selected_feature].copy()
y = df_train["label"]

ic(X.shape);
ic(y.shape);


ic| X.shape: (416768, 125)
ic| y.shape: (416768,)


In [17]:
pipeline = Pipeline([
                    ('scaler', scaler), 
                    ('estimator', rf)
                    ])

In [18]:
# Define the hyperparameter search space
param_dist = {
              'estimator__n_estimators': randint(5, 500),
              'estimator__max_features': randint(0, 100),
              'estimator__max_depth': randint(0, 100),
              'estimator__min_samples_split': randint(2, 10),
              'estimator__min_samples_leaf': randint(1, 10)
             }

# Perform the hyperparameter search
search = RandomizedSearchCV(estimator=pipeline,
                            param_distributions=param_dist,
                            n_iter=n_iter,
                            scoring='accuracy',
                            cv=skfold,
                            n_jobs=-1,
                            random_state=seed)

search.fit(X, y)

# Print the best hyperparameters
print("Best hyperparameters: ", search.best_params_)


Best hyperparameters:  {'estimator__max_depth': 59, 'estimator__max_features': 40, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 331}


In [27]:
search.cv_results_["mean_test_score"]

search.cv_results_["std_test_score"]

search.cv_results_["mean_score_time"]

search.cv_results_["mean_fit_time"]



array([0.73488848, 0.74302249, 0.71568115, 0.74575303, 0.75246901,
       0.73939458, 0.74867312, 0.72813892, 0.73261383, 0.75321762,
       0.75054947, 0.7380605 , 0.75152843, 0.63766891, 0.6434779 ,
       0.75136767, 0.73819247, 0.74885548, 0.7438455 , 0.75435014,
       0.66934362, 0.75262736, 0.75315283, 0.74420061, 0.68354096])

array([0.0009973 , 0.00114829, 0.00131603, 0.00120605, 0.00154979,
       0.00148411, 0.0018079 , 0.00164904, 0.00100702, 0.00114234,
       0.0015364 , 0.00092222, 0.00145344, 0.00144069, 0.00159127,
       0.00133126, 0.00151422, 0.00111317, 0.0012489 , 0.00156571,
       0.00131129, 0.00141643, 0.00099956, 0.00106013, 0.00075907])

array([ 2.06082366,  6.3625674 ,  9.49608564, 10.02527974, 12.76802027,
        6.89824882,  7.1766185 ,  8.09431489, 12.34022608, 12.57180426,
        9.58306122,  8.01294725, 12.54948637,  7.41659291,  6.80507097,
       10.53697166, 11.22436299, 10.55448537, 11.00303383, 14.58849776,
        8.43114121, 14.47506413, 11.44273713, 11.18075054,  6.01389396])

array([ 580.56726921, 2069.2504174 ,  180.07497413, 1392.23373737,
       1788.83149972, 1281.47413025, 1403.85281863,  104.00519035,
       5948.92213826, 4265.45588238, 1340.63982732,  635.59311628,
       7321.79248741,   27.2039243 ,   94.99676318, 6498.49542034,
        755.3448981 , 2353.02785373, 2705.82998261, 3773.29391091,
       1240.11255939, 6363.44450829, 2513.89781775, 2163.56747272,
       1599.37806726])

In [28]:
search.cv_results_.keys()
send_push("Hyper finished")
send_push(search.best_params_)

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_estimator__max_depth', 'param_estimator__max_features', 'param_estimator__min_samples_leaf', 'param_estimator__min_samples_split', 'param_estimator__n_estimators', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'split5_test_score', 'split6_test_score', 'split7_test_score', 'split8_test_score', 'split9_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score'])

In [30]:
# Extract the mean test scores and standard deviations for each set of hyperparameters
means = search.cv_results_['mean_test_score']
stds = search.cv_results_['std_test_score']

# Extract the values of the n_estimators hyperparameter for each set of hyperparameters
max_depth = search.cv_results_['param_estimator__max_depth'].data
n_estimators = search.cv_results_['param_estimator__n_estimators'].data
max_features = search.cv_results_['param_estimator__max_features'].data
estimator__min = search.cv_results_['param_estimator__min_samples_split'].data
estimator__min_samples_leaf = search.cv_results_['param_estimator__min_samples_leaf'].data

In [35]:
# Create DataFrame
result_df = pd.DataFrame(
    {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'max_features': max_features,
    'estimator__min' :estimator__min,
    'estimator__min_samples_leaf': estimator__min_samples_leaf,
    'accuracy': means, 
    'ACC STD': stds,
    }
)


#result_df['lower_band'] = result_df['Accuracy'] - result_df['STD']
#result_df['upper_band'] = result_df['Accuracy'] + result_df['STD']

result_df

Unnamed: 0,n_estimators,max_depth,max_features,estimator__min,estimator__min_samples_leaf,accuracy,ACC STD
0,25,51,92,6,8,0.734888,0.000997
1,104,82,86,6,8,0.743022,0.001148
2,262,23,2,6,6,0.715681,0.001316
3,196,87,29,3,6,0.745753,0.001206
4,318,59,20,5,1,0.752469,0.00155
5,63,21,88,4,1,0.739395,0.001484
6,59,41,91,5,3,0.748673,0.001808
7,139,63,2,4,5,0.728139,0.001649
8,392,20,72,3,7,0.732614,0.001007
9,269,88,59,3,2,0.753218,0.001142


In [36]:
result_df = result_df.dropna()
result_df

Unnamed: 0,n_estimators,max_depth,max_features,estimator__min,estimator__min_samples_leaf,accuracy,ACC STD
0,25,51,92,6,8,0.734888,0.000997
1,104,82,86,6,8,0.743022,0.001148
2,262,23,2,6,6,0.715681,0.001316
3,196,87,29,3,6,0.745753,0.001206
4,318,59,20,5,1,0.752469,0.00155
5,63,21,88,4,1,0.739395,0.001484
6,59,41,91,5,3,0.748673,0.001808
7,139,63,2,4,5,0.728139,0.001649
8,392,20,72,3,7,0.732614,0.001007
9,269,88,59,3,2,0.753218,0.001142


In [37]:
result_df.sort_values('accuracy', inplace=True)
result_df

Unnamed: 0,n_estimators,max_depth,max_features,estimator__min,estimator__min_samples_leaf,accuracy,ACC STD
13,110,3,1,7,6,0.637669,0.001441
14,48,3,53,3,2,0.643478,0.001591
20,140,8,87,5,1,0.669344,0.001311
24,411,11,33,9,1,0.683541,0.000759
2,262,23,2,6,6,0.715681,0.001316
7,139,63,2,4,5,0.728139,0.001649
8,392,20,72,3,7,0.732614,0.001007
0,25,51,92,6,8,0.734888,0.000997
11,39,59,70,8,8,0.738061,0.000922
16,219,47,14,7,8,0.738192,0.001514


In [39]:
t_end = time.time()
calculate_duration(t_start, t_end);


1333.9 minutes


# Export

In [None]:
if not FASTRUN:
    result_df.to_csv(ROOT_PATH+PATH_DATA_REP+f'{today}_optimize_25_iter_table_{notebook_no}.csv', index=False);


# Watermark

In [None]:
%watermark

Last updated: 2023-02-24T08:12:26.358248+01:00

Python implementation: CPython
Python version       : 3.10.8
IPython version      : 8.8.0

Compiler    : MSC v.1929 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 158 Stepping 13, GenuineIntel
CPU cores   : 8
Architecture: 64bit



In [None]:
%watermark --iversions

numpy     : 1.24.1
seaborn   : 0.12.2
pandas    : 1.5.3
matplotlib: 3.6.3



-----


In [40]:
output_file = f'{ROOT_PATH}{PATH_DATA_HTML}{today}_08.01_model_optimization_hyperparameter_optimization.html'
!jupyter nbconvert --to html "08.01_model_optimization_hyperparameter_optimization.ipynb" --output {output_file}

[NbConvertApp] Converting notebook 09.01_optimize.ipynb to html
[NbConvertApp] Writing 643578 bytes to ..\reports\html\2023-02-26_09.01_optimize_25_iter.html
