v1.2

The objective of this notebook is to: 
- get a baseline from various models
- identify the best performing model(s)


# Pre-Checks

In [1]:
## Check for Google Drive Connectivity
# Outside Google Colab the import fails (and inside Colab the mount itself may
# fail); in both cases the notebook falls back to local files.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    google_env = True
except Exception:  # not a bare `except:` so KeyboardInterrupt/SystemExit still propagate
    print("Not a Google Drive Environment.")
    google_env = False

Not a Goolge Drive Environment.


In [2]:
import time
# Wall-clock start of the run; read again at the end of the notebook by calculate_duration().
t_start = time.time()

In [3]:
# Fast-run switch: set FASTRUN to True to work on a sample of the training data.
FASTRUN = False
#FASTRUN = True

# Fraction of rows kept per label when the fast run is enabled
sample_fraction = 0.25

# Echo the active configuration between two separator lines
banner = "===" * 40
print(banner)
print(f"Fastrun enabled: {FASTRUN}")
if FASTRUN:
    print(f"Sample size: {sample_fraction}")
print(banner)

Fastrun enabled: False


# Setup Environment

## Install Modules

In [4]:
# Required to document module versions
#!pip install catboost
#!pip install xgboost
#!pip install icecream
#!pip install watermark
#!pip install fastparquet

## Import Modules

In [5]:
# Base libraries
import os
from datetime import date

# Scientific libraries
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
# Default figure size (inches) and a slightly reduced font scale for all plots
sns.set(rc={'figure.figsize':(15,8)})
sns.set(font_scale=0.8)

# Helper libraries
from watermark import watermark
from icecream import ic
import gc # garbage collection to optimize memory usage: use "gc.collect()"
import warnings
# NOTE(review): this silences every warning, including library deprecation warnings
warnings.filterwarnings('ignore')

# Pandas options: None removes the respective display limit (no truncation of
# rows, columns, line width or cell content)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Load magic commands
%load_ext watermark

## Global Parameters

In [6]:
# sklearn
seed = 42  # shared random_state for sampling and estimators

# Leave two cores free to keep the machine responsive when fitting the models.
# os.cpu_count() may return None, and on machines with one or two cores the
# plain subtraction would yield -1 (interpreted as "all cores" by n_jobs) or
# 0 (rejected by joblib), so the value is clamped to at least one worker.
cpu_count = max(1, (os.cpu_count() or 1) - 2)
notebook_no = "03.01"

## Global Helper Functions

In [7]:
import http.client
import os
import urllib.parse


def send_push(message):
    """Send a push notification through the Pushover service (best effort).

    The credentials are read from the environment variables
    ``PUSHOVER_TOKEN`` (application token) and ``PUSHOVER_USER`` (user key)
    instead of being stored in the notebook. If either one is missing the
    notification is skipped.

    Parameters
    ----------
    message : str
        Text of the notification.

    Returns
    -------
    None
        Problems are reported on stdout and never raised, so a failed
        notification cannot abort a long-running cell.
    """
    token = os.environ.get("PUSHOVER_TOKEN")
    user = os.environ.get("PUSHOVER_USER")
    if not token or not user:
        print("Pushover credentials not configured "
              "(PUSHOVER_TOKEN / PUSHOVER_USER); notification skipped.")
        return

    conn = None
    try:
        # The timeout keeps an unreachable service from stalling the notebook
        conn = http.client.HTTPSConnection("api.pushover.net:443", timeout=10)
        conn.request("POST", "/1/messages.json",
                     urllib.parse.urlencode({
                         "token": token,
                         "user": user,
                         "message": message,
                     }), {"Content-type": "application/x-www-form-urlencoded"})

        # Read the response so the exchange is completed before closing
        conn.getresponse().read()

    except Exception:
        print("There was a communication issue (pushover).")

    finally:
        if conn is not None:
            conn.close()


In [8]:
def fast_run_sampling(df, sample_fraction):
    """Return a per-label random sample of ``df`` for fast runs.

    The same fraction of rows is drawn from every ``label`` group (seeded
    with the notebook-level ``seed``), so the label proportions of the input
    are kept. The label counts before and after sampling are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``label`` column.
    sample_fraction : float
        Fraction of rows to keep from each label group.

    Returns
    -------
    pandas.DataFrame
        The sampled rows.
    """
    counts_before = df['label'].value_counts()
    print("Labels before sampling:\n", counts_before)

    # Sample inside each label group rather than over the whole frame
    per_label = df.groupby('label')
    sampled = per_label.sample(frac=sample_fraction, random_state=seed)

    counts_after = sampled['label'].value_counts()
    print("Labels after sampling:\n", counts_after)

    return sampled


In [9]:
def calculate_duration(t_start, t_end):
    """Print and return the elapsed time between two timestamps in minutes.

    Parameters
    ----------
    t_start, t_end : float
        Timestamps in seconds (as returned by ``time.time()``).

    Returns
    -------
    float
        ``(t_end - t_start) / 60`` rounded to two decimals.
    """
    elapsed_minutes = round((t_end - t_start) / 60, 2)
    print(f"{elapsed_minutes} minutes")

    return elapsed_minutes


# Load Data

In [10]:
# Load Data
# Project root: the shared Google Drive folder when mounted, otherwise the
# parent directory of the notebook.
if not google_env:
    ROOT_PATH = "../"
    print("Not a Google Drive Environment. Loading local files.")
else:
    # Location for "shared with" people
    # create a shortcut of the shared folder in your Google Drive root folder
    ROOT_PATH = "/content/drive/MyDrive/SIADS696/Environment/"

# Project sub-folders, relative to ROOT_PATH
PATH_DATA = "data/"
PATH_DATA_RAW = PATH_DATA + "raw/"
PATH_DATA_INT = PATH_DATA + "interim/"
PATH_DATA_PRO = PATH_DATA + "processed/"
PATH_DATA_MOD = "models/"
PATH_DATA_REP = "reports/"

# (tag used in the file name, tag used in the dataframe name) per feature set
# The "mNB_predict" files are deliberately left out: data leakage risk
feature_sets = [
    ("clean_stats", "stats"),
    ("rs", "rs"),
    ("nltk", "nltk"),
    ("aoa", "aoa"),
    ("crb", "crb"),
    ("w2v", "w2v"),
]

# Dictionary mapping each parquet file name to the name of its dataframe
files = {
    f"{split}_features_{file_tag}.parquet.gzip": f"df_wiki_{split}_{frame_tag}"
    for file_tag, frame_tag in feature_sets
    for split in ("train", "test")
}

# Load every file into a notebook-level variable of the mapped name; frames
# that already exist in the namespace are not read again.
interim_dir = ROOT_PATH + PATH_DATA_INT
notebook_namespace = globals()
for file, df_name in files.items():
    if df_name in notebook_namespace:
        continue
    notebook_namespace[df_name] = pd.read_parquet(interim_dir + file)


Not a Google Drive Environment. Loading local files.


In [11]:
# Combine the per-feature-set training frames column-wise into one frame.
# A single concat replaces the chain of incremental concats that started from
# an empty DataFrame and re-copied the growing frame for every feature set.
df_train = pd.concat(
    [
        df_wiki_train_stats,
        df_wiki_train_rs,
        df_wiki_train_nltk,
        df_wiki_train_aoa,
        df_wiki_train_crb,
        df_wiki_train_w2v,
    ],
    axis=1,
)

# Optionally reduce the data to a per-label sample for quick experiments
if FASTRUN:
    df_train = fast_run_sampling(df_train, sample_fraction)


In [12]:
# Log the shape of the combined training frame; the trailing ';' keeps the
# cell from additionally echoing the expression value.
ic(df_train.shape);

ic| df_train.shape: (416768, 155)


In [13]:
# Test: Keep only the rows with at least 140 non-NA values.
# Did not improve the RF accuracy score: 0.7421853156160447
# Most-likely there are too many incorrect labels in the raw source file

# df_train_nona = df_train.copy()
# df_train_nona = df_train_nona.replace(-1, np.nan)
# df_train_nona = df_train_nona.dropna(thresh=140)
# ic(df_train_nona.shape);
# df_train = df_train_nona.copy()
# df_train = df_train.replace(np.nan,-1)

# 4.0 Base Models

In [14]:
# Model selection
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

# Scaling and pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# Linear models
from sklearn.linear_model import LogisticRegression

# Linear/Stochastic Gradient Descent
from sklearn.linear_model import SGDClassifier 

# Tree models
from sklearn.tree import DecisionTreeClassifier

# NB models
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

# Neighbors models
from sklearn.neighbors import KNeighborsClassifier

# Ensemble models
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


In [15]:
# Baseline candidates: display name -> unfitted estimator.
# The display names are used as labels in the printed results and the exported
# score table. NOTE(review): 'Random Forerst' is misspelled; it is kept as-is
# because the label also appears in previously exported reports.
model_pipeline = {
    'Random Forerst': RandomForestClassifier(random_state=seed, n_jobs=cpu_count),
    #'Extra Trees': ExtraTreesClassifier(),
    #'XGBoost': XGBClassifier(),
    #'CatBoost': CatBoostClassifier(logging_level='Silent', random_seed=seed),
    'Logistic Regression': LogisticRegression(random_state=seed, n_jobs=cpu_count, max_iter=3000),
    'Gaussian NB': GaussianNB(),
    'Multinomial NB': MultinomialNB(),
    # Seeded like the other stochastic estimators so repeated runs give the same tree
    'Decision Tree': DecisionTreeClassifier(random_state=seed),
    'KNeighbors': KNeighborsClassifier(),
    #'SGDC': SGDClassifier(),
}


In [16]:
# Columns excluded from the feature matrix (the target plus three non-feature columns)
non_feature_columns = {"label", "original_text", "cleaned_text", "stats_language_code"}

# Keep the frame's own column order. Building the list from a set made the
# order depend on string hashing, i.e. it could differ between kernel
# sessions, and seeded models are only reproducible with a stable feature
# order. dict.fromkeys drops repeated names (as the set did) but keeps order.
X_columns_train = [
    column for column in dict.fromkeys(df_train.columns)
    if column not in non_feature_columns
]

X = df_train[X_columns_train].copy()
y = df_train["label"]

ic(X.shape);
ic(y.shape);


ic| X.shape: (416768, 151)
ic| y.shape: (416768,)


In [17]:
# Cross-validation setup used by the evaluation cells below
k = 10  # number of stratified folds
skfold = StratifiedKFold(n_splits=k)
rf = RandomForestClassifier(n_jobs=-1, random_state=seed)
# Scale every feature to the [0, 1] range
scaler = MinMaxScaler(feature_range=(0, 1))


In [18]:
%%time
gc.collect()

acc_list = []
roc_auc_list = []
recall_list = []
precision_list = []
f1_list = []
std_acc_list = []
lower_bound_list = []
upper_bound_list = []
conf_int_list = []

scoring_list = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

for name, model in model_pipeline.items():
    gc.collect()
    print("Cross-validating:", name)

    pipeline = Pipeline([('scaling', scaler), ('estimator', model)])
    cv_results = cross_validate(pipeline, X, y, cv=skfold, scoring=scoring_list, 
                                return_estimator =True, error_score='raise', n_jobs=-1)

    acc_list.append(np.mean(cv_results['test_accuracy']))

    roc_auc_list.append(np.mean(cv_results['test_roc_auc']))
    recall_list.append(np.mean(cv_results['test_recall']))
    precision_list.append(np.mean(cv_results['test_precision']))
    f1_list.append(np.mean(cv_results['test_f1']))

    # Calculate mean and standard deviation of accuracy scores
    mean_acc = np.mean(cv_results['test_accuracy'])
    std_acc = np.std(cv_results['test_accuracy'])
    std_acc_list.append(std_acc)

    # Calculate 95% confidence interval of accuracy scores
    conf_int = 1.96 * std_acc / np.sqrt(len(cv_results['test_accuracy']))
    conf_int_list.append(conf_int)

    lower_bound = mean_acc - conf_int
    upper_bound = mean_acc + conf_int

    lower_bound_list.append(lower_bound)
    upper_bound_list.append(upper_bound)

    if not FASTRUN:
        message = (
            f"Cross-validation accuracy score base model {name}:", (np.mean(cv_results['test_accuracy'])))
        send_push(message)

    print(np.mean(cv_results['test_accuracy']));


Cross-validating: Random Forerst
0.7488842722662712
Cross-validating: Logistic Regression
0.6659556366675254
Cross-validating: Gaussian NB
There was a communication issue (pushover).
0.6199348256420776
Cross-validating: Multinomial NB
0.6132572455172494
Cross-validating: Decision Tree
0.6940456113270318
Cross-validating: KNeighbors
0.6673448992712002
CPU times: user 3.69 s, sys: 15.3 s, total: 19 s
Wall time: 18min 55s


Previous run (with rounded stats_ and rs_ features):
- Cross-validating: Random Forerst: 0.7488626809609666
- Cross-validating: Extra Trees: 0.7482388326232383
- Cross-validating: XGBoost: 0.6963466506894254
- Cross-validating: CatBoost: 0.7025179460647804
- Cross-validating: Logistic Regression: 0.6655021483839874
- Cross-validating: Gaussian NB: 0.6199996100938174
- Cross-validating: Multinomial NB: 0.6134731927109776
- Cross-validating: Decision Tree: 0.694031224396862
- CPU times: user 7.55 s, sys: 33 s, total: 40.5 s
- Wall time: 36min 29s

In [19]:
# Notify only for full runs; fast runs are short enough to watch directly
if not FASTRUN:
    send_push("Cross-validation (with CV) base models finished.")


There was a communication issue (pushover).


In [20]:
def style_highlight_max(s):
    """Styler column function that highlights the maximum of a numeric column.

    Parameters
    ----------
    s : pandas.Series
        One column of the styled frame.

    Returns
    -------
    list of str
        One CSS declaration per cell: ``'background: lightblue'`` for every
        cell equal to the column maximum, ``''`` for all other cells.
        Non-numeric columns (e.g. the model names) are never highlighted.
    """
    # The former check `s.dtype == np.object` raises AttributeError on
    # NumPy >= 1.24, where the `np.object` alias no longer exists.
    if not pd.api.types.is_numeric_dtype(s):
        return ['' for _ in range(s.shape[0])]

    is_max = s == s.max()
    return ['background: lightblue' if cell else '' for cell in is_max]


In [21]:
# Collect the cross-validation means into one row per model
result_df = pd.DataFrame({'Model': list(model_pipeline.keys()),
                          'Accuracy': acc_list,
                          'Acc STD': std_acc_list,
                          'Acc CI': conf_int_list,
                          'Recall': recall_list,
                          'Precision': precision_list, 'F1': f1_list, 'ROC AUC': roc_auc_list})

# Render the table: centered cells, best value per metric highlighted, four
# decimals, caption below the table and no index column.
# Styler.hide(axis="index") replaces Styler.hide_index(), which was deprecated
# in pandas 1.4 and removed in pandas 2.0.
result_df.style\
    .set_properties(**{'text-align': 'center'})\
    .apply(style_highlight_max)\
    .format(
        {
            'Accuracy': '{:,.4f}',
            'Acc STD': '{:,.4f}',
            'Acc CI': '{:,.4f}',
            'Recall': '{:,.4f}',
            'Precision': '{:,.4f}',
            'F1': '{:,.4f}',
            'ROC AUC': '{:,.4f}',
        })\
    .set_caption(f'<br>Cross-Validation Scores with CV={k}<br>Fastrun enabled:{FASTRUN}')\
    .set_table_styles([{
        'selector': 'caption',
        'props': 'caption-side: bottom; font-size:0.9em;'}], overwrite=False)\
    .hide(axis="index")



Model,Accuracy,Acc STD,Acc CI,Recall,Precision,F1,ROC AUC
Random Forerst,0.7489,0.0011,0.0007,0.7636,0.7418,0.7525,0.8437
Logistic Regression,0.666,0.0015,0.001,0.6734,0.6635,0.6684,0.7207
Gaussian NB,0.6199,0.0017,0.0011,0.7476,0.5955,0.663,0.6655
Multinomial NB,0.6133,0.0017,0.001,0.5418,0.6322,0.5835,0.669
Decision Tree,0.694,0.0014,0.0009,0.6908,0.6953,0.693,0.6954
KNeighbors,0.6673,0.001,0.0006,0.6919,0.6595,0.6753,0.7324


## Voting

The Voting Classifier supports two types of voting:
- **Hard Voting**: In hard voting, the predicted output class is the class that receives the majority of the votes, i.e. the class predicted by the largest number of classifiers. Suppose three classifiers predict the output classes (A, A, B): the majority predicted A, hence A is the final prediction.

- **Soft Voting**: In soft voting, the output class is the class with the highest predicted probability averaged over all classifiers. Suppose that, for some input, three models predict the probabilities (0.30, 0.47, 0.53) for class A and (0.20, 0.32, 0.40) for class B. The average is 0.4333 for class A and 0.3067 for class B, so the winner is class A because it has the highest average probability.

In [22]:
# Classifiers for voting classifier
gc.collect()
# Every member is seeded with the shared `seed`; previously only the random
# forest was, so repeated runs could produce different ensembles and scores.
clf1 = ExtraTreesClassifier(random_state=seed)
clf2 = CatBoostClassifier(logging_level='Silent', random_seed=seed)
clf3 = RandomForestClassifier(random_state=seed, n_jobs=cpu_count)
clf4 = XGBClassifier(random_state=seed)

# Soft voting: the ensemble averages the class probabilities of its members
eclf1 = VotingClassifier(estimators=[('ExTrees', clf1), ('CatBoost', clf2), ('RF', clf3), ('XGB', clf4)], voting='soft')

53

In [23]:
%%time
gc.collect()

acc_list = []
roc_auc_list = []
recall_list = []
precision_list = []
f1_list = []
std_acc_list = []
lower_bound_list = []
upper_bound_list = []
conf_int_list = []


pipeline = Pipeline([('scaling', scaler), ('estimator', eclf1)])
cv_results = cross_validate(pipeline, X, y, cv=skfold, scoring=scoring_list, 
                                return_estimator =True, error_score='raise', n_jobs=-1)

acc_list.append(np.mean(cv_results['test_accuracy']))

roc_auc_list.append(np.mean(cv_results['test_roc_auc']))
recall_list.append(np.mean(cv_results['test_recall']))
precision_list.append(np.mean(cv_results['test_precision']))
f1_list.append(np.mean(cv_results['test_f1']))

# Calculate mean and standard deviation of accuracy scores
mean_acc = np.mean(cv_results['test_accuracy'])
std_acc = np.std(cv_results['test_accuracy'])
std_acc_list.append(std_acc)

# Calculate 95% confidence interval of accuracy scores
conf_int = 1.96 * std_acc / np.sqrt(len(cv_results['test_accuracy']))
conf_int_list.append(conf_int)

lower_bound = mean_acc - conf_int
upper_bound = mean_acc + conf_int

lower_bound_list.append(lower_bound)
upper_bound_list.append(upper_bound)

KeyboardInterrupt: 

In [None]:
# Single-row table with the cross-validation means of the voting ensemble
vc_result_df = pd.DataFrame({'Model': "Voting CLF",
                          'Accuracy': acc_list,
                          'Acc STD': std_acc_list,
                          'Acc CI': conf_int_list,
                          'Recall': recall_list,
                          'Precision': precision_list, 'F1': f1_list, 'ROC AUC': roc_auc_list})

# Render the table: centered cells, four decimals, caption below the table and
# no index column.
# Styler.hide(axis="index") replaces Styler.hide_index(), which was deprecated
# in pandas 1.4 and removed in pandas 2.0.
vc_result_df.style\
    .set_properties(**{'text-align': 'center'})\
    .format(
        {
            'Accuracy': '{:,.4f}',
            'Acc STD': '{:,.4f}',
            'Acc CI': '{:,.4f}',
            'Recall': '{:,.4f}',
            'Precision': '{:,.4f}',
            'F1': '{:,.4f}',
            'ROC AUC': '{:,.4f}',
        })\
    .set_caption(f'<br>Cross-Validation Scores with CV={k}<br>Fastrun enabled:{FASTRUN}')\
    .set_table_styles([{
        'selector': 'caption',
        'props': 'caption-side: bottom; font-size:0.9em;'}], overwrite=False)\
    .hide(axis="index")

Model,Accuracy,Acc STD,Acc CI,Recall,Precision,F1,ROC AUC
Voting CLF,0.7587,0.0015,0.0009,0.7835,0.7465,0.7645,0.8454


In [None]:
# Total runtime since t_start was taken at the top of the notebook; the
# trailing ';' keeps the cell from also echoing the returned number of minutes.
t_end = time.time()
calculate_duration(t_start, t_end);


55.68 minutes


# Export

In [None]:
# Stack the base-model scores and the voting-classifier scores into one table.
# ignore_index=True renumbers the rows; without it both frames contribute an
# index label 0, which makes label-based row access ambiguous.
export_df = pd.concat([result_df, vc_result_df], axis=0, ignore_index=True)
export_df

Unnamed: 0,Model,Accuracy,Acc STD,Acc CI,Recall,Precision,F1,ROC AUC
0,Random Forerst,0.749127,0.001709,0.001059,0.763446,0.742195,0.752668,0.84361
1,Extra Trees,0.747562,0.002131,0.001321,0.762103,0.74057,0.75118,0.843518
2,XGBoost,0.696596,0.001062,0.000658,0.729019,0.684629,0.706122,0.772491
3,CatBoost,0.702851,0.001405,0.000871,0.734927,0.690623,0.712083,0.77915
4,Logistic Regression,0.665975,0.001532,0.000949,0.673545,0.663503,0.668481,0.720726
5,Gaussian NB,0.620009,0.001737,0.001077,0.747663,0.595614,0.663016,0.66556
6,Multinomial NB,0.613305,0.001686,0.001045,0.541817,0.632206,0.583529,0.66903
7,Decision Tree,0.693388,0.001602,0.000993,0.689794,0.69479,0.692281,0.694803
0,Voting CLF,0.758712,0.001496,0.000927,0.783481,0.746503,0.764544,0.845445


In [None]:
today = date.today()

# Only full runs are exported; the file name carries the run date, the number
# of folds and the notebook number.
if not FASTRUN:
    report_dir = ROOT_PATH + PATH_DATA_REP
    # Create the folder if needed so a missing directory cannot fail the
    # export at the very end of a long run
    os.makedirs(report_dir, exist_ok=True)
    export_df.to_csv(
        report_dir + f"{today}_base_models_CV{k}_scores_table_{notebook_no}.csv", index=False)
   

# Watermark

In [None]:
# Record the timestamp, Python/IPython versions and hardware details of this run
%watermark

Last updated: 2023-02-20T20:31:34.156063+01:00

Python implementation: CPython
Python version       : 3.9.0
IPython version      : 8.9.0

Compiler    : Clang 11.0.0 
OS          : Darwin
Release     : 22.3.0
Machine     : arm64
Processor   : arm
CPU cores   : 10
Architecture: 64bit



In [None]:
# Record the versions of the modules imported in this notebook
%watermark --iversions

seaborn   : 0.12.2
numpy     : 1.23.5
pandas    : 1.5.3
matplotlib: 3.6.3



In [None]:
output_file = f'{ROOT_PATH}{PATH_DATA_REP}/html/{today}_03.01_modeling_base_models_with_CV_GC_03.01.html'
!jupyter nbconvert --to html "03.01_modeling_base_models_with_CV.ipynb" --output {output_file}
#!jupyter nbconvert --to html --TemplateExporter.exclude_input=True --output exported_notebook.html notebook.ipynb

[NbConvertApp] Converting notebook 03.01_modeling_base_models_with_CV_GC.ipynb to html
[NbConvertApp] Writing 672161 bytes to ../reports/html/2023-02-20_03.01_modeling_base_models_with_CV_GC_03.01.html
