v1.0

The objective of this notebook is to:
- systematically remove individual features or components from the base models to understand their impact on the model's overall performance
  

# Pre-checks

In [66]:
## Check for Google Drive Connectivity
# google.colab is expected to be importable only inside Colab; any failure
# here (missing package, failed mount) means we fall back to local files.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    google_env = True
except Exception:
    # Catch Exception instead of using a bare except so that
    # KeyboardInterrupt / SystemExit still propagate.
    print("Not a Google Drive Environment.")
    google_env = False

Not a Goolge Drive Environment.


In [67]:
import time
# Wall-clock start timestamp; handed to calculate_duration() at the end of
# the notebook to report the total runtime.
t_start = time.time()

In [68]:
# Fast-run switch. In the code shown, FASTRUN gates the push notifications
# and the CSV export; set it to True for a quick trial run.
FASTRUN = False

# Fraction of rows per label kept when sampling for a fast run
sample_fraction = 0.001

# Echo the active configuration between two separator lines
print("===" * 40)
print("Fastrun enabled:", FASTRUN)
if FASTRUN:
    print("Sample size:", sample_fraction)
print("===" * 40)


Fastrun enabled: False


# Setup Environment

## Install Modules

In [69]:
#!pip install watermark
#!pip install icecream

## Import Modules

In [70]:
# Base libraries
import os
from datetime import date

# Scientific libraries
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
# Default matplotlib figure size (width, height)
plt.rcParams["figure.figsize"] = (12,8)

# Apply the same figure size through seaborn, then shrink seaborn's font scale
sns.set(rc={'figure.figsize':(12,8)})
sns.set(font_scale=0.8)

# Helper libraries
from watermark import watermark
from icecream import ic
import gc # garbage collection to optimize memory usage: use "gc.collect()"
import warnings
# NOTE(review): this silences every warning for the whole kernel session,
# including deprecation and convergence warnings from the ML libraries.
warnings.filterwarnings('ignore')

# Pandas options
# None removes the respective display limit (rows, columns, width, cell text)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

from IPython.core.interactiveshell import InteractiveShell
# 'all': display the value of every expression statement in a cell, not only
# the last one (a trailing ';' is used throughout to suppress unwanted output)
InteractiveShell.ast_node_interactivity = 'all'

# Load magic commands
%load_ext watermark

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


## Global Parameters

In [71]:
# Seed shared by every stochastic step in the notebook
seed = 42
# Leave two cores free to keep the machine responsive while fitting models.
# os.cpu_count() may return None, and subtracting 2 on a small machine would
# give 0 or a negative worker count, so clamp to at least one worker.
cpu_count = max(1, (os.cpu_count() or 1) - 2)
# Notebook identifier used in exported file names
notebook_no = "04.04"
# Run date used as a prefix for exported file names
today = date.today()

## Global Helper Functions

In [72]:
import http.client
import os
import urllib.parse


def send_push(message):
    """Send a push notification through the Pushover HTTP API (best effort).

    Credentials are read from the environment variables ``PUSHOVER_TOKEN``
    (application token) and ``PUSHOVER_USER`` (user key) instead of being
    hardcoded in the notebook. If either is missing, the notification is
    skipped with a printed hint.

    NOTE(review): the token and user key that used to be hardcoded here were
    stored in the notebook in plain text and should be rotated.

    Parameters
    ----------
    message : str
        Text of the notification.

    Returns
    -------
    None
        Communication failures are printed, never raised, so that a failed
        notification cannot interrupt a long-running experiment.
    """
    token = os.environ.get("PUSHOVER_TOKEN")
    user = os.environ.get("PUSHOVER_USER")
    if not token or not user:
        print("Pushover credentials not configured (PUSHOVER_TOKEN / PUSHOVER_USER); skipping push.")
        return

    conn = None
    try:
        # The timeout keeps a stalled connection from blocking the notebook
        conn = http.client.HTTPSConnection("api.pushover.net:443", timeout=10)
        conn.request("POST", "/1/messages.json",
                     urllib.parse.urlencode({
                         "token": token,
                         "user": user,
                         "message": message,
                     }), {"Content-type": "application/x-www-form-urlencoded"})

        conn.getresponse()

    except Exception:
        print("There was a communication issue (pushover).")

    finally:
        # Always release the socket, including on failure
        if conn is not None:
            conn.close()


In [73]:
def fast_run_sampling(df, sample_fraction, random_state=None):
    """Return a label-stratified random fraction of the dataset.

    The frame is grouped by its ``'label'`` column and the same fraction is
    drawn from every group. The label counts are printed before and after
    sampling. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to sample from; must contain a ``'label'`` column.
    sample_fraction : float
        Fraction of rows to keep within each label group.
    random_state : int, optional
        Seed for the sampling. Defaults to the notebook-wide ``seed``.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, with their original index values.
    """
    if random_state is None:
        # Fall back to the notebook-level seed from the Global Parameters cell
        random_state = seed

    print("Labels before sampling:\n", df['label'].value_counts())
    sampled = df.groupby('label').sample(frac=sample_fraction, random_state=random_state)
    print("Labels after sampling:\n", sampled['label'].value_counts())
    return sampled
    

In [74]:
def calculate_duration(t_start, t_end):
    """Print and return the elapsed time between two timestamps in minutes.

    Both arguments are timestamps in seconds; the difference is converted
    to minutes and rounded to two decimals.
    """
    elapsed_minutes = round((t_end - t_start) / 60, 2)
    print(f"{elapsed_minutes!s} minutes")
    return elapsed_minutes


# 3.0 Load Data

In [75]:
# Load Data

# Project-relative folder layout; each entry is appended to ROOT_PATH
PATH_DATA = "data/"
PATH_DATA_RAW = "data/raw/"
PATH_DATA_INT = "data/interim/"
PATH_DATA_PRO = "data/processed/"
PATH_DATA_MOD = "models/"
PATH_DATA_REP = "reports/"
PATH_DATA_FIG = "reports/figures/"
PATH_DATA_HTML = "reports/html/"

# Resolve the project root depending on where the notebook runs
if not google_env:
    ROOT_PATH = "../"
    print("Not a Google Drive Environment. Loading local files.")
else:
    # Location for "shared with" people:
    # create a shortcut of the shared folder in your Google Drive root folder
    ROOT_PATH = "/content/drive/MyDrive/SIADS696/Environment/"

# One parquet file per feature group, all read from the interim data folder
df_wiki_train_stats = pd.read_parquet(f"{ROOT_PATH}{PATH_DATA_INT}train_features_clean_stats.parquet.gzip")
df_wiki_train_rs = pd.read_parquet(f"{ROOT_PATH}{PATH_DATA_INT}train_features_rs.parquet.gzip")
df_wiki_train_nltk = pd.read_parquet(f"{ROOT_PATH}{PATH_DATA_INT}train_features_nltk.parquet.gzip")
df_wiki_train_aoa = pd.read_parquet(f"{ROOT_PATH}{PATH_DATA_INT}train_features_aoa.parquet.gzip")
df_wiki_train_crb = pd.read_parquet(f"{ROOT_PATH}{PATH_DATA_INT}train_features_crb.parquet.gzip")
df_wiki_train_w2v = pd.read_parquet(f"{ROOT_PATH}{PATH_DATA_INT}train_features_w2v.parquet.gzip")


Not a Google Drive Environment. Loading local files.


# Feature Ablation

In [76]:
# Model selection
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

# Scaling and pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# Linear models
from sklearn.linear_model import LogisticRegression

# Linear/Stochastic Gradient Descent
from sklearn.linear_model import SGDClassifier 

# Tree models
from sklearn.tree import DecisionTreeClassifier

# NB models
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

# Neighbors models
from sklearn.neighbors import KNeighborsClassifier

# Ensemble models
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


In [77]:
# Estimators to evaluate, keyed by the display name that calculate_acc prints
# and includes in its push message. Only the random forest is active; the
# commented-out entries can be re-enabled by uncommenting them.
# NOTE(review): the active key is spelled 'Random Forerst'. It is a runtime
# string that shows up in the printed output, so it is left unchanged here.
model_pipeline = {
    'Random Forerst': RandomForestClassifier(random_state=seed, n_jobs=cpu_count),
    # 'Extra Trees': ExtraTreesClassifier(),
    # 'XGBoost': XGBClassifier(),
    # 'CatBoost': CatBoostClassifier(logging_level='Silent', random_seed=seed),
    # 'Logistic Regression': LogisticRegression(random_state=seed, n_jobs=cpu_count, max_iter=1000),
    # 'Gaussian NB': GaussianNB(),
    # 'Multinomial NB': MultinomialNB(),
    # 'Decision Tree': DecisionTreeClassifier(),
    # 'KNeighbors': KNeighborsClassifier(),
    # 'SGDC': SGDClassifier(),
}


In [78]:
def calculate_acc(model_pipeline, X, y):
    """Cross-validate every model in ``model_pipeline`` and report mean accuracy.

    Each estimator is wrapped in a ``Pipeline`` together with the
    notebook-level ``scaler`` and scored with ``cross_validate`` using the
    notebook-level ``skfold`` splitter. The mean test accuracy is printed
    and, unless ``FASTRUN`` is enabled, also sent as a push notification.

    Parameters
    ----------
    model_pipeline : dict
        Maps a display name to an unfitted estimator.
    X : features passed to ``cross_validate``.
    y : labels passed to ``cross_validate``.

    Returns
    -------
    float or dict
        The mean test accuracy when ``model_pipeline`` holds exactly one
        model (the case used throughout this notebook); otherwise a dict
        mapping each model name to its mean test accuracy.

    Raises
    ------
    Any exception raised while fitting or scoring (``error_score='raise'``).
    """
    gc.collect()
    scoring_list = ['accuracy']
    mean_acc_by_model = {}

    for name, model in model_pipeline.items():
        gc.collect()
        print("Cross-validating:", name)

        pipeline = Pipeline([('scaling', scaler), ('estimator', model)])
        # The fitted estimators are never used here, so they are not requested
        # (return_estimator); this avoids keeping one fitted model per fold
        # in memory.
        cv_results = cross_validate(pipeline, X, y, cv=skfold, scoring=scoring_list,
                                    error_score='raise', n_jobs=-1)

        mean_acc = np.mean(cv_results['test_accuracy'])
        mean_acc_by_model[name] = mean_acc
        print(mean_acc)

        if not FASTRUN:
            # Build one string; the previous tuple was sent as its repr
            send_push(f"Cross-validation accuracy score base model {name}: {mean_acc}")

    # Return only after all models have been evaluated. Returning inside the
    # loop silently skipped every model after the first.
    if len(mean_acc_by_model) == 1:
        return mean_acc
    return mean_acc_by_model

In [79]:
# Cross-validation setup read as globals by calculate_acc
k = 10
scaler = MinMaxScaler(feature_range=(0, 1))
skfold = StratifiedKFold(n_splits=k)
# Stand-alone random forest instance
rf = RandomForestClassifier(n_jobs=-1, random_state=seed)


In [80]:
# Parallel accumulators, one entry per run:
# mean accuracy, number of features, and the feature-group names used
acc_list, features_list, dataframes_list = [], [], []


In [81]:
# Feature-group frames in evaluation order
dataset_df_list = [
    df_wiki_train_stats, df_wiki_train_rs, df_wiki_train_nltk,
    df_wiki_train_aoa, df_wiki_train_crb, df_wiki_train_w2v,
]

# Variable names of the frames above, in the same order, used for reporting
dataset_list = [
    "df_wiki_train_stats", "df_wiki_train_rs", "df_wiki_train_nltk",
    "df_wiki_train_aoa", "df_wiki_train_crb", "df_wiki_train_w2v",
]

In [82]:
# Join all feature groups side by side (axis=1) to form the full training frame.
# NOTE(review): presumably all frames share the same row index — confirm.
df_train = pd.concat(dataset_df_list, axis=1)
df_train.shape

(416768, 155)

In [83]:
# Feature columns = every column except the label, the raw/cleaned text and
# the language code. dict.fromkeys removes duplicate names while keeping the
# original column order. The previous set difference produced an order that
# depends on string hash randomization, so the feature order (and with it the
# seeded model results) could differ between kernel restarts.
X_columns_train = [
    col for col in dict.fromkeys(df_train.columns)
    if col not in {"label", "original_text", "cleaned_text", "stats_language_code"}
]

X = df_train[X_columns_train].copy()
y = df_wiki_train_stats["label"]

print(X.shape);
print(y.shape);

(416768, 151)
(416768,)


In [84]:
# Baseline run: cross-validated accuracy with all feature groups included
result = calculate_acc(model_pipeline, X, y)

Cross-validating: Random Forerst
0.7488890715367489


In [85]:
# Record the baseline as the first entry of the result accumulators.
# NOTE: these appends are not idempotent; re-running this cell adds a
# duplicate baseline entry.
dataframes_list.append(dataset_list)
features_list.append(len(X_columns_train))
acc_list.append(result)


In [86]:
# For every position, build the list of dataset names with that one name
# left out. These name lists are used later to label which dataframes went
# into each run; names and frames are kept in two separate lists rather
# than in a dictionary.

concatenated_df = pd.DataFrame()
list_of_dataset_list = [
    dataset_list[:skip] + dataset_list[skip + 1:]
    for skip in range(len(dataset_list))
]
ic(list_of_dataset_list);

ic| list_of_dataset_list: [['df_wiki_train_rs',
                            'df_wiki_train_nltk',
                            'df_wiki_train_aoa',
                            'df_wiki_train_crb',
                            'df_wiki_train_w2v'],
                           ['df_wiki_train_stats',
                            'df_wiki_train_nltk',
                            'df_wiki_train_aoa',
                            'df_wiki_train_crb',
                            'df_wiki_train_w2v'],
                           ['df_wiki_train_stats',
                            'df_wiki_train_rs',
                            'df_wiki_train_aoa',
                            'df_wiki_train_crb',
                            'df_wiki_train_w2v'],
                           ['df_wiki_train_stats',
                            'df_wiki_train_rs',
                            'df_wiki_train_nltk',
                            'df_wiki_train_crb',
                            'df_wiki_train_w2v'],
          

In [87]:
# Leave-one-group-out ablation: drop one feature-group dataframe per
# iteration, rebuild X from the remaining groups and cross-validate again.
for i in range(len(dataset_df_list)):
    # Exclude the i-th dataset
    excluded_dataset_list = dataset_df_list[:i] + dataset_df_list[i+1:]
    dataframes_list.append(list_of_dataset_list[i])
    print(list_of_dataset_list[i])

    # Concatenate the remaining datasets along axis 1 (i.e., horizontally)
    df_train = pd.concat(excluded_dataset_list, axis=1)

    # Keep the original column order (duplicates removed) so the feature
    # order is identical on every run. A set difference yields an order that
    # depends on string hash randomization and can change between kernel
    # restarts.
    X_columns_train = [
        col for col in dict.fromkeys(df_train.columns)
        if col not in {"label", "original_text", "cleaned_text", "stats_language_code"}
    ]

    X = df_train[X_columns_train].copy()
    # Labels always come from the stats frame, even when it is the excluded one
    y = df_wiki_train_stats["label"]

    print(len(X_columns_train))
    print(X.shape);
    print(y.shape);

    features_list.append(len(X_columns_train))
    acc_list.append(calculate_acc(model_pipeline, X, y))

    print()


['df_wiki_train_rs', 'df_wiki_train_nltk', 'df_wiki_train_aoa', 'df_wiki_train_crb', 'df_wiki_train_w2v']
118
(416768, 118)
(416768,)
Cross-validating: Random Forerst
0.7457218395613874

['df_wiki_train_stats', 'df_wiki_train_nltk', 'df_wiki_train_aoa', 'df_wiki_train_crb', 'df_wiki_train_w2v']
120
(416768, 120)
(416768,)
Cross-validating: Random Forerst
0.7516675985864287

['df_wiki_train_stats', 'df_wiki_train_rs', 'df_wiki_train_aoa', 'df_wiki_train_crb', 'df_wiki_train_w2v']
115
(416768, 115)
(416768,)
Cross-validating: Random Forerst
0.7460049713026768

['df_wiki_train_stats', 'df_wiki_train_rs', 'df_wiki_train_nltk', 'df_wiki_train_crb', 'df_wiki_train_w2v']
111
(416768, 111)
(416768,)
Cross-validating: Random Forerst
0.7411821454803826

['df_wiki_train_stats', 'df_wiki_train_rs', 'df_wiki_train_nltk', 'df_wiki_train_aoa', 'df_wiki_train_w2v']
143
(416768, 143)
(416768,)
Cross-validating: Random Forerst
0.7489610488490591

['df_wiki_train_stats', 'df_wiki_train_rs', 'df_wiki_trai

In [107]:
# Summary table: one row per run with the feature groups used, the number
# of feature columns and the mean cross-validated accuracy
df_results = pd.DataFrame(
    dict(
        feature_groups=dataframes_list,
        features=features_list,
        accuracy=acc_list,
    )
)

df_results

Unnamed: 0,feature_groups,features,accuracy
0,"[df_wiki_train_stats, df_wiki_train_rs, df_wiki_train_nltk, df_wiki_train_aoa, df_wiki_train_crb, df_wiki_train_w2v]",151,0.748889
1,"[df_wiki_train_rs, df_wiki_train_nltk, df_wiki_train_aoa, df_wiki_train_crb, df_wiki_train_w2v]",118,0.745722
2,"[df_wiki_train_stats, df_wiki_train_nltk, df_wiki_train_aoa, df_wiki_train_crb, df_wiki_train_w2v]",120,0.751668
3,"[df_wiki_train_stats, df_wiki_train_rs, df_wiki_train_aoa, df_wiki_train_crb, df_wiki_train_w2v]",115,0.746005
4,"[df_wiki_train_stats, df_wiki_train_rs, df_wiki_train_nltk, df_wiki_train_crb, df_wiki_train_w2v]",111,0.741182
5,"[df_wiki_train_stats, df_wiki_train_rs, df_wiki_train_nltk, df_wiki_train_aoa, df_wiki_train_w2v]",143,0.748961
6,"[df_wiki_train_stats, df_wiki_train_rs, df_wiki_train_nltk, df_wiki_train_aoa, df_wiki_train_crb]",148,0.746991


In [108]:
# Stop the wall-clock timer started at the top of the notebook and print the
# total runtime in minutes (the trailing ';' suppresses the returned value)
t_end = time.time()
calculate_duration(t_start, t_end);


36.21 minutes


# Export

In [109]:
# Persist the ablation table for full runs only; the file name carries the
# run date and the notebook number
if not FASTRUN:
    df_results.to_csv(f'{ROOT_PATH}{PATH_DATA_REP}{today}_feature_ablation-table_{notebook_no}.csv')


# Watermark

In [110]:
%watermark

Last updated: 2023-02-21T00:57:08.446144+01:00

Python implementation: CPython
Python version       : 3.9.0
IPython version      : 8.9.0

Compiler    : Clang 11.0.0 
OS          : Darwin
Release     : 22.3.0
Machine     : arm64
Processor   : arm
CPU cores   : 10
Architecture: 64bit



In [111]:
%watermark --iversions

seaborn   : 0.12.2
pandas    : 1.5.3
matplotlib: 3.6.3
numpy     : 1.23.5



-----


In [112]:
output_file = f'{ROOT_PATH}{PATH_DATA_HTML}{today}_04.03_feature_selection_feature_ablation.html'
!jupyter nbconvert --to html "04.03_feature_selection_feature_ablation.ipynb" --output {output_file}

[NbConvertApp] Converting notebook 04.04_feature_ablation_GC.ipynb to html
[NbConvertApp] Writing 632773 bytes to ../reports/html/2023-02-21_04.04_feature_ablation_GC.html
