v1.1

The objectives of this notebook are to:
  - calculate the accuracy score using the features selected in the **RFECV** step
  - predict y with **cross_val_predict** and add the predictions to a copy of the relevant `df_train` columns (`df_failure_analysis`)
  - export that dataframe; the outcome is primarily used for the **failure analysis**

# Pre-Checks

In [None]:
# Record the wall-clock start time; the runtime cell near the end of the
# notebook passes it to calculate_duration() together with the end time.
import time
t_start = time.time()


In [None]:
## Check for Google Drive Connectivity
# Try to mount Google Drive. Any ordinary failure (typically: google.colab is
# not installed) is treated as "running locally" and recorded in `google_env`,
# which the data-loading cell uses to choose the project root.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    google_env = True
except Exception:  # not a bare `except:` so Ctrl-C / SystemExit still propagate
    print("Not a Goolge Drive Environment.")
    google_env = False


Not a Goolge Drive Environment.


In [None]:
# Fast-run switch: set FASTRUN to True to work on a small sample of the
# training data instead of the full set.
FASTRUN = False

# Fraction of rows kept (per label) when the fast run is enabled
sample_fraction = 0.01

# Echo the active settings between two separator lines
separator = "==="*40
print(separator)
print("Fastrun enabled:", FASTRUN)
if FASTRUN:
    print("Sample size:", sample_fraction)
print(separator)


Fastrun enabled: False


# Setup Environment

## Install Modules

In [None]:
# Required to document module versions
# (optional installs, left commented out; uncomment the ones that are missing
# from the current environment)
#!pip install catboost
#!pip install xgboost
#!pip install icecream
#!pip install watermark
#!pip install fastparquet

## Import Modules

In [None]:
# Base libraries
import os
from datetime import date

# Scientific libraries
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(rc={'figure.figsize':(15,8)})  # default figure size for every plot
sns.set(font_scale=0.8)

# Helper libraries
from watermark import watermark
from icecream import ic
import gc # garbage collection to optimize memory usage: use "gc.collect()"
import warnings
warnings.filterwarnings('ignore')  # NOTE: silences ALL warnings, including deprecation warnings

# Pandas options
# None removes the respective display limit, so frames are shown in full
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one (presumably why statements such as ic(...) end with ";" in later cells)
InteractiveShell.ast_node_interactivity = 'all'

# Load magic commands
%load_ext watermark


## Global Parameters

In [None]:
# Seed shared by the stochastic steps of the notebook (sampling, split, model)
seed = 42

# Worker budget: leave two cores free to keep the machine responsive while
# models are fitted (use -1 instead to take every core).
# os.cpu_count() may return None, and on machines with two cores or fewer the
# subtraction would give 0 or a negative number, so clamp to at least 1.
cpu_count = max(1, (os.cpu_count() or 1) - 2)


## Global Helper Functions

In [None]:
import http.client
import urllib


def send_push(message):
    """Send a push notification through the Pushover service (best effort).

    Parameters
    ----------
    message : str
        Text of the notification.

    Returns
    -------
    None
        Failures are reported with a printed message instead of an exception,
        so a notification problem never interrupts the notebook.
    """
    import os
    # Explicit import: `import urllib` alone does not guarantee that the
    # `urllib.parse` submodule is loaded.
    from urllib.parse import urlencode

    conn = None
    try:
        # NOTE(review): credentials should not live in the notebook. They can
        # be overridden through the environment; the literals remain only as a
        # backward-compatible fallback and should be rotated and removed.
        payload = urlencode({
            "token": os.environ.get("PUSHOVER_TOKEN", "ahs1q4mwpnxe3645zeaqzas69whq7a"),  # ML Notifications Channel
            "user": os.environ.get("PUSHOVER_USER", "u5vr1qkc9ghudg2ehuug153okeiz1d"),
            "message": message,
        })

        # Timeout so an unreachable service cannot block the notebook forever
        conn = http.client.HTTPSConnection("api.pushover.net:443", timeout=10)
        conn.request("POST", "/1/messages.json", payload,
                     {"Content-type": "application/x-www-form-urlencoded"})

        conn.getresponse()

    except Exception:  # best effort, but let Ctrl-C / SystemExit through
        print("There was a communication issue (pushover).")

    finally:
        # Always release the socket, also when the request failed
        if conn is not None:
            conn.close()


In [None]:
def fast_run_sampling(df, sample_fraction, random_state=None):
    """Return a fraction of the dataset, sampled separately per label.

    The same fraction of rows is drawn from every ``label`` group, so the
    class balance is approximately preserved. The label counts before and
    after sampling are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``label`` column.
    sample_fraction : float
        Fraction of rows to keep from each label group.
    random_state : int, optional
        Seed for the sampling. When omitted, the notebook-wide ``seed`` is
        used, which is the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled rows (the index is not reset).
    """
    if random_state is None:
        random_state = seed  # notebook-level default from "Global Parameters"

    print("Labels before sampling:\n", df['label'].value_counts())
    sampled = df.groupby('label').sample(frac=sample_fraction, random_state=random_state)
    print("Labels after sampling:\n", sampled['label'].value_counts())
    return sampled


In [None]:
def calculate_duration(t_start, t_end):
    """Print and return the elapsed time between two timestamps in minutes.

    Both arguments are timestamps in seconds (e.g. from ``time.time()``). The
    difference is converted to minutes and rounded to two decimals.
    """
    elapsed_seconds = t_end - t_start
    total_runtime_min = round(elapsed_seconds / 60, 2)
    print(f"{total_runtime_min} minutes")
    return total_runtime_min


# Load Data

In [None]:
# Load Data
# Choose the project root depending on where the notebook runs.
if not google_env:
    ROOT_PATH = "../"
    print("Not a Google Drive Environment. Loading local files.")
else:
    # Location for "shared with" people
    # create a shortcut of the shared folder in your Google Drive root folder
    ROOT_PATH = "/content/drive/MyDrive/SIADS696/Environment/"

# Project sub-folders, relative to ROOT_PATH
PATH_DATA = "data/"
PATH_DATA_RAW = "data/raw/"
PATH_DATA_INT = "data/interim/"
PATH_DATA_PRO = "data/processed/"
PATH_DATA_MOD = "models/"
PATH_DATA_REP = "reports/"

# Parquet file name -> name of the notebook-level variable that receives the frame
files = {
    # basic statistics
    "train_features_clean_stats.parquet.gzip": "df_wiki_train_stats",
    "test_features_clean_stats.parquet.gzip": "df_wiki_test_stats",
    # rs
    "train_features_rs.parquet.gzip": "df_wiki_train_rs",
    "test_features_rs.parquet.gzip": "df_wiki_test_rs",
    # nltk
    "train_features_nltk.parquet.gzip": "df_wiki_train_nltk",
    "test_features_nltk.parquet.gzip": "df_wiki_test_nltk",
    # aoa
    "train_features_aoa.parquet.gzip": "df_wiki_train_aoa",
    "test_features_aoa.parquet.gzip": "df_wiki_test_aoa",
    # crb
    "train_features_crb.parquet.gzip": "df_wiki_train_crb",
    "test_features_crb.parquet.gzip": "df_wiki_test_crb",
    # w2v
    "train_features_w2v.parquet.gzip": "df_wiki_train_w2v",
    "test_features_w2v.parquet.gzip": "df_wiki_test_w2v",
    # Deliberately excluded (data leakage risk):
    # "train_mNB_predict.parquet.gzip": "df_wiki_train_MNB",
    # "test_mNB_predict.parquet.gzip": "df_wiki_test_MNB",
}

# Read every file from the interim folder. A frame whose variable already
# exists in the notebook namespace is not loaded again.
interim_dir = ROOT_PATH + PATH_DATA_INT
for file, df_name in files.items():
    if df_name in globals():
        continue
    globals()[df_name] = pd.read_parquet(interim_dir + file)


Not a Google Drive Environment. Loading local files.


In [None]:
# Combine the training dataframes into a single dataframe
# (column-wise: each feature set contributes its own columns)
train_parts = [
    pd.DataFrame(),  # empty starting frame, as in the original concat call
    df_wiki_train_stats,
    df_wiki_train_rs,
    df_wiki_train_nltk,
    df_wiki_train_aoa,
    df_wiki_train_crb,
    df_wiki_train_w2v,
]
df_train = pd.concat(train_parts, axis=1)

# Optionally shrink the data for a quick end-to-end run
if FASTRUN:
    df_train = fast_run_sampling(df_train, sample_fraction)

ic(df_train.shape);


ic| df_train.shape: (416768, 155)


In [None]:
# Read the table of RFECV-selected features and keep their names in
# alphabetical order.
selected_features_csv = ROOT_PATH+PATH_DATA_INT+"RFECV10-RF-selected_features-table_04.02.csv"
df_rfecv_selected_feature = pd.read_csv(selected_features_csv)
rfecv_selected_feature_list = df_rfecv_selected_feature['Selected Features'].to_list()
rfecv_selected_feature_list.sort()
ic(len(rfecv_selected_feature_list));


ic| len(rfecv_selected_feature_list): 125


In [None]:
# Show the selected feature names (the list was sorted when it was built)
print(rfecv_selected_feature_list)


['aoa_aoa_bird_lem_max', 'aoa_aoa_bird_lem_mean', 'aoa_aoa_bird_lem_min', 'aoa_aoa_bird_lem_sum', 'aoa_aoa_bristol_lem_max', 'aoa_aoa_bristol_lem_mean', 'aoa_aoa_bristol_lem_min', 'aoa_aoa_bristol_lem_sum', 'aoa_aoa_cort_lem_max', 'aoa_aoa_cort_lem_mean', 'aoa_aoa_cort_lem_min', 'aoa_aoa_cort_lem_sum', 'aoa_aoa_kup_lem_max', 'aoa_aoa_kup_lem_mean', 'aoa_aoa_kup_lem_min', 'aoa_aoa_kup_lem_sum', 'aoa_aoa_kup_max', 'aoa_aoa_kup_mean', 'aoa_aoa_kup_min', 'aoa_aoa_kup_sum', 'aoa_aoa_schock_max', 'aoa_aoa_schock_mean', 'aoa_aoa_schock_min', 'aoa_aoa_schock_sum', 'aoa_dom_pos_subtlex_max', 'aoa_dom_pos_subtlex_mean', 'aoa_dom_pos_subtlex_min', 'aoa_dom_pos_subtlex_sum', 'aoa_freq_pm_max', 'aoa_freq_pm_mean', 'aoa_freq_pm_min', 'aoa_freq_pm_sum', 'aoa_perc_known_lem_mean', 'aoa_perc_known_lem_min', 'aoa_perc_known_lem_sum', 'aoa_perc_known_mean', 'aoa_perc_known_min', 'aoa_perc_known_sum', 'crb_concm_mean', 'crb_concm_sum', 'crb_concsd_mean', 'crb_concsd_sum', 'crb_perc_known_sum', 'crb_subtle

In [None]:
# Frame for the failure analysis: the selected features plus the label and the
# two text columns. It is a copy, so y_pred can be added later without
# touching df_train.
df_failure_analysis = df_train[rfecv_selected_feature_list].copy()
for extra_col in ('label', 'original_text', 'cleaned_text'):
    df_failure_analysis[extra_col] = df_train[extra_col].copy()
ic(df_failure_analysis.shape);

ic| df_failure_analysis.shape: (416768, 128)


# Predict

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler


In [None]:
# Columns that are not model inputs: the target, both text columns and the
# language code.
non_feature_columns = {"label", "original_text", "cleaned_text", "stats_language_code"}

# Keep the dataframe's own column order. Building the list from a set
# difference gives an order that can change between kernel sessions (string
# hashing is randomised per process), and a seeded random forest is only
# reproducible when the feature order is stable.
X_columns_train = [col for col in df_train.columns if col not in non_feature_columns]

X = df_train[X_columns_train].copy()                         # all features (reference)
X_selected = df_train[rfecv_selected_feature_list].copy()   # RFECV-selected features
y = df_train["label"]

ic(X.shape);
ic(X_selected.shape);
ic(y.shape);


ic| X.shape: (416768, 151)
ic| X_selected.shape: (416768, 125)
ic| y.shape: (416768,)


In [None]:
# Cross-validation scheme and base model shared by the cells below.
# StratifiedKFold is created without shuffling, so the folds are deterministic.
k=10
skfold = StratifiedKFold(n_splits=k)
# Seeded random forest; n_jobs=-1 uses all available cores.
# NOTE(review): the `cpu_count` value computed under "Global Parameters" is not
# used here - confirm whether n_jobs=cpu_count was intended.
rf = RandomForestClassifier(random_state=seed, n_jobs=-1)


In [None]:
# Use/test cross_val_score with all features as a reference score
gc.collect()

# Scale inside the cross-validation: the pipeline is cloned for every fold, so
# the MinMaxScaler is fitted on the training part of that fold only. Fitting
# the scaler on the full X beforehand would let the held-out rows influence
# the preprocessing (data leakage).
model_all_features = make_pipeline(MinMaxScaler((0, 1)), rf)

scores = cross_val_score(model_all_features, X, y, cv=skfold)
mean_acc = scores.mean()

ic(mean_acc);


ic| mean_acc: 0.7490882214565235


In [None]:
# Use/test train_test_split with selected features
gc.collect()

# Stratified hold-out split: 70% training, 30% test
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, stratify=y, test_size=0.3, random_state=seed)

# The scaler learns min/max from the training part only and is then applied
# to both parts.
scaler = MinMaxScaler((0, 1))
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit on the training part, evaluate on the held-out part
_ = rf.fit(X_train_scaled, y_train)
y_pred = rf.predict(X_test_scaled)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

ic(acc);


ic| acc: 0.7112076205101135


In [None]:
# Use/test cross_val_score with selected features
gc.collect()

# Scale inside the cross-validation: the pipeline is cloned for every fold, so
# the MinMaxScaler is fitted on the training part of that fold only. Fitting
# the scaler on the full data beforehand would let the held-out rows influence
# the preprocessing (data leakage).
model_selected_features = make_pipeline(MinMaxScaler((0, 1)), rf)

scores = cross_val_score(model_selected_features, X_selected, y, cv=skfold)
mean_acc = scores.mean()

ic(mean_acc);


ic| mean_acc: 0.7487379075857312


In [None]:
# Use/test cross_val_predict with selected features
gc.collect()

# Scale inside the cross-validation: the pipeline is cloned for every fold, so
# the MinMaxScaler is fitted on the training part of that fold only. Each row
# is therefore predicted by a model (scaler + forest) that never saw it.
model_selected_features = make_pipeline(MinMaxScaler((0, 1)), rf)

# One out-of-fold prediction per row, returned in the row order of X_selected
y_pred = cross_val_predict(model_selected_features, X_selected, y, cv=skfold)
acc = accuracy_score(y, y_pred)

ic(acc);


ic| acc: 0.7487355075245701


In [None]:
# Attach the predictions to the failure-analysis frame.
# NOTE: `y_pred` is also assigned by the train_test_split cell (hold-out
# predictions only), so the cross_val_predict cell must be the last one run
# before this cell.
df_failure_analysis['y_pred'] = y_pred          # Use y_pred from cross_val_predict
df_failure_analysis['y_pred'].value_counts()    # Checking if not only one label was predicted


1    214229
0    202539
Name: y_pred, dtype: int64

In [None]:
# Report the runtime in minutes, measured from t_start (set in the first code cell)
t_end = time.time()
calculate_duration(t_start, t_end);


14.6 minutes


# Export

In [None]:
# Run date, used in the names of the exported CSV and HTML report
today = date.today()


In [None]:
# Export only for full runs; results from a fast-run sample are not written.
if not FASTRUN:
    # Dated CSV copy in the reports folder
    csv_path = ROOT_PATH+PATH_DATA_REP+f"{today}_RF_base_model_RFECV_features_yresults_for_failure_analysis.csv"
    df_failure_analysis.to_csv(csv_path, index=False)

    # Parquet file in the interim data folder (fixed name, no date prefix)
    parquet_path = ROOT_PATH+PATH_DATA_INT+f"RF_base_model_RFECV_features_results_for_failure_analysis.parquet.gzip"
    df_failure_analysis.to_parquet(parquet_path, compression='gzip')


# Watermark

In [None]:
%watermark

Last updated: 2023-02-24T23:25:31.504991+01:00

Python implementation: CPython
Python version       : 3.9.0
IPython version      : 8.9.0

Compiler    : Clang 11.0.0 
OS          : Darwin
Release     : 22.3.0
Machine     : arm64
Processor   : arm
CPU cores   : 10
Architecture: 64bit



In [None]:
%watermark --iversions

seaborn   : 0.12.2
matplotlib: 3.6.3
numpy     : 1.23.5
pandas    : 1.5.3



***

In [None]:
# Export this notebook as a dated HTML report into the reports/html folder.
# NOTE(review): the notebook file name is hard-coded in the command below, and
# the recorded output shows a different name (05.01_...) - confirm it matches
# the current file name before running.
output_file = f'{ROOT_PATH}{PATH_DATA_REP}/html/{today}_06.01_failure_analysis_predicting_y_with_RFECV_features.html'
!jupyter nbconvert --to html "06.01_failure_analysis_predicting_y_with_RFECV_features.ipynb" --output {output_file}

[NbConvertApp] Converting notebook 05.01_failure_analysis_RF_with_RFECV_features_GC.ipynb to html
[NbConvertApp] Writing 640846 bytes to ../reports/html/2023-02-24_05.01_failure_analysis_RF_with_RFECV_features_GC.html
