# 599 Capstone Project

This notebook loads the preprocessed text data, vectorizes it with TF-IDF, and trains multilabel Random Forest classifiers on the result.

## Globally import libraries and set display parameters

Libraries needed mostly pertain to dataframe manipulation for data preprocessing.

In [1]:
from collections import defaultdict, Counter
import datetime as dt
import emoji
from icecream import ic
from IPython.display import display_html 
import itertools
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import random
import re
import regex as rex
import shutil
from string import punctuation
import time
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords

import spacy

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer, \
CountVectorizer, TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import make_scorer, f1_score, classification_report, \
confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.pipeline import make_pipeline, Pipeline

import textacy.preprocessing as tprep
from textacy.extract import keyword_in_context

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

Set global parameters.

In [2]:
# Reference point for the whole-notebook runtime printed in the last cell.
global_start_time = time.perf_counter()

# Seed reused by the train/test split, the forests and the search below.
random_state = 1699
# NOTE(review): this seeds only Python's `random` module; NumPy's global RNG
# is not seeded in this cell.
random.seed(random_state)

# Set pandas global options
pd.options.display.max_rows = 23
pd.options.display.precision = 4
np.set_printoptions(suppress=True,
                    precision=4)

%matplotlib inline

# Set tqdm package progress bar
tqdm.pandas(ncols=50)

# NOTE(review): these two values are not referenced anywhere else in the
# visible notebook.
dply_rng_end01 = 0
dply_rng_end02 = 2

# When truthy, the export cell further down writes the dataframe to CSV.
to_csv_flag = True

## Upload data from CSV

Establish working directories for saving dataframes as CSV files.

In [3]:
'''Dir nav citation:
https://softhints.com/python-change-directory-parent/
'''
# Working directory at the moment this cell runs.
curr_dir = os.path.abspath(os.curdir)
ic(curr_dir)
# NOTE(review): os.chdir changes process-wide state, so this cell is not
# idempotent: re-running it without restarting the kernel moves up one more
# level and shifts both `curr_dir` and `up1_dir`.
os.chdir("..")
# Parent of `curr_dir`; used below as the root for the data folders.
up1_dir = os.path.abspath(os.curdir)
ic(up1_dir)
ic()

ic| curr_dir: 'C:\\Users\\acarr\\Documents\\GitHub\\599_team_project\\deliverables'
ic| up1_dir: 'C:\\Users\\acarr\\Documents\\GitHub\\599_team_project'
ic| 652109981.py:9 in <module> at 13:51:36.386


Get current date/time to append to file name string.

In [4]:
# Build a file-name-safe timestamp from the current date/time: colons become
# dashes, the decimal point is dropped and the space becomes an underscore.
today = str(dt.datetime.today()).translate(
    str.maketrans({':': '-', '.': None, ' ': '_'}))
ic(today)
ic(type(today))
ic()

ic| today: '2023-07-27_13-51-36424666'
ic| type(today): <class 'str'>
ic| 3990198718.py:6 in <module> at 13:51:36.496


Establish full file name path.

In [5]:
# Folder names under the project root (`up1_dir`); change `data_location`
# (and its siblings) to match the folder layout on your machine.
data_large_location, data_location, ref_docs_location = (
    'data_large', 'data', 'ref_docs')

# Input files: the feature table (CSV) and the label array (.npy).
file_in_name01, file_in_name02 = ('capstone_master_tm_X01_v1.csv',
                                  'capstone_master_tm_y01_v1.npy')

file_in_path01 = os.path.join(up1_dir, data_large_location, file_in_name01)
file_in_path02 = os.path.join(up1_dir, data_location, file_in_name02)

print(f'CSV file in 1 path: {file_in_path01}')
print(f'NP array file in 2 path: {file_in_path02}')

CSV file in 1 path: C:\Users\acarr\Documents\GitHub\599_team_project\data_large\capstone_master_tm_X01_v1.csv
NP array file in 2 path: C:\Users\acarr\Documents\GitHub\599_team_project\data\capstone_master_tm_y01_v1.npy


### Review dataframe

Read in data from CSV, check resulting dataframe shape, and display first several records.

In [6]:
# Load the feature table from the CSV path built in the previous cell.
X01_df01 = pd.read_csv(file_in_path01)
print(f'Dataframe shape: {X01_df01.shape}')
# NOTE(review): the displayed header contains `Unnamed: 0`-style columns,
# which looks like a previously saved index - confirm and consider dropping.
display(X01_df01.head())

Dataframe shape: (36402, 4)


Unnamed: 0.1,Unnamed: 0,text_id,source_name,processed_text
0,0,2,USA Today,ever wanted keg titos handmade vodka dream bec...
1,1,3,USA Today,five months julian sands went missing solo hik...
2,2,5,USA Today,four star running back picks michigan state un...
3,3,6,USA Today,alabama center charles bediako signs one year ...
4,4,7,USA Today,ralph sampson breaks iconic boston houston roc...


In [7]:
# Load the label array saved alongside the feature table.
y01_arr01 = np.load(file_in_path02)
print(f'NP array shape: {y01_arr01.shape}')
# Preview the first 23 label rows.
print(y01_arr01[:23])

NP array shape: (36402, 10)
[[0 0 1 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 1]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]]


In [8]:
# Store each row's label vector as a Python list in a new column (adds the
# column to `X01_df01` in place).
# NOTE(review): assumes the array rows are in the same order as the CSV rows
# - confirm against the step that produced both files.
X01_df01['multilabel'] = y01_arr01.tolist()

In [9]:
# Re-check the dataframe now that the `multilabel` column has been added.
print(f'Dataframe shape: {X01_df01.shape}')
display(X01_df01.head())

Dataframe shape: (36402, 5)


Unnamed: 0.1,Unnamed: 0,text_id,source_name,processed_text,multilabel
0,0,2,USA Today,ever wanted keg titos handmade vodka dream bec...,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0]"
1,1,3,USA Today,five months julian sands went missing solo hik...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 1]"
2,2,5,USA Today,four star running back picks michigan state un...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,3,6,USA Today,alabama center charles bediako signs one year ...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,4,7,USA Today,ralph sampson breaks iconic boston houston roc...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [10]:
from skmultilearn.model_selection import iterative_train_test_split

# Hold out 20% of the rows. The text column is the feature input and the
# label matrix is both the target and the stratification key.
(nlm_train_x01,
 nlm_test_x01,
 nlm_train_y01,
 nlm_test_y01) = train_test_split(X01_df01['processed_text'],
                                  y01_arr01,
                                  test_size=.2,
                                  stratify=y01_arr01,
                                  random_state=random_state)

# Shapes of the train pair, then of the test pair.
print(f'{nlm_train_x01.shape}')
print(f'{nlm_train_y01.shape}')
print(f'\n{nlm_test_x01.shape}')
print(f'{nlm_test_y01.shape}')

# Peek at the training inputs/targets and the target container type.
print(nlm_train_x01)
print(nlm_train_y01)
print(type(nlm_train_y01))

(29121,)
(29121, 10)

(7281,)
(7281, 10)
15749    guess whos back former nfl coach john perry fo...
32908    forbes worlds richest people become b wealthie...
2433     experts answer readers home buying questions w...
2470     missed boat kellen moore recalls bad break qb ...
12351    london prosecutor calls oscar winning actor ke...
                               ...                        
22854    month third incarnation xfl concluded season l...
32711    rangers rival rangers islanders goalies push c...
35423    connecticut state representative assaulted att...
32846    evp digital energy global business schneider e...
19609    echo employees three years flexibility hybrid ...
Name: processed_text, Length: 29121, dtype: object
[[1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
<class 'numpy.ndarray'>


In [11]:
import numpy as np

# Sanity check on the training targets: report whether any entry is NaN.
has_nan_values = np.isnan(nlm_train_y01).any()

print("NaN values exist in nlm_train_y01."
      if has_nan_values
      else "No NaN values in nlm_train_y01.")

No NaN values in nlm_train_y01.


### TF-IDF

In [12]:
# Quick look at the text column that feeds the TF-IDF vectorizer below.
print(X01_df01['processed_text'].shape)
print(X01_df01['processed_text'].head())

(36402,)
0    ever wanted keg titos handmade vodka dream bec...
1    five months julian sands went missing solo hik...
2    four star running back picks michigan state un...
3    alabama center charles bediako signs one year ...
4    ralph sampson breaks iconic boston houston roc...
Name: processed_text, dtype: object


In [13]:
# Start from NLTK's English stop-word list; `sw` is rebuilt from scratch each
# time this cell runs, so re-running does not accumulate duplicates.
sw = stopwords.words("english")

# Extend the NLTK list with project-specific entries: contractions written
# without an apostrophe or with a typographic apostrophe, reporting verbs,
# weekday and month names, and a few web/boilerplate terms.
# NOTE(review): 'it’s' appears twice in the list below; harmless, but it
# inflates the printed length.
sw.extend(['',
           '️',
           'arent',
           'cannot',
           'cant',
           'couldnt',
           'couldve',
           'didnt',
           'doesnt',
           'dont',
           'hadnt',
           'hasnt',
           'havent',
           'hes',
           'im',
           "i'm",
           'isnt',
           'it’s',
           'ive',
           '𝚘𝚏',
           'mightnt',
           'mustnt',
           'neednt',
           'shant',
           'shes',
           'shouldnt',
           'shouldve',
           'thatll',
           'theyll',
           'theyve',
           'wasnt',
           'werent',
           'whats',
           'weve',
           'wont',
           'wouldnt',
           'wouldve',
           'yall',
           'youd',
           'youll',
           'youre',
           'youve',
           "we'll",
           "you’re",
           "you’ve",
           "you’ll",
           "you’d",
           "she’s",
           "it’s",
           "that’ll",
           "don’t",
           "should’ve",
           "aren’t",
           "couldn’t",
           "didn’t",
           "doesn’t",
           "hadn’t",
           "hasn’t",
           "haven’t",
           "isn’t",
           "mightn’t",
           "mustn’t",
           "needn’t",
           "shan’t",
           "shouldn’t",
           "wasn’t",
           "weren’t",
           "won’t",
           "wouldn’t",
           "i’m",
           "we’ll",
           'said',
           'told',
           'according',
           'reporting',
           'reported',
           'statement',
           'spoke',
           'next',
           'though',
           'often',
           'story',
           'updated',
           'additional',
           'developments',
           'follow',
           'published',
           'com',
           'sunday',
           'monday',
           'tuesday',
           'wednesday',
           'thursday',
           'friday',
           'saturday',
           'january',
           'february',
           'march',
           'april',
           'may',
           'june',
           'july',
           'august',
           'september',
           'october',
           'november',
           'december',
           'via',
           'account',
           'accounts',
           'article',
           'advertisement',
           'advertisements',
          ])

# Show the size and full contents of the combined list.
print(len(sw))
print(sw)

292
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [14]:
start_time = time.perf_counter()
# Word-level TF-IDF over unigrams and bigrams, using the custom stop-word
# list; terms in more than 70% of documents or in fewer than 5 documents are
# dropped.
nlm_tfidf = TfidfVectorizer(encoding='utf-8',
                            analyzer='word',
                            stop_words=sw,
                            token_pattern=r'(?u)\b\w\w+\b',
                            ngram_range=(1,2),
                            max_df=.7,
                            min_df=5)

# Fit the vocabulary on the training text only, then reuse it for the test
# text so both matrices share the same columns.
nlm_train_x01_mtx = nlm_tfidf.fit_transform(nlm_train_x01)
nlm_test_x01_mtx = nlm_tfidf.transform(nlm_test_x01)

display(nlm_train_x01_mtx)
display(nlm_test_x01_mtx)

end_time = time.perf_counter()
print(f'\nElapsed processing time = {(end_time - start_time) / 60} mins')

<29121x398234 sparse matrix of type '<class 'numpy.float64'>'
	with 12196755 stored elements in Compressed Sparse Row format>

<7281x398234 sparse matrix of type '<class 'numpy.float64'>'
	with 2956083 stored elements in Compressed Sparse Row format>


Elapsed processing time = 0.7723571333333333 mins


In [15]:
def display_samp_dwm(sm=None,
                     vec=None,
                     n=(1,1),
                     rs_tup=(1,1)):
    """Display a random sample of a sparse document-term matrix.

    Columns are sampled first, then rows, and the resulting block is shown
    as a dataframe whose index holds the sampled row positions and whose
    columns hold the matching feature names.

    Only the sampled rows/columns are converted to a dense array. Densifying
    the whole matrix first (as this function used to do) needs
    rows x columns floats of memory, which is not feasible for a TF-IDF
    matrix with hundreds of thousands of columns.

    Parameters
    ----------
    sm : scipy sparse matrix
        Document-term matrix; must provide `.shape` and `.tocsr()`.
    vec : fitted vectorizer
        Object providing `get_feature_names_out()` for the columns of `sm`.
    n : tuple of int
        `(number of columns, number of rows)` to sample.
    rs_tup : tuple
        `(column seed, row seed)` passed to pandas `sample` as
        `random_state`.

    Returns
    -------
    The full array of feature names from `vec.get_feature_names_out()`.

    Raises
    ------
    ValueError
        Raised by pandas when more rows/columns are requested than exist.
    """
    feature_names = vec.get_feature_names_out()
    n_rows, n_cols = sm.shape

    # pandas draws sample positions from the axis length and the seed only,
    # so sampling a plain position series picks the same columns/rows that
    # sampling the fully densified dataframe would.
    col_pos = (pd.RangeIndex(n_cols)
               .to_series()
               .sample(n=n[0], random_state=rs_tup[0])
               .to_numpy())
    row_labels = (pd.RangeIndex(n_rows)
                  .to_series()
                  .sample(n=n[1], random_state=rs_tup[1])
                  .index)

    # CSR supports row and column fancy indexing; densify the sample only.
    sample_block = sm.tocsr()[row_labels.to_numpy()][:, col_pos].toarray()

    mtx_df01b = pd.DataFrame(sample_block,
                             index=row_labels,
                             columns=np.asarray(feature_names)[col_pos])

    display(mtx_df01b)
    return feature_names

### Write file without stop words to CSV - data subset 1

Set path to write CSV file to.

In [16]:
# Output file names carry the timestamp string built earlier, so each run of
# the notebook writes a new pair of files.
file_out_name01 = f'data_preprocessed_wo_sw_X_half1_{today}.csv'
file_out_name02 = f'data_preprocessed_wo_sw_X_half2_{today}.csv'

# Both halves are written to the `data` folder under the project root.
file_out_path01 = os.path.join(up1_dir, data_location, file_out_name01)
file_out_path02 = os.path.join(up1_dir, data_location, file_out_name02)

print(f'CSV file out 1 path: {file_out_path01}')
print(f'CSV file out 2 path: {file_out_path02}')

CSV file out 1 path: C:\Users\acarr\Documents\GitHub\599_team_project\data\data_preprocessed_wo_sw_X_half1_2023-07-27_13-51-36424666.csv
CSV file out 2 path: C:\Users\acarr\Documents\GitHub\599_team_project\data\data_preprocessed_wo_sw_X_half2_2023-07-27_13-51-36424666.csv


In [19]:
# Columns written to the CSV export below.
export_col_names_lst = ['text_id', 'source_name', 'processed_text', 'multilabel']

Write pandas dataframe to CSV; save locally.

In [20]:
# Row position at which the dataframe is split into two export files.
len_half = int(round(len(X01_df01)/2,0))
print(len_half)

# Export only when the flag from the configuration cell is set.
if to_csv_flag:
    # Select the export columns once, then slice rows by position.
    export_df = X01_df01[export_col_names_lst]
    export_df.iloc[:len_half].to_csv(file_out_path01, index=False)
    export_df.iloc[len_half:].to_csv(file_out_path02, index=False)

18201


In [22]:
# Confirm the export did not change the dataframe.
print(X01_df01.shape)

(36402, 5)


## Modeling

### Algorithm setup

### Random Forests Classifier - `MultiOutputClassifier` baseline, then `BayesSearchCV` tuning

In [None]:
'''Multiclass and multioutput algortihm citation:
https://scikit-learn.org/stable/modules
/multiclass.html#multiclass-multioutput-classification
'''
from sklearn.datasets import make_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
import numpy as np

start_time = time.perf_counter()

# Baseline: MultiOutputClassifier fits one independent copy of the forest per
# label column of `nlm_train_y01`. The forest uses the notebook-wide
# `random_state` so this cell follows the same seed as the split and the
# tuned model.
forest = RandomForestClassifier(random_state=random_state)
multi_target_forest = MultiOutputClassifier(forest)
multi_target_forest.fit(nlm_train_x01_mtx, nlm_train_y01)

end_time = time.perf_counter()
print(f'\nElapsed processing time = {(end_time - start_time) / 60} mins')

In [None]:
# Predict the label matrix for the training documents with the baseline
# model; as the cell's last expression, the result is displayed.
multi_target_forest.predict(nlm_train_x01_mtx)

In [None]:
# Start timer script
start_time = dt.datetime.today()

# Local import: the top import cell only brings in RepeatedStratifiedKFold,
# which cannot split multilabel targets (see the `cv` definition below).
from sklearn.model_selection import RepeatedKFold

# Citation: Hochberg, 2018; Shanmukh, 2021
m1v1_rfc_pip = Pipeline([('rfc',
                          RandomForestClassifier(random_state=random_state))])

# Search budget: folds, repeats, Bayesian iterations and parallel jobs.
rskf_splits = 2
rskf_repeats = 2
bsg_iters = 5
bsg_jobs = 2

# Search space. Integer dimensions use int bounds.
nest_hparam = Integer(100, 1000, prior='log-uniform')
mndepth_hparam = Integer(1, 100, prior='log-uniform')
# An integer min_samples_split must be at least 2; a lower bound of 1 makes
# the forest reject the sampled value.
mnsamps_hparam = Integer(2, 100, prior='log-uniform')
mnsampl_hparam = Integer(1, 100, prior='log-uniform')
mxfeat_hparam = Categorical(['sqrt', 'log2', None])
# NOTE(review): the upper bounds of 1e3 for min_impurity_decrease and
# ccp_alpha look far above any useful value and will mostly yield trees that
# never split - consider narrowing these ranges.
minimpd_hparam = Real(1e-3, 1e3, prior='log-uniform')
oob_hparam = Categorical([True, False])
ccp_hparam = Real(1e-3, 1e3, prior='log-uniform')
maxsamp_hparam = Real(1e-3, 1e0, prior='log-uniform')

m1v1_rfc_grd = {'rfc__n_estimators': nest_hparam,
                'rfc__max_depth': mndepth_hparam,
                'rfc__min_samples_split': mnsamps_hparam,
                'rfc__min_samples_leaf': mnsampl_hparam,
                'rfc__max_features': mxfeat_hparam,
                'rfc__min_impurity_decrease': minimpd_hparam,
                'rfc__oob_score': oob_hparam,
                'rfc__ccp_alpha': ccp_hparam,
                'rfc__max_samples': maxsamp_hparam,
               }

'''Change rfc default scoring from accuracy to F1 score citation:
https://chat.openai.com/share/254f382b-4a8e-48e8-acd5-2918f0bbc59d
'''
# The targets are a 0/1 indicator matrix, so F1 needs an averaging mode
# rather than a positive-class label. Micro-averaging pools all label
# decisions; zero_division=0 scores folds with no positive predictions as 0
# instead of warning.
# NOTE(review): micro-averaging is an assumption - switch to 'macro' or
# 'samples' if per-label or per-document balance matters more.
f1_scorer = make_scorer(f1_score,
                        average='micro',
                        zero_division=0)

'''Customize cross-validation citation:
https://machinelearningmastery.com
/scikit-optimize-for-hyperparameter-tuning-in-machine-learning/
'''
# Stratified K-fold accepts only binary/multiclass targets and raises on a
# multilabel indicator matrix, so plain repeated K-fold is used here.
cv = RepeatedKFold(n_splits=rskf_splits,
                   n_repeats=rskf_repeats,
                   random_state=random_state)

m1v1_rfc = BayesSearchCV(m1v1_rfc_pip,
                         m1v1_rfc_grd,
                         n_iter=bsg_iters,
                         scoring=f1_scorer,
                         cv=cv,
                         n_jobs=bsg_jobs,
                         refit=True,
                         verbose=4,
                         random_state=random_state)

m1v1_rfc.fit(nlm_train_x01_mtx, nlm_train_y01)

# End timer script
end_time = dt.datetime.today()
time_elapse = end_time - start_time
print(f'Start Time = {start_time}')
print(f'End Time = {end_time}')
print(f'Elapsed Time = {time_elapse}')

### Pickle best model

In [None]:
# Path to save the pickled model
mod_folder_name = 'trained_models'
m1v1_pkl_file_name = 'm1v1_rfc.pkl'

pkl_file_path01 = os.path.join(curr_dir, mod_folder_name, m1v1_pkl_file_name)

print(f'Pickle file 1 in path: {pkl_file_path01}')

# Write the fitted search object to disk so the load cell below has a file
# to read. The target folder is created on first use.
# NOTE(review): this overwrites any existing pickle at the same path.
os.makedirs(os.path.dirname(pkl_file_path01), exist_ok=True)
with open(pkl_file_path01, 'wb') as file:
    pickle.dump(m1v1_rfc, file)

### Load pickled best model

In [None]:
# Replace the in-memory search object with the one stored on disk.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open(pkl_file_path01, 'rb') as file:
    m1v1_rfc = pickle.load(file)

In [None]:
def _first_rows(proba, n_rows=10):
    """Return the first `n_rows` rows of a predict_proba result.

    A multi-output forest returns a list with one probability array per
    label; slicing that list directly would select labels, not rows.
    """
    if isinstance(proba, list):
        return [label_proba[:n_rows] for label_proba in proba]
    return proba[:n_rows]


print(f'\nBest Estimator:\n{m1v1_rfc.best_estimator_}')

print('\nCross-validaton results:')
display(pd.DataFrame(m1v1_rfc.cv_results_))

# Class probabilities for the train and test documents.
train_m1v1_rfc_y01_pred = m1v1_rfc.predict_proba(nlm_train_x01_mtx)
print(f'\nFirst 10 train set predictions:\n{_first_rows(train_m1v1_rfc_y01_pred)}')

test_m1v1_rfc_y01_pred = m1v1_rfc.predict_proba(nlm_test_x01_mtx)
print(f'\nFirst 10 test set predictions:\n{_first_rows(test_m1v1_rfc_y01_pred)}')

print(f'\nBest Score for "{m1v1_rfc.scorer_}" is {m1v1_rfc.best_score_}')

#### Train set check

In [None]:
nlm_train_y01_pred = m1v1_rfc.predict(nlm_train_x01_mtx)
# `confusion_matrix` rejects multilabel indicator targets; the multilabel
# variant returns one 2x2 table ([[TN, FP], [FN, TP]]) per label.
nlm_train_y01_pred_cm = metrics.multilabel_confusion_matrix(nlm_train_y01,
                                                            nlm_train_y01_pred)

print(classification_report(nlm_train_y01, nlm_train_y01_pred))
print(nlm_train_y01_pred_cm)

'''Citation:
https://scikit-learn.org/stable/modules/generated
/sklearn.metrics.ConfusionMatrixDisplay.html
#sklearn.metrics.ConfusionMatrixDisplay.plot
'''
# One panel per label, at most five panels per row.
n_train_labels = nlm_train_y01_pred_cm.shape[0]
n_train_cols = min(5, n_train_labels)
n_train_rows = -(-n_train_labels // n_train_cols)  # ceiling division
fig, axes = plt.subplots(n_train_rows, n_train_cols,
                         figsize=(4 * n_train_cols, 4 * n_train_rows),
                         squeeze=False)
nlm_train_cm_dsp = []
for lbl_idx, ax in enumerate(axes.ravel()):
    if lbl_idx >= n_train_labels:
        # Hide grid cells that have no label to show.
        ax.axis('off')
        continue
    lbl_dsp = ConfusionMatrixDisplay(
        confusion_matrix=nlm_train_y01_pred_cm[lbl_idx],
        display_labels=[0, 1])
    lbl_dsp.plot(ax=ax, colorbar=False)
    ax.set_title(f'Train - label {lbl_idx}')
    nlm_train_cm_dsp.append(lbl_dsp)
fig.tight_layout()
plt.show()

#### ROC-AUC Curve

In [None]:
# Random forests expose no decision_function, so the ROC curves are built
# from predicted probabilities. For multi-output targets predict_proba
# returns one (n_samples, n_classes) array per label.
nlm_train_y01_pred_proba = m1v1_rfc.predict_proba(nlm_train_x01_mtx)

fig, ax = plt.subplots(figsize=(8, 8))
for lbl_idx, lbl_proba in enumerate(nlm_train_y01_pred_proba):
    if lbl_proba.shape[1] < 2:
        # Only one class was seen for this label during training, so no
        # ROC curve can be drawn for it.
        continue
    # Column 1 holds the probability of class 1, the positive label.
    RocCurveDisplay.from_predictions(nlm_train_y01[:, lbl_idx],
                                     lbl_proba[:, 1],
                                     pos_label=1,
                                     name=f'Label {lbl_idx}',
                                     ax=ax)
ax.set_title('Train set ROC curve per label')
plt.show()

#### Test set results

In [None]:
nlm_test_y01_pred = m1v1_rfc.predict(nlm_test_x01_mtx)
# `confusion_matrix` rejects multilabel indicator targets; the multilabel
# variant returns one 2x2 table ([[TN, FP], [FN, TP]]) per label.
nlm_test_y01_pred_cm = metrics.multilabel_confusion_matrix(nlm_test_y01,
                                                           nlm_test_y01_pred)

print('Test Set Evaluation Metrics')
print(classification_report(nlm_test_y01, nlm_test_y01_pred))
print(nlm_test_y01_pred_cm)

'''Citation:
https://scikit-learn.org/stable/modules/generated
/sklearn.metrics.ConfusionMatrixDisplay.html
#sklearn.metrics.ConfusionMatrixDisplay.plot
'''
# One panel per label, at most five panels per row.
n_test_labels = nlm_test_y01_pred_cm.shape[0]
n_test_cols = min(5, n_test_labels)
n_test_rows = -(-n_test_labels // n_test_cols)  # ceiling division
fig, axes = plt.subplots(n_test_rows, n_test_cols,
                         figsize=(4 * n_test_cols, 4 * n_test_rows),
                         squeeze=False)
nlm_test_cm_dsp = []
for lbl_idx, ax in enumerate(axes.ravel()):
    if lbl_idx >= n_test_labels:
        # Hide grid cells that have no label to show.
        ax.axis('off')
        continue
    lbl_dsp = ConfusionMatrixDisplay(
        confusion_matrix=nlm_test_y01_pred_cm[lbl_idx],
        display_labels=[0, 1])
    lbl_dsp.plot(ax=ax, colorbar=False)
    ax.set_title(f'Test - label {lbl_idx}')
    nlm_test_cm_dsp.append(lbl_dsp)
fig.tight_layout()
plt.show()

#### Variable importance

In [None]:
# Column names of the TF-IDF matrix, taken from the fitted vectorizer (this
# name was used below without ever being defined).
# NOTE(review): assumes the model in `m1v1_rfc` was trained on the matrix
# produced by this `nlm_tfidf`; a model loaded from an older pickle may have
# a different vocabulary - confirm the lengths match.
nlm_train_x01_mtx_cols = nlm_tfidf.get_feature_names_out()

print(nlm_train_x01_mtx_cols)
print(type(nlm_train_x01_mtx_cols))
print(nlm_train_x01_mtx_cols.shape)

# Importance of every TF-IDF column according to the tuned forest.
x = m1v1_rfc.best_estimator_.named_steps['rfc'].feature_importances_
x_df01 = pd.DataFrame(x, columns=['var_imp'])
x_df01['feature'] = nlm_train_x01_mtx_cols
# Most important first; the top 20 feed the bar chart in the next cell.
x_df02 = x_df01.sort_values(by=['var_imp'], ascending=False)
x_df03 = x_df02.head(20)

display(x_df02.head())
print(type(x_df02))
print(x_df02.shape)

In [None]:
'''Citation:
https://machinelearningmastery.com/calculate-feature-importance-with-python/
'''
# Horizontal bar chart of the 20 largest importances, one bar per feature,
# labelled with the feature name.
plt.figure(figsize=(15,7))
plt.title('Feature Importance (Top 20)')
plt.barh(list(range(len(x_df03['var_imp']))),
         x_df03['var_imp'],
         tick_label=x_df03['feature'])
plt.show()

In [None]:
# The targets are multilabel, so there is no single 2x2 confusion matrix to
# index. Build one 2x2 table per label from the test predictions and sum
# them, which pools every label decision into one set of counts.
nlm_test_pooled_cm = metrics.multilabel_confusion_matrix(
    nlm_test_y01, nlm_test_y01_pred).sum(axis=0)

# Row-major order of the pooled table: [[TN, FP], [FN, TP]].
TNmodel1, FPmodel1, FNmodel1, TPmodel1 = nlm_test_pooled_cm.ravel()

In [None]:
# Results:
from tabulate import tabulate

# Marginal totals of the confusion counts.
TANmodel1=TNmodel1+FPmodel1   # total actual negatives
TAPmodel1=TPmodel1+FNmodel1   # total actual positives
TPPmodel1=FPmodel1+TPmodel1   # total predicted positives
TPNmodel1=TNmodel1+FNmodel1   # total predicted negatives
GTmodel1=TANmodel1+TAPmodel1  # grand total

# Rates derived from the counts.
AccuracyM1=(TNmodel1+TPmodel1)/GTmodel1
ErrorRateM1=1-AccuracyM1
SensitivityM1=TPmodel1/(TAPmodel1)
RecallM1=SensitivityM1
SpecificityM1=TNmodel1/TANmodel1
PrecisionM1=TPmodel1/TPPmodel1

# F-beta scores for beta = 1, 2 and 0.5.
F1M1=2*PrecisionM1*RecallM1/(PrecisionM1 + RecallM1)
F2M1=5*(PrecisionM1*RecallM1)/((4*PrecisionM1)+RecallM1)
Fp5M1=(1.25)*(PrecisionM1*RecallM1)/((0.25*PrecisionM1)+RecallM1)

data1 = [["Accuracy", AccuracyM1], ["Error Rate", ErrorRateM1],
         ["Sensitivity", SensitivityM1],
         ["Recall", RecallM1], ["Specificity", SpecificityM1],
         ["Precision", PrecisionM1],
         ["F1", F1M1], ["F2", F2M1], ["F0.5", Fp5M1]]

# The evaluated model is the random forest tuned above, not a linear SVC.
col_names=["Measurement", "Random Forest Classifier"]

ModelEvaluationTable = tabulate(data1, headers=col_names,
                                tablefmt="fancy_grid")

print(ModelEvaluationTable)

In [None]:
# Show the raw metric/value pairs behind the table above.
data1

Set list of dataframe columns to export.

In [None]:
# NOTE(review): this rebinds the export column list after the CSV export has
# already run, and no later cell in the visible notebook reads it.
export_col_names_lst = ['processed_text',
                       ]

## Display runtime

In [None]:
# Stop the whole-notebook timer started in the configuration cell.
global_end_time = time.perf_counter()

In [None]:
# Whole-notebook runtime in minutes, rounded to two decimals.
elapsed_mins = round((global_end_time - global_start_time)/60,2)
print(f'\nElapsed processing time = {elapsed_mins} mins')

## References