In [1]:
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import datetime as dt
import emoji
import itertools
import json
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import pymysql as mysql
import random
import re
import regex as rex
import requests
import shutil
from string import punctuation
import time
from tqdm import tqdm
import zipfile

import nltk
from nltk.corpus import stopwords
import spacy

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sklearn.feature_extraction.text import TfidfTransformer, \
CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn import metrics
from sklearn.metrics import make_scorer, f1_score, classification_report, \
confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay

import textacy.preprocessing as tprep
from textacy.extract import keyword_in_context

# Set pandas global options
pd.options.display.max_rows = 17
pd.options.display.precision = 4
np.set_printoptions(suppress=True, precision=4)

%matplotlib inline

## Upload data from CSV

In [2]:
'''Dir nav citation:
https://softhints.com/python-change-directory-parent/'''
curr_dir = os.path.abspath(os.curdir)
print(curr_dir)
os.chdir("..")
up1_dir = os.path.abspath(os.curdir)
print(up1_dir)

C:\Users\acarr\Documents\GitHub\ADS509_Final_project\deliverables
C:\Users\acarr\Documents\GitHub\ADS509_Final_project


In [5]:
# change `data_location` to the location of the folder on your machine.
data_location = 'data'

file_in_name01 = 'master.csv'
file_in_name02 = 'master_business_TheHill.csv'

file_in_path01 = os.path.join(up1_dir, data_location, file_in_name01)
file_in_path02 = os.path.join(curr_dir, file_in_name02)

print(f'CSV file 1 in path: {file_in_path01}')
print(f'CSV file 2 in path: {file_in_path02}')

CSV file 1 in path: C:\Users\acarr\Documents\GitHub\ADS509_Final_project\data\master.csv
CSV file 2 in path: C:\Users\acarr\Documents\GitHub\ADS509_Final_project\deliverables\master_business_TheHill.csv


In [8]:
# Path to save the pickled model
mod_folder_name = 'trained_models'
m2v1_pkl_file_name = 'm2v2_gbc.pkl'

pkl_file_path01 = os.path.join(curr_dir, mod_folder_name, m2v1_pkl_file_name)

print(f'CSV file 1 in path: {pkl_file_path01}')

CSV file 1 in path: C:\Users\acarr\Documents\GitHub\ADS509_Final_project\deliverables\trained_models\m2v2_gbc.pkl


### Load pickled best model

In [9]:
with open(pkl_file_path01, 'rb') as file:
    m2v1_gbc = pickle.load(file)

In [10]:
print(f'\nBest Estimator:\n{m2v1_gbc.best_estimator_}')

print('\nCross-validaton results:')
display(pd.DataFrame(m2v1_gbc.cv_results_))

train_m2v1_gbc_y01_pred = m2v1_gbc.predict_proba(nlm_train_x01_mtx)
print(f'\nFirst 10 train set predictions:\n{train_m2v1_gbc_y01_pred[:10]}')

test_m2v1_gbc_y01_pred = m2v1_gbc.predict_proba(nlm_test_x01_mtx)
print(f'\nFirst 10 test set predictions:\n{test_m2v1_gbc_y01_pred[:10]}')

print(f'\nBest Score for "{m2v1_gbc.scorer_}" is {m2v1_gbc.best_score_}')


Best Estimator:
Pipeline(steps=[('gbc',
                 GradientBoostingClassifier(learning_rate=0.8373240042701702,
                                            loss='exponential', max_depth=11,
                                            max_features='sqrt',
                                            min_samples_split=0.582912491747238,
                                            random_state=1699))])

Cross-validaton results:


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_gbc__learning_rate,param_gbc__loss,param_gbc__max_depth,param_gbc__max_features,param_gbc__min_samples_split,param_gbc__n_estimators,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,12.9224,1.286,0.0274,0.007,0.0162,log_loss,1,log2,0.0119,510,...,0.8142,0.8132,0.8125,0.8135,0.8149,0.8132,0.8146,0.814,0.0008,5
1,10.548,0.9983,0.0296,0.0091,0.035,exponential,1,sqrt,0.2197,403,...,0.8587,0.8598,0.8574,0.8606,0.8582,0.8606,0.873,0.8627,0.0058,3
2,2.9297,0.1453,0.0285,0.0057,55.3871,exponential,3,sqrt,0.8602,108,...,0.8135,0.8125,0.8128,0.8128,0.8135,0.8135,0.8125,0.813,0.0004,6
3,426.3148,43.3892,0.0356,0.0089,4.7311,exponential,6,,0.0114,232,...,0.8246,0.8326,0.8366,0.8274,0.8289,0.8417,0.8239,0.8362,0.0098,4
4,5.683,0.4884,0.0306,0.0039,25.6546,log_loss,5,log2,0.0535,189,...,0.033,0.1426,0.0247,0.0868,0.1685,0.0631,0.0741,0.163,0.2219,12
5,13.7448,1.636,0.061,0.008,23.2484,log_loss,11,log2,0.0484,406,...,0.129,0.1323,0.0913,0.1135,0.1092,0.1381,0.1374,0.128,0.0231,15
6,11.9362,1.3784,0.0471,0.0051,173.3432,log_loss,11,log2,0.3327,377,...,0.0929,0.1084,0.0779,0.0732,0.1051,0.0958,0.0854,0.1587,0.2173,14
7,248.6976,19.5832,0.0346,0.0038,36.8407,exponential,3,,0.0303,309,...,0.6097,0.8152,0.8147,0.5569,0.6498,0.7817,0.6,0.7248,0.1012,11
8,7.2956,0.6681,0.0286,0.0028,274.3051,log_loss,1,log2,0.4011,256,...,0.0,0.0,0.0,0.0,0.0,0.0,0.8125,0.1626,0.3252,13
9,4.1264,0.3624,0.0399,0.0042,0.5714,exponential,16,log2,0.4647,122,...,0.8986,0.8984,0.8884,0.9009,0.9066,0.8851,0.9078,0.8991,0.0076,2


NameError: name 'nlm_train_x01_mtx' is not defined

#### Train set check

In [None]:
nlm_train_y01_pred = m2v1_gbc.predict(nlm_train_x01_mtx)
nlm_train_y01_pred_cm = confusion_matrix(nlm_train_y01, nlm_train_y01_pred)

print(classification_report(nlm_train_y01, nlm_train_y01_pred))
print(nlm_train_y01_pred_cm)

'''Citation:
https://scikit-learn.org/stable/modules/generated
/sklearn.metrics.ConfusionMatrixDisplay.html
#sklearn.metrics.ConfusionMatrixDisplay.plot
'''
nlm_train_cm_dsp = ConfusionMatrixDisplay(confusion_matrix=nlm_train_y01_pred_cm,
                                          display_labels=m2v1_gbc.classes_)
nlm_train_cm_dsp.plot()
plt.show()

#### ROC-AUC Curve

In [None]:
nlm_train_y01_pred_decf = m2v1_gbc.decision_function(nlm_train_x01_mtx)
RocCurveDisplay.from_predictions(nlm_train_y01, nlm_train_y01_pred_decf,
                                 pos_label='right')
plt.show()

#### Test set results

In [None]:
nlm_test_y01_pred = m2v1_gbc.predict(nlm_test_x01_mtx)
nlm_test_y01_pred_cm = confusion_matrix(nlm_test_y01, nlm_test_y01_pred)

print('Test Set Evaluation Metrics')
print(classification_report(nlm_test_y01, nlm_test_y01_pred))
print(nlm_test_y01_pred_cm)

'''Citation:
https://scikit-learn.org/stable/modules/generated
/sklearn.metrics.ConfusionMatrixDisplay.html
#sklearn.metrics.ConfusionMatrixDisplay.plot
'''
nlm_test_cm_dsp = ConfusionMatrixDisplay(confusion_matrix=nlm_test_y01_pred_cm,
                                         display_labels=m2v1_gbc.classes_)
nlm_test_cm_dsp.plot()
plt.show()

#### Variable importance

In [None]:
print(nlm_train_x01_mtx_cols)
print(type(nlm_train_x01_mtx_cols))
print(nlm_train_x01_mtx_cols.shape)

x = m2v1_gbc.best_estimator_.named_steps['gbc'].feature_importances_
x_df01 = pd.DataFrame(x, columns=['var_imp'])
x_df01['feature'] = nlm_train_x01_mtx_cols
x_df02 = x_df01.sort_values(by=['var_imp'], ascending=False)
x_df03 = x_df02.head(20)

display(x_df02.head())
print(type(x_df02))
print(x_df02.shape)

In [None]:
'''Citation:
https://machinelearningmastery.com/calculate-feature-importance-with-python/
'''
# plot feature importance
#figure = plt.figsize((10,9))
plt.figure(figsize=(15,7))
plt.title('Feature Importance (Top 20)')
plt.barh([x for x in range(len(x_df03['var_imp']))], x_df03['var_imp'],
         tick_label=x_df03['feature'])
plt.show()

In [None]:
TNmodel1=nlm_test_y01_pred_cm[0][0]
FPmodel1=nlm_test_y01_pred_cm[0][1]
FNmodel1=nlm_test_y01_pred_cm[1][0]
TPmodel1=nlm_test_y01_pred_cm[1][1]

In [None]:
# Results:

TANmodel1=TNmodel1+FPmodel1
TAPmodel1=TPmodel1+FNmodel1
TPPmodel1=FPmodel1+TPmodel1
TPNmodel1=TNmodel1+FNmodel1
GTmodel1=TANmodel1+TAPmodel1
AccuracyM1=(TNmodel1+TPmodel1)/GTmodel1
ErrorRateM1=1-AccuracyM1
SensitivityM1=TPmodel1/(TAPmodel1)
RecallM1=SensitivityM1
SpecificityM1=TNmodel1/TANmodel1
PrecisionM1=TPmodel1/TPPmodel1
F1M1=2*PrecisionM1*RecallM1/(PrecisionM1 + RecallM1)
F2M1=5*(PrecisionM1*RecallM1)/((4*PrecisionM1)+RecallM1)
Fp5M1=(1.25)*(PrecisionM1*RecallM1)/((0.25*PrecisionM1)+RecallM1)

header = ["Accuracy", "Error Rate", "Sensitivity", "Recall", "Specificity",
          "Precision", "F1", "F2", "F0.5"]
data1 = [["Accuracy", AccuracyM1], ["Error Rate", ErrorRateM1],
         ["Sensitivity", SensitivityM1],
         ["Recall", RecallM1], ["Specificity", SpecificityM1],
         ["Precision", PrecisionM1],
         ["F1", F1M1], ["F2", F2M1], ["F0.5", Fp5M1]]

col_names=["Measurement", "Linear SVC Model"]

ModelEvaluationTable = tabulate(data1, headers=col_names,
                                tablefmt="fancy_grid")

print(ModelEvaluationTable)

In [None]:
data1

In [None]:
Data_metric_results_TheHill=pd.DataFrame(data1)
Data_metric_results_TheHill.head()

In [None]:
Data_metric_results_TheHill.rename (columns = {0:'Measurement'}, inplace=True) 
Data_metric_results_TheHill.rename (columns = {1:'Result'}, inplace=True) 

In [None]:
#plt.bar(x=ModelEvaluationTable)


ax=Data_metric_results_TheHill[(Data_metric_results_TheHill['Measurement'] == 'Accuracy') | 
                            (Data_metric_results_TheHill['Measurement'] == 'Recall') |
                            (Data_metric_results_TheHill['Measurement'] == 'F1') |
                            (Data_metric_results_TheHill['Measurement'] == 'Error Rate')].plot(kind="barh", 
                                                                                               x='Measurement',
                                  figsize=(5,6),
                                  title='Linear SVC Performance metrics on Test Data')
ax.bar_label(ax.containers[0])
ax.set_xlim(right=1.15)

## Business problem application

In [None]:
center_df01 = pd.read_csv(file_in_path02)

print(center_df01.shape)
display(center_df01.head())

In [None]:
# Apply transformers to pandas dataframe, w/ new col containing tokens
center_df01['processed_text'] = center_df01['article_text']\
.progress_apply(prepare, pipeline=transformers01)

center_df01['processed_text_split'] = center_df01['processed_text']\
.progress_apply(str.split)

center_df01['num_tokens'] = center_df01['processed_text_split']\
.map(len)

display(center_df01.head())

# Review unique tokens across entire dataset
for c in range(0,1):
    try:
        print(center_df01['processed_text'][c], '\n')
    except:
        print(f'Skip {c}')

In [None]:
nlm_apply_x01_mtx = nlm_tfidf.transform(center_df01['processed_text'])

print(nlm_apply_x01_mtx.shape)
display(nlm_apply_x01_mtx)

In [None]:
display_samp_dwm(sm=nlm_apply_x01_mtx,
                 vec=nlm_tfidf,
                 n=(17,11),
                 rs_tup=(5,1699))

In [None]:
nlm_apply_mtx_pred_prob = m2v1_gbc.predict_proba(nlm_apply_x01_mtx)

print(nlm_apply_mtx_pred_prob.shape)
print(nlm_apply_mtx_pred_prob[:10])

nlm_apply_mtx_pred = m2v1_gbc.predict(nlm_apply_x01_mtx)

print(nlm_apply_mtx_pred.shape)
print(nlm_apply_mtx_pred)

In [None]:
# Compute the maximum values along the second dimension
max_values = np.amax(nlm_apply_mtx_pred_prob, axis=1)
max_values_df01 = pd.DataFrame(max_values,
                               columns=['decision_prob'])
max_values_df01['pred'] = nlm_apply_mtx_pred
print(max_values_df01.shape)
display(max_values_df01.head())

In [None]:
max_values_df01['decision_prob'].plot(kind="hist",
                                      density=True,
                                      alpha=0.5,
                                      legend=True,
                                      figsize=(10,7),
title='''Gradient Boost Model Probability Distribution\n
                                      Applied to Customer Articles''')

In [None]:
max_values_df01.groupby('pred')['decision_prob'].plot(kind="hist",
                                                      density=True,
                                      alpha=0.5,
                                      legend=True,
                                      figsize=(10,7),
                          title='''Gradient Boost Model Probability Distribution
Prediction Confidence''')

In [None]:
max_values_df02 = pd.DataFrame(nlm_apply_mtx_pred_prob.round(4),
                               columns=['left', 'right'])
max_values_df02['pred'] = nlm_apply_mtx_pred
max_values_df02

In [None]:
# Plotting histograms
#plt.hist(max_values_df02['left'], bins=10, alpha=0.5, color='blue', label='Column 1')
plt.hist(max_values_df02['right'], bins=10, alpha=0.5, color='red',
         label='right')

# Adding legend and title
plt.legend()
plt.title('Histogram of Right Prediction Probabilities')

# Displaying the plot
plt.show()

In [None]:
# Plotting histograms
#plt.hist(max_values_df02['left'], bins=10, alpha=0.5, color='blue', label='Column 1')
plt.hist(max_values_df02['left'], bins=10, alpha=0.5, color='blue',
         label='left')

# Adding legend and title
plt.legend()
plt.title('Histogram of Left Prediction Probabilities')

# Displaying the plot
plt.show()

In [None]:
# Plotting histograms
plt.hist(max_values_df02['left'], bins=10,
         alpha=0.5, color='blue', label='left')
plt.hist(max_values_df02['right'], bins=10,
         alpha=0.5, color='red', label='right')

# Adding legend and title
plt.legend()
plt.title('Histogram of Left/Right Prediction Probabilities')

# Displaying the plot
plt.show()