In [1]:
#Workbook: gaggleExerciseV1, Bob LoGalbo, 30 September 2021

In [2]:
# Primary setup reference:  https://www.kaggle.com/giovanimachado/hate-speech-bert-cnn-and-bert-mlp-in-tensorflow
# https://keras.io/api to setup tensors and layers.
# https://keras.io/api/metrics/ for metrics.
# To use the classification_report:  
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as ttt
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [4]:
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

2.6.0


In [5]:
# Load the pipe-separated dataset. index_col=False tells pandas not to use
# the first column of the file as the index.
df_email = pd.read_csv(
    'dataset.psv',
    sep='|',
    index_col=False,
)

In [6]:
# Preview the first rows to confirm the 'target' and 'text' columns loaded as expected.
df_email.head()

Unnamed: 0,target,text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [7]:
#Converting classes to scalars (ints cast to floats later)

In [8]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the column selection: that form is chained
# in-place mutation, which pandas warns about and which no longer updates
# df_email once Copy-on-Write is in effect.
# Re-running this cell is harmless: already-encoded 0/1 values pass through.
df_email['target'] = df_email['target'].replace({"ham": 0, "spam": 1}).astype(int)
df_email.dtypes

target     int32
text      object
dtype: object

In [9]:
#A little exploratory analysis to look at distribution of classes

In [10]:
# Count rows per class (0 = ham, 1 = spam) to quantify the class imbalance.
scalar_categoryCounts = df_email['target'].value_counts()
scalar_categoryCounts

0    4824
1     746
Name: target, dtype: int64

In [11]:
# Splitting Ham and Spam into their own dataframes

In [12]:
# Partition the frame by encoded label: 0 is ham, 1 is spam.
df_onlyHam = df_email.loc[df_email['target'].eq(0)]
df_onlySpam = df_email.loc[df_email['target'].eq(1)]
# Show the size of the minority (spam) subset.
df_onlySpam.shape

(746, 2)

In [13]:
# Spot-check the spam-only subset (every 'target' value should be 1).
df_onlySpam.head()

Unnamed: 0,target,text
1,1,Free entry in 2 a wkly comp to win FA Cup fina...
6,1,WINNER!! As a valued network customer you have...
7,1,Had your mobile 11 months or more? U R entitle...
9,1,"SIX chances to win CASH! From 100 to 20,000 po..."
10,1,URGENT! You have won a 1 week FREE membership ...


In [14]:
# Size of the ham-only subset, for comparison with the spam subset's shape.
df_onlyHam.shape

(4824, 2)

In [15]:
# Spot-check the ham-only subset (every 'target' value should be 0).
df_onlyHam.head()

Unnamed: 0,target,text
0,0,I've been searching for the right words to tha...
2,0,"Nah I don't think he goes to usf, he lives aro..."
3,0,Even my brother is not like to speak with me. ...
4,0,I HAVE A DATE ON SUNDAY WITH WILL!!
5,0,As per your request 'Melle Melle (Oru Minnamin...


In [16]:
#Creating a balanced train and test dataframe because the desired class is in the vast minority (more later on this)

In [17]:
# Build a balanced set: every spam row plus an equally sized random sample of ham.
# random_state pins the ham sample so that Restart & Run All reproduces the
# same rows (and therefore the same downstream metrics).
df_ttSet = pd.concat([
    df_onlySpam,
    df_onlyHam.sample(n=df_onlySpam.shape[0], random_state=42),
])

In [18]:
# Verify the combined set holds equal numbers of ham (0) and spam (1) rows.
df_ttSet['target'].value_counts()

1    746
0    746
Name: target, dtype: int64

In [19]:
# Total rows in the balanced set: the spam rows plus an equal-sized ham sample.
df_ttSet.shape

(1492, 2)

In [20]:
# concat placed the spam rows first, so the leading rows shown here are all spam.
df_ttSet.head()

Unnamed: 0,target,text
1,1,Free entry in 2 a wkly comp to win FA Cup fina...
6,1,WINNER!! As a valued network customer you have...
7,1,Had your mobile 11 months or more? U R entitle...
9,1,"SIX chances to win CASH! From 100 to 20,000 po..."
10,1,URGENT! You have won a 1 week FREE membership ...


In [21]:
#The combined set is already balanced; 'stratify' is set to the target labels so that both train & test stay 50/50 ham to spam.

In [22]:
#Imbalanced sets are tricky when the minority class is the desired class to classify (this comment had a lot of 'class' :)

In [23]:
#General rule of thumb: when the minority class is the desired class, use all of it and SOME of the majority class.

In [24]:
# Hold out 25% of the balanced set for testing (the scikit-learn default, made explicit).
# stratify keeps the ham/spam ratio identical in both splits.
# random_state fixes the shuffle so the split, and the metrics reported
# further down, are reproducible across runs.
array_Xtrain, array_Xtest, array_Ytrain, array_Ytest = ttt(
    df_ttSet['text'],
    df_ttSet['target'],
    test_size=0.25,
    stratify=df_ttSet['target'],
    random_state=42,
)

In [25]:
#Using a pretrained English, uncased (lower-cased) Bert Transformer neural network (not RNN nor CNN) with 12 'Attention' heads.

In [26]:
#The concept of Bert is bigger than this code & is worthy of its own discussion. Now load the tokenizer & encoder respectively.

In [27]:
# Preprocessing model from TF Hub that pairs with the uncased English BERT encoder.
layer_bertPrePro = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)

In [28]:
# Pretrained uncased English BERT encoder from TF Hub. Going by the model name:
# L-12 = 12 transformer layers, H-768 = hidden size 768, A-12 = 12 attention heads.
# `trainable` is left at its default (False per the tensorflow_hub docs), so the
# encoder weights stay frozen and only the layers added afterwards are trained.
layer_bertEncoder = hub.KerasLayer(
    handle="https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
)

In [29]:
# Symbolic model input: one raw string per example (scalar shape, string dtype).
tensor_in = tf.keras.Input(shape=(), dtype=tf.string, name='dataIn')

In [30]:
#Tokenize the phrases and encode each token into its respective features below. The line above is the initial unloaded tensor.

In [31]:
# Run the raw strings through the BERT preprocessing layer to produce the encoder's inputs.
tensor_tokenized = layer_bertPrePro(tensor_in)

In [32]:
# Feed the preprocessed inputs through the BERT encoder; the result is indexed by output name.
tensor_encoded = layer_bertEncoder(tensor_tokenized)

In [33]:
# Use the encoder's 'pooled_output' entry as the feature vector for the classifier head.
bert_L1 = tensor_encoded['pooled_output']

In [34]:
#Dropout chosen to be .05 to minimize overfit i.e. it regularizes the output a tad.  1 or more tensors comprise a neural layer.

In [35]:
# Light regularisation: drop 5% of the pooled features during training.
bert_L2 = tf.keras.layers.Dropout(rate=0.05)(bert_L1)

In [36]:
#Sigmoid (logistic hypothesis function) = 'go-to' binary classification activation function despite vanishing derivative risk.

In [37]:
#The vanishing derivative occurs when a large-magnitude input saturates the sigmoid: the output flattens toward 0 or 1, so its derivative approaches 0, stalling backprop.

In [38]:
#The 'Dense' layer essentially pools all inputs into one node - one node because we're doing binary classification.

In [39]:
# Classifier head: a single sigmoid unit that outputs a spam score in [0, 1].
bert_L3 = tf.keras.layers.Dense(
    units=1,
    activation='sigmoid',
    name='spamClassifier',
)(bert_L2)

In [40]:
# Assemble the end-to-end graph: raw text in, spam score out.
bert_model = tf.keras.Model(inputs=tensor_in, outputs=bert_L3)

In [41]:
#The metrics were chosen so as to keep an eye on not only accuracy, precision and recall but also the true/false 

In [42]:
#positive/negative counts.  The area under the ROC curve (TPrate vs FPrate curve) is also tracked.

In [43]:
# Metrics tracked during fit/evaluate. Their order here is the order in which
# the values are reported after the loss.
list_metrics = [
    # Fraction of predictions on the correct side of the 0.5 threshold.
    tf.keras.metrics.BinaryAccuracy(name='Accuracy', dtype='float', threshold=.5),
    # Quality of the positive (spam) calls.
    tf.keras.metrics.Precision(name='Precision', dtype='float'),
    tf.keras.metrics.Recall(name='Recall', dtype='float'),
    # Area under the ROC curve.
    tf.keras.metrics.AUC(name='AU_ROC', dtype='float'),
    # Raw confusion-matrix counts.
    tf.keras.metrics.TruePositives(name='TP', dtype='float'),
    tf.keras.metrics.TrueNegatives(name='TN', dtype='float'),
    tf.keras.metrics.FalsePositives(name='FP', dtype='float'),
    tf.keras.metrics.FalseNegatives(name='FN', dtype='float'),
]

In [44]:
#Adam is the 'go-to' gradient descent optimizer for binary classification.  The learning rate was set to an optimal .005.

In [45]:
#The hope is to get close to the contour's concave inflection point (small learning rates & sigmoids risk that vanishing effect)

In [46]:
#Small learning rates mean getting very close to where the partial-derivative slopes go to 0, which is a problem with sigmoids.

In [47]:
#But this learning rate isn't that small in general terms so anticipate getting the best of both worlds.

In [48]:
# Adam optimizer with an explicit learning rate of 0.005; passed to compile() in a later cell.
opt = tf.keras.optimizers.Adam(learning_rate=0.005)

In [49]:
#Classical approach to binary classification using the binary_crossentropy as the cost function 

In [50]:
# Wire up the optimizer, the binary cross-entropy loss, and the tracked metrics.
bert_model.compile(
    optimizer=opt,
    loss='binary_crossentropy',
    metrics=list_metrics,
)

In [51]:
#One can see below, that after the 6th training epoch, steady state was reached i.e. the 10 epochs was more than sufficient.

In [52]:
# Train for 10 epochs on the training split.
# fit() returns the training History object, so despite its name this variable
# holds the per-epoch log rather than the model (the model is still bert_model).
bert_model_fitted = bert_model.fit(
    x=array_Xtrain,
    y=array_Ytrain,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [53]:
#model.evaluate and model.predict are essentially the same processes just that predict gives you the raw, unsliced output.

In [54]:
#Regarding the confusion matrix, the reader can see the TP, TN, FP, FN counts below when executing the test vectors.

In [55]:
#The accuracy, precision and recall are promising for continued production potential (excellent ROC as well)

In [56]:
# Score the held-out split. The returned list is the loss followed by the
# compiled metrics in order: Accuracy, Precision, Recall, AU_ROC, TP, TN, FP, FN.
bert_model.evaluate(x=array_Xtest, y=array_Ytest)



[0.1711208075284958,
 0.9490616917610168,
 0.95652174949646,
 0.9411764740943909,
 0.9819878935813904,
 176.0,
 178.0,
 8.0,
 11.0]

In [57]:
# Raw sigmoid scores for every test text; thresholded in a later cell.
array_yPredicted = bert_model.predict(x=array_Xtest)

In [58]:
# Collapse the prediction array to 1-D so that each element is a single score.
# Assumes predict() returned shape (n_samples, 1) from the 1-unit Dense head.
array_yPredictedFlattened = array_yPredicted.flatten()

In [59]:
#Classical MLE slicing to quantize to either 0 or 1 using same variances for both distributions.

In [60]:
# Threshold the scores at 0.5: below -> 0.0 (ham), otherwise -> 1.0 (spam).
# .tolist() keeps the result a plain Python list of floats.
array_yPredictedFlattenedQuantized = np.where(array_yPredictedFlattened < .5, 0.0, 1.0).tolist()

In [61]:
# Float32 copy of the test labels.
# NOTE(review): no later cell visible here reads this variable; the
# classification_report call is given array_Ytest directly. Confirm whether it is still needed.
array_YtestFloat = array_Ytest.astype('float32')

In [62]:
from sklearn.metrics import classification_report

In [63]:
#The sklearn classification report creates a nice layout of key metrics.

In [64]:
#Essentially same output as model.evaluate with the additional harmonic mean of recall and precision(i.e. F1)

In [65]:
# Per-class precision / recall / F1 on the test split (0 = ham, 1 = spam).
# print() is used so the report's line breaks render as a table.
print(
    classification_report(
        y_true=array_Ytest,
        y_pred=array_yPredictedFlattenedQuantized,
    )
)

              precision    recall  f1-score   support

           0       0.94      0.96      0.95       186
           1       0.96      0.94      0.95       187

    accuracy                           0.95       373
   macro avg       0.95      0.95      0.95       373
weighted avg       0.95      0.95      0.95       373



In [66]:
#Both the training and test results show a good balance of fit to bias.  All of the spam corpus used for either train or test.

In [67]:
#Little bias shown in training; and test scored comparably high, matching training, indicating no perceptible overfit.

In [68]:
#For example, accuracy consistent in test and in train.  

In [69]:
#Same for F1 score (consistent in both test/train): maintains constancy with small 1% tradeoff bounces between precision & recall

In [70]:
#'In the wild', there may be more spam and ham variance than is captured by the sample used for this exercise.

In [71]:
#More variance in the wild implies a bigger dropout needed to regularize against overfitting, which would reduce scores after retraining.

In [72]:
#This is an English model only (other Bert languages models available.)  Capital lettered words not examined in this exercise.

In [74]:
#So I would look to expanding the spam corpus; investigating capital letters; tuning the dropouts and learning rate.

In [75]:
#Note that I kept the raw scores output by the final Dense layer (array_yPredicted).

In [76]:
#The MLE slicer assumed equal variance of both distributions which is another point to analyze.

In [77]:
#And because raw scores were captured, can look back, scrutinize the FN's and FP's and look for commonalities in the spam text.

In [78]:
#Google implements Bert for autoComplete today so I would expect Bert is an excellent approach for the wild.

In [79]:
# Number of held-out test labels.
array_Ytest.shape

(373,)

In [80]:
# Number of training labels.
array_Ytrain.shape

(1119,)