Glove Embedding + Logistic Regression (cammy code)

In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm #display progress bar
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import punkt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_recall_fscore_support

In [2]:
df = pd.read_csv('cleaned_data.csv')
df.shape

(50960, 10)

In [3]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean,cleaned_comment_text
0,2f52adcf5a111cd3,"That is about it for for now. Primarily, I wor...",0,0,0,0,0,0,1,primarily work citations either add update rel...
1,819b3339c747286f,"""\n I wasn't aware that peer-reviewed studies ...",0,0,0,0,0,0,1,aware peerreviewed study minimal methodologica...
2,b66e5fffbd70f8fe,"""\nIt's fine to edit for personal gain so long...",0,0,0,0,0,0,1,fine edit personal gain long edit accord basic...
3,fd7f2ec6efe0315d,I did not add these words to the PLANS website...,0,0,0,0,0,0,1,add word plan website contact plan plan skepti...
4,f78b624060552c1a,"""\n\n List of recent changes \n\nRequested by ...",0,0,0,0,0,0,1,list recent change request sarge baldy even th...


In [4]:
# Create X and y dataframe
X = df["cleaned_comment_text"]
y = df[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]

In [5]:
X.head()

0    primarily work citations either add update rel...
1    aware peerreviewed study minimal methodologica...
2    fine edit personal gain long edit accord basic...
3    add word plan website contact plan plan skepti...
4    list recent change request sarge baldy even th...
Name: cleaned_comment_text, dtype: object

In [6]:
y.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0


In [7]:
#Split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2019)

In [8]:
X_train.head()

30200                              perhaps perhaps che pig
34498    three women elgar life watch min tv documentar...
25278    edit infobox add audio p nks performance live ...
29073    weird indeed put four tildes keep come unsigne...
9290                               contrary recommendation
Name: cleaned_comment_text, dtype: object

In [9]:
X_test.head()

15420    look good change source talk theme musical sty...
19180    really dog photo dog steal post page nastered ...
46897                     listen children night music make
4110            thank thank redirect link idk already page
23094                                    tribe war muslims
Name: cleaned_comment_text, dtype: object

In [10]:
type(X_test)

pandas.core.series.Series

In [11]:
# 3min Found 2195885 word vectors.
embeddings_index = {}
f = open('glove.840B.300d.txt', encoding="utf8")
for line in tqdm(f):
    values = line.split()
    word = values[0]
    try:
       coefs = np.asarray(values[1:], dtype='float32')
       embeddings_index[word] = coefs
    except ValueError:
       pass
f.close()
print('Found %s word vectors.' % len(embeddings_index))

2196018it [05:17, 6908.78it/s]


Found 2195885 word vectors.


In [12]:
stop_words = stopwords.words('english')
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    words = str(s).lower()
    words = word_tokenize(words)
    # remove stop words
    words = [w for w in words if not w in stop_words]
    # remove non-alphabets
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [13]:
%%time
# 17s create sentence vectors using the above function for training and test set
xtrain_glove = [sent2vec(x) for x in tqdm(X_train)]
xtest_glove = [sent2vec(x) for x in tqdm(X_test)]

print('Normalized Vector for Sentences are created')

100%|██████████████████████████████████████████████████████████████████████████| 40768/40768 [00:24<00:00, 1678.61it/s]
100%|██████████████████████████████████████████████████████████████████████████| 10192/10192 [00:05<00:00, 1907.77it/s]


Normalized Vector for Sentences are created
Wall time: 29.8 s


In [14]:
xtrain_glove = np.array(xtrain_glove)
xtest_glove = np.array(xtest_glove)

In [15]:
# Evaluate performance
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
classifier = LogisticRegression(C=0.01)

TP = list()
FP = list()
FN = list()
TN = list()
accuracy = list()

for category in categories:
    print('... Processing {}'.format(category))
    
    # train the model
    classifier.fit(xtrain_glove, y_train[category])
    
    # compute the testing accuracy
    prediction = classifier.predict_proba(xtest_glove)[:, 1]
    # round() because we use predict_proba here
    print('Test accuracy is {}%'.format(round(100*accuracy_score(y_test[category], prediction.round()), 2)))
    display(confusion_matrix(y_test[category], prediction.round()))
    print(classification_report(y_test[category], prediction.round()))
    
    # getting precision, recall and F1-score    
    cm = confusion_matrix(y_test[category], prediction.round())
    TN_i = cm[0][0]
    FP_i = cm[0][1]
    FN_i = cm[1][0]
    TP_i = cm[1][1]
    
    TP.append(TP_i)
    FP.append(FP_i)
    FN.append(FN_i)
    TN.append(TN_i)
    
    precision = sum(TP) / (sum(TP) + sum(FP))
    recall = sum(TP) / (sum(TP) + sum(FN))
    f1_score = 2 * precision * recall / (precision + recall)

    print("Precision: {}%".format(np.round(100*precision, 2)))
    print("Recall: {}%".format(np.round(100*recall, 2)))
    print("F1 score: {}%".format(np.round(100*f1_score, 2))) 
    print("=============================================================")
    
    accuracy_i = (TP_i + TN_i) / (TP_i + TN_i + FN_i + FP_i)
    accuracy.append(accuracy_i)

... Processing toxic




Test accuracy is 92.3%


array([[9203,    8],
       [ 777,  204]], dtype=int64)

              precision    recall  f1-score   support

           0       0.92      1.00      0.96      9211
           1       0.96      0.21      0.34       981

    accuracy                           0.92     10192
   macro avg       0.94      0.60      0.65     10192
weighted avg       0.93      0.92      0.90     10192

Precision: 96.23%
Recall: 20.8%
F1 score: 34.2%
... Processing severe_toxic




Test accuracy is 99.07%


array([[10097,     0],
       [   95,     0]], dtype=int64)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.99      1.00      1.00     10097
           1       0.00      0.00      0.00        95

    accuracy                           0.99     10192
   macro avg       0.50      0.50      0.50     10192
weighted avg       0.98      0.99      0.99     10192

Precision: 96.23%
Recall: 18.96%
F1 score: 31.68%
... Processing obscene




Test accuracy is 95.28%


array([[9639,    4],
       [ 477,   72]], dtype=int64)

              precision    recall  f1-score   support

           0       0.95      1.00      0.98      9643
           1       0.95      0.13      0.23       549

    accuracy                           0.95     10192
   macro avg       0.95      0.57      0.60     10192
weighted avg       0.95      0.95      0.94     10192

Precision: 95.83%
Recall: 16.98%
F1 score: 28.86%
... Processing threat




Test accuracy is 99.69%


array([[10160,     0],
       [   32,     0]], dtype=int64)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10160
           1       0.00      0.00      0.00        32

    accuracy                           1.00     10192
   macro avg       0.50      0.50      0.50     10192
weighted avg       0.99      1.00      1.00     10192

Precision: 95.83%
Recall: 16.66%
F1 score: 28.38%
... Processing insult




Test accuracy is 95.38%


array([[9673,    5],
       [ 466,   48]], dtype=int64)

              precision    recall  f1-score   support

           0       0.95      1.00      0.98      9678
           1       0.91      0.09      0.17       514

    accuracy                           0.95     10192
   macro avg       0.93      0.55      0.57     10192
weighted avg       0.95      0.95      0.94     10192

Precision: 95.01%
Recall: 14.92%
F1 score: 25.8%
... Processing identity_hate




Test accuracy is 99.04%


array([[10094,     0],
       [   98,     0]], dtype=int64)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.99      1.00      1.00     10094
           1       0.00      0.00      0.00        98

    accuracy                           0.99     10192
   macro avg       0.50      0.50      0.50     10192
weighted avg       0.98      0.99      0.99     10192

Precision: 95.01%
Recall: 14.28%
F1 score: 24.83%


In [16]:
precision = list()
recall = list()
f1_score = list()
fpr = dict()
tpr = dict()
roc_auc = dict()
no_of_labels = 6

for i in range(no_of_labels):
    precision_i = (TP[i]) / ((TP[i] + FP[i]) or not (TP[i] + FP[i]))
    precision.append(precision_i)

    recall_i = (TP[i]) / ((TP[i] + FN[i]) or not (TP[i] + FN[i]))
    recall.append(recall_i)

    f1_score_i = (2 * precision_i * recall_i) / ((precision_i + recall_i) or not (precision_i + recall_i))
    f1_score.append(f1_score_i)

micro_precision = sum(TP) / (sum(TP) + sum(FP))
micro_recall = sum(TP) / (sum(TP) + sum(FN))
micro_f1_score = (2 * micro_precision * micro_recall) / (micro_precision + micro_recall)
avg_accuracy = sum(accuracy) / no_of_labels

print("Micro Precision: {}%".format(round(100*micro_precision, 2)))
print("Micro Recall: {}%".format(round(100*micro_recall, 2)))
print("Micro F1-score: {}%".format(round(100*micro_f1_score, 2)))
print("Average Accuracy: {}%".format(round(100*avg_accuracy, 2)))

Micro Precision: 95.01%
Micro Recall: 14.28%
Micro F1-score: 24.83%
Average Accuracy: 96.79%
