### Transformer-Based Encoding

In [None]:
!pip install transformers==2.3.0

In [None]:
# from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm.notebook import tqdm
import os
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

In [None]:
# Mount Google Drive into the Colab filesystem; the dataset path used below
# lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Loading the data

In [None]:
import locale


def getpreferredencoding(do_setlocale = True):
  """Stand-in for ``locale.getpreferredencoding`` that always reports UTF-8.

  ``do_setlocale`` is accepted only so the signature matches the function
  being replaced; its value is ignored.
  """
  return "UTF-8"


# Monkey-patch the locale module so every later lookup of the preferred
# encoding goes through the function above.
locale.getpreferredencoding = getpreferredencoding

In [None]:

# Load the preprocessed dataset from a gzip-compressed pickle on the mounted Drive.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
path = '/content/drive/MyDrive/Capstone Data/Phase 2/final_data.pkl'
train_df = pd.read_pickle(path, compression = 'gzip')

# Columns used as model inputs. 'clean_comment' is the text column that is
# tokenized later in the notebook; the remaining entries are the per-comment
# columns that get stacked into the feature matrix.
features = ['sentence_count', 'word_count', 'unique_word_count', 
            'length', 'punctuation_count', 'upper_case_count', 
            'stopword_count', '#_count', 'unique_word_count_percent', 
            'Punctuation_percent', 'ip_count','link_count', 
            'article_id_count', 'username_count', 'clean_comment']

# Label columns; one model is trained per entry further down.
target_cols = ['toxic', 'severe_toxic', 'obscene', 'threat',
               'insult', 'identity_hate']
# Drop every row that has a missing value in any column.
train_df = train_df.dropna()

# Separating features and target variable
x = train_df[features]
y = train_df[target_cols]


In [None]:
# Number of rows whose 'Sum' column is greater than zero.
# NOTE(review): presumably 'Sum' is the per-row total of the label columns,
# i.e. this counts comments carrying at least one label - confirm against the
# preprocessing notebook.
len(train_df[train_df['Sum']>0])

16225

#### Supporting Functions

In [None]:
def evaluation_metrics(y_test, y_pred):
    """Compute accuracy plus support-weighted precision, recall and F1.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_test``.

    Returns:
        dict with the keys 'Accuracy', 'Precision', 'Recall' and 'F1 Score',
        inserted in exactly that order (callers rely on ``.values()`` order).

    NOTE(review): with ``average='weighted'`` the scores are dominated by the
    majority class (weighted recall equals accuracy in the result tables of
    this notebook), so they say little about the rare positive class.
    """
    # zero_division=0 yields the same value scikit-learn already falls back to
    # when a class is never predicted, but without raising an
    # UndefinedMetricWarning on every such call.
    prf_kwargs = {'average': 'weighted', 'zero_division': 0}
    return {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, **prf_kwargs),
        'Recall': recall_score(y_test, y_pred, **prf_kwargs),
        'F1 Score': f1_score(y_test, y_pred, **prf_kwargs),
    }

#### BERT Encoding

In [None]:
from transformers import BertTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Name of the pretrained BERT checkpoint whose tokenizer is loaded below.
bert_model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)
# Sequence length used for both tokenization and padding further down.
MAX_LEN = 128

def tokenize_sentences(sentences, tokenizer, max_seq_len = 128):
    """Encode every sentence with ``tokenizer.encode`` and collect the results.

    Args:
        sentences: Iterable of sentences to encode (wrapped in a tqdm progress bar).
        tokenizer: Object exposing ``encode(text, add_special_tokens=..., max_length=...)``.
        max_seq_len: Value forwarded to ``encode`` as ``max_length``.

    Returns:
        list holding one encoded sequence per input sentence, in input order.
    """
    encode_options = {
        'add_special_tokens': True,   # Add '[CLS]' and '[SEP]'
        'max_length': max_seq_len,    # Truncate all sentences.
    }
    return [tokenizer.encode(text, **encode_options) for text in tqdm(sentences)]

def create_attention_masks(tokenized_and_padded_sentences):
    """Build attention masks for padded token-id sequences.

    A position gets mask value 1 when its token id is greater than 0 and
    0 otherwise.

    Args:
        tokenized_and_padded_sentences: Rectangular array-like of token ids
            (all rows the same length).

    Returns:
        np.ndarray of integers with the same shape as the input.
    """
    token_ids = np.asarray(tokenized_and_padded_sentences)
    # One vectorised comparison replaces the per-token Python loop, and the
    # result keeps an integer dtype even when the input is empty.
    return (token_ids > 0).astype(int)

# Encode every cleaned comment into a list of token ids (max_length = MAX_LEN).
input_ids = tokenize_sentences(x['clean_comment'], tokenizer, MAX_LEN)
# Pad/truncate at the end of each sequence to exactly MAX_LEN ids, using 0 as the pad value.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
# 1 where the id is greater than 0, 0 at the padded positions.
attention_masks = create_attention_masks(input_ids)

  0%|          | 0/159571 [00:00<?, ?it/s]

In [None]:
# Take the raw-text column out of the feature list so it is not stacked into
# the model matrix below. The membership guard keeps the cell re-runnable:
# a bare list.remove() raises ValueError on the second execution.
if 'clean_comment' in features:
    features.remove('clean_comment')

In [None]:
# Stack the attention masks next to the remaining columns listed in `features`.
# NOTE(review): only `attention_masks` enter the model matrix here; `input_ids`
# is never used after the masks are derived from it. A mask is just a run of 1s
# followed by 0s, so it encodes sequence length only - none of the comment text
# reaches the models below. Confirm whether token ids or BERT embeddings were
# meant to be stacked instead.
merged_x = np.hstack((attention_masks, np.array(x[features])))
merged_x.shape

(159571, 142)

In [None]:
# Splitting the data into train and test dataset
# (20% of the rows are held out; random_state pins the shuffle so the split is repeatable).
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(np.array(merged_x), np.array(y), 
                                            test_size = 0.2, random_state = 2)

In [None]:
# Dimensions of the training matrix: row count first, column count second.
no_rows, no_cols = train_x.shape

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Fit one independent binary logistic-regression model per label column and
# collect its test-set metrics.
result_list = []
for i, target_name in enumerate(target_cols):
  # 1-D label vectors: a column vector makes scikit-learn emit a
  # DataConversionWarning on every fit.
  train_target = train_y[:, i]
  test_target = test_y[:, i]
  # The raw features live on very different scales, which made the solver hit
  # its iteration limit before converging. Standardise them (statistics are
  # learned from the training split only) and allow more iterations.
  model = make_pipeline(
      StandardScaler(),
      LogisticRegression(random_state=0, max_iter=1000),
  ).fit(train_x, train_target)
  # predict() already returns hard class labels, so no thresholding is needed.
  pred = model.predict(test_x)
  res = evaluation_metrics(test_target, pred)
  result_list.append([target_name, *res.values()])

pd.DataFrame(result_list, columns=['Target Variable','Accuracy', 'Precision', 'Recall', 'F1 Score'])

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/

Unnamed: 0,Target Variable,Accuracy,Precision,Recall,F1 Score
0,toxic,0.904308,0.883525,0.904308,0.86185
1,severe_toxic,0.989221,0.984576,0.989221,0.984193
2,obscene,0.947548,0.930614,0.947548,0.922909
3,threat,0.997149,0.994368,0.997149,0.995756
4,insult,0.951465,0.929876,0.951465,0.928532
5,identity_hate,0.991634,0.983338,0.991634,0.987469


#### Gaussian NB

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit one independent Gaussian naive-Bayes model per label column and collect
# its test-set metrics.
result_list = []
for i, target_name in enumerate(target_cols):
  # 1-D label vectors: a column vector makes scikit-learn emit a
  # DataConversionWarning on every fit.
  train_target = train_y[:, i]
  test_target = test_y[:, i]
  clf = GaussianNB()
  clf.fit(train_x, train_target)
  # predict() already returns hard class labels, so no thresholding is needed.
  pred = clf.predict(test_x)
  res = evaluation_metrics(test_target, pred)
  result_list.append([target_name, *res.values()])

pd.DataFrame(result_list, columns=['Target Variable','Accuracy', 'Precision', 'Recall', 'F1 Score'])



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Unnamed: 0,Target Variable,Accuracy,Precision,Recall,F1 Score
0,toxic,0.387467,0.865158,0.387467,0.473038
1,severe_toxic,0.515682,0.983842,0.515682,0.670096
2,obscene,0.369419,0.924221,0.369419,0.488038
3,threat,0.319348,0.995419,0.319348,0.480988
4,insult,0.34689,0.930242,0.34689,0.465881
5,identity_hate,0.338994,0.986233,0.338994,0.497455


#### Decision Tree

In [None]:
from sklearn import tree

# Fit one independent decision tree per label column and collect its test-set
# metrics.
result_list = []
for i, target_name in enumerate(target_cols):
  train_target = train_y[:, i]
  test_target = test_y[:, i]
  # Fixed seed: without it the tree can differ from run to run, so the
  # reported table would not be reproducible.
  clf = tree.DecisionTreeClassifier(random_state=0)
  clf.fit(train_x, train_target)
  # predict() already returns hard class labels, so no thresholding is needed.
  pred = clf.predict(test_x)
  res = evaluation_metrics(test_target, pred)
  result_list.append([target_name, *res.values()])

pd.DataFrame(result_list, columns=['Target Variable','Accuracy', 'Precision', 'Recall', 'F1 Score'])



Unnamed: 0,Target Variable,Accuracy,Precision,Recall,F1 Score
0,toxic,0.841705,0.84962,0.841705,0.845563
1,severe_toxic,0.979571,0.980524,0.979571,0.980045
2,obscene,0.903901,0.911762,0.903901,0.907743
3,threat,0.993608,0.994522,0.993608,0.994063
4,insult,0.909384,0.916563,0.909384,0.912907
5,identity_hate,0.981545,0.983914,0.981545,0.982722


#### Sequential NN

In [None]:
from keras.models import Sequential
# NOTE(review): wildcard import kept because other cells may rely on the names
# it brings in; this cell itself only needs Dense and Dropout.
from keras.layers import *
print(no_rows, no_cols)
# Feed-forward binary classifier: four ReLU layers (512 -> 256 -> 64 -> 32)
# with dropout after the first three, followed by a single sigmoid unit.
nn_model = Sequential([
    Dense(512, input_shape=(no_cols,), activation='relu'),
    Dropout(0.2),
    Dense(256, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# `metrics` is given as a list, which is the documented form; a bare string is
# not accepted by every Keras release.
nn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

nn_model.summary()

127656 142
Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 512)               73216     
                                                                 
 dropout_28 (Dropout)        (None, 512)               0         
                                                                 
 dense_10 (Dense)            (None, 256)               131328    
                                                                 
 dropout_29 (Dropout)        (None, 256)               0         
                                                                 
 dense_11 (Dense)            (None, 64)                16448     
                                                                 
 dropout_30 (Dropout)        (None, 64)                0         
                                                                 
 dense_12 (Dense)            (None, 32)    

In [None]:
from keras.models import clone_model

# Train and evaluate one dense network per label column.
result_list = []
for i, target_name in enumerate(target_cols):
  train_target = train_y[:, i].reshape(-1, 1)
  test_target = test_y[:, i].reshape(-1, 1)
  # Every label must start from freshly initialised weights. Fitting the same
  # `nn_model` instance over and over would carry the weights learned for the
  # previous label into the next one. clone_model() rebuilds the architecture
  # with new weights; the clone has to be compiled again before use.
  label_model = clone_model(nn_model)
  label_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
  # The held-out split is passed as validation data for per-epoch monitoring.
  history = label_model.fit(train_x, train_target, epochs=10,
                            batch_size=64, validation_data=(test_x, test_target))
  # Turn the sigmoid outputs into hard 0/1 predictions.
  nn_pred = label_model.predict(test_x)
  nn_pred = np.where(nn_pred > 0.5, 1, 0)
  res = evaluation_metrics(test_target, nn_pred)
  result_list.append([target_name, *res.values()])

pd.DataFrame(result_list, columns=['Target Variable','Accuracy', 'Precision', 'Recall', 'F1 Score'])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Target Variable,Accuracy,Precision,Recall,F1 Score
0,toxic,0.907849,0.887654,0.907849,0.875459
1,severe_toxic,0.98919,0.978497,0.98919,0.983814
2,obscene,0.947392,0.937018,0.947392,0.921922
3,threat,0.99718,0.994368,0.99718,0.995772
4,insult,0.95159,0.933252,0.95159,0.928896
5,identity_hate,0.991634,0.983338,0.991634,0.987469


#### LSTM 

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Input, Bidirectional
import tensorflow as tf
from keras.models import Model
from tensorflow.keras import backend as K

In [None]:
def f1(y_true, y_pred):
  """F1 score built from Keras backend ops, usable as a ``metrics`` entry.

  Both inputs are clipped to [0, 1] and rounded, so predictions are counted
  as 0 or 1. ``K.epsilon()`` is added to each denominator to avoid division
  by zero.

  NOTE(review): when used as a Keras metric this is presumably evaluated per
  batch and then averaged, which only approximates the dataset-level F1 -
  confirm before quoting it as a final score.
  """
  eps = K.epsilon()

  true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
  predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
  actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))

  precision = true_positives / (predicted_positives + eps)
  recall = true_positives / (actual_positives + eps)

  return 2*((precision*recall)/(precision+recall+eps))

In [None]:
def build_LSTM(data):
  """Create and compile a three-layer stacked LSTM binary classifier.

  Args:
    data: Array-like whose ``shape[1]`` sets the sequence length; every step
      is declared to carry a single feature (``input_shape = (shape[1], 1)``).

  Returns:
    The compiled Sequential model (binary cross-entropy loss, Adam optimizer;
    accuracy, recall, precision and the custom ``f1`` as metrics).
  """
  n_steps = data.shape[1]
  layer_stack = [
      LSTM(128, input_shape = (n_steps, 1), return_sequences = True),
      Dropout(0.2),
      LSTM(128, return_sequences = True),
      Dropout(0.2),
      LSTM(32),                           # last recurrent layer returns only its final output
      Dense(1, activation = 'sigmoid'),   # single sigmoid output unit
  ]
  network = Sequential(layer_stack)
  network.compile(loss = 'binary_crossentropy', optimizer = 'adam',
                  metrics = ['accuracy', 'Recall', 'Precision', f1])

  return network

In [None]:
# Train and evaluate one freshly built LSTM per label column.
result_list = []
for i, target_name in enumerate(target_cols):
  # Slice the label column straight from the arrays (no DataFrame round-trip).
  train_target = train_y[:, i].reshape(-1, 1)
  test_target = test_y[:, i].reshape(-1, 1)
  LSTM_model = build_LSTM(train_x)
  LSTM_model.fit(train_x, train_target, epochs = 10, batch_size = 64)
  # evaluate() returns the loss followed by the compiled metrics, in the order
  # they were given to compile(); the column names below follow that order.
  test_metrics = LSTM_model.evaluate(test_x, test_target, 
                                     batch_size = 64, verbose = 0)
  test_metrics.insert(0, target_name)
  print(test_metrics)
  result_list.append(test_metrics) 

pd.DataFrame(result_list, columns=['Target Variable','Loss', 'Accuracy', 'Recall', 'Precision', 'F1 Score'])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
['toxic', 0.2829012870788574, 0.9084129929542542, 0.11711420118808746, 0.6510791182518005, 0.17465277016162872]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
['severe_toxic', 0.051314979791641235, 0.9891900420188904, 0.0, 0.0, 0.0]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
['obscene', 0.1861840933561325, 0.9475168585777283, 0.014277216047048569, 0.5714285969734192, 0.020025765523314476]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
['threat', 0.019373102113604546, 0.9971799850463867, 0.0, 0.0, 0.0]
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
['insult', 0.17726512253284454, 0.95152747631073, 0.010329244658350945, 0.5

Unnamed: 0,Target Variable,Loss,Accuracy,Recall,Precision,F1 Score
0,toxic,0.282901,0.908413,0.117114,0.651079,0.174653
1,severe_toxic,0.051315,0.98919,0.0,0.0,0.0
2,obscene,0.186184,0.947517,0.014277,0.571429,0.020026
3,threat,0.019373,0.99718,0.0,0.0,0.0
4,insult,0.177265,0.951527,0.010329,0.533333,0.011755
5,identity_hate,0.045945,0.991634,0.0,0.0,0.0
