In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Imports

In [None]:
import pandas as pd
import numpy as np
import gensim
import sklearn
import string
import math
import sys
import re

from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Embedding, Activation
from keras.utils import np_utils
from keras.models import load_model

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, precision_recall_fscore_support, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.utils import shuffle

from string import punctuation
from collections import defaultdict

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import STOPWORDS
from gensim.scripts.glove2word2vec import glove2word2vec

## File paths

In [None]:

# Change path here

data_path = "/content/drive/MyDrive/NLP/HateSpeechDetection/tweet_data/hateful_data.csv"

glove_file = datapath('/content/drive/MyDrive/NLP/HateSpeechDetection/tweet_data/glove.twitter.27B.25d.txt')

word2vec_glove_file = get_tmpfile("glove.6B.100d.word2vec.txt")

mode_save_path = "/content/drive/MyDrive/NLP/HateSpeechDetection/saved_model.hdf5"


## Initializations

In [None]:
vocab = {}
freq = defaultdict(int)
tweets = {}

FLAGS = re.MULTILINE | re.DOTALL
EMBEDDING_DIM = 25
SEED = 42
NO_OF_FOLDS = 10
LOSS_FUN = "categorical_crossentropy"
OPTIMIZER = "adam"
EPOCHS = 50
BATCH_SIZE = 512

glove2word2vec(glove_file, word2vec_glove_file)
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_glove_file)

In [None]:
# Replace smileys, usernames, urls with appropriate tags
def get_tags(text):

    # Regex expressions for identification 
    eyes = r"[8:=;]"
    nose = r"['`\-]?"
    heart = "<3"
    urls = "https?:\/\/\S+\b|www\.(\w+\.)+\S*"
    hashtags = "#\S+"
    numbers = "[-+]?[.\d]*[\d]+[:,.\d]*"
    user_names = "@\w+"

    # replace patterns with a given tag
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    text = re_sub(r"{}".format(urls), "<url>")
    text = re_sub(r"{}".format(hashtags), "<hashtag>")
    text = re_sub(r"{}".format(numbers), "<number>")
    text = re_sub(r"{}".format(user_names), "<user>")

    # Get expressions
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lol>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sad>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutral>")
    text = re_sub(r"{}".format(heart),"<heart>")
    
    return text.lower()

In [None]:
# Tokenize tweets
def tokenize(text):
  text = get_tags(text)
  words = text.translate(str.maketrans('', '', string.punctuation)).split()
  words = list(filter(lambda x:x not in STOPWORDS, words))
  return words

## Get Tweets

In [None]:
# Get tweets from csv files
def get_tweets(path):
  tweets = list()
  df = pd.read_csv(path,sep=",", names=["class", "tweet"], header=None)[1:]
  print(df)
  for i, row in df.iterrows():
    tweets.append({"text": row["tweet"], "label": row["class"]})
  return tweets

In [None]:
tweets = get_tweets(data_path)

      class                                              tweet
1         2  !!! RT @mayasolovely: As a woman you shouldn't...
2         1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...
3         1  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
4         1  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
5         1  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...     ...                                                ...
33095     2  lol at angry men writing me essays through Fac...
33096     2  at least, I'm assuming that's what it was. I r...
33097     2  Oh fuck me hard with a rusty chainsaw, another...
33098     2      OMG SHUT UP DRASKO AND BIANCA #MKR #FINALFIVE
33099     2                                   STFU drasko #MKR

[33099 rows x 2 columns]


In [None]:
# Creates batches for k-fold cross validation
def generate_batches(X, batch_size):
    n_batches = X.shape[0]/float(batch_size)
    n_batches = int(math.ceil(n_batches))
    end = int(X.shape[0]/float(batch_size)) * batch_size
    n = 0
    for i in range(0,n_batches):
        if i < n_batches - 1: 
            batch = X[i*batch_size:(i+1) * batch_size, :]
            yield batch
        else:
            batch = X[end: , :]
            n += X[end:, :].shape[0]
            yield batch

In [None]:
# Get vector representations from GloVe model for words in our vocabulary
def get_embedding_weights():
    embedding = np.zeros((len(vocab) + 1, EMBEDDING_DIM))
    for k, v in vocab.items():
        try:
            embedding[v] = word2vec_model[k]
        except:
            pass
    return embedding

In [None]:
# Get tweets that have at least one word that is present in GloVe model
def select_tweets():
    tweets = get_tweets(data_path)
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        words = tokenize(tweet['text'].lower())
        
        # Check if embedding already present in GLove model
        if len(list(filter(lambda x: x in word2vec_model, words))) > 0:
            tweet_return.append(tweet)

    print('Number of tweets selected:', len(tweet_return))
    return tweet_return

In [None]:
# Generate a vocabulary of words from the training tweets
def generate_vocab():
    vocab_index = 1
    for tweet in tweets:
        words = tokenize(tweet['text'].lower())
        for word in words:
            if word not in vocab:
                vocab[word] = vocab_index
                vocab_index += 1
    vocab['UNK'] = len(vocab) + 1

In [None]:
# Generate sequences for the tweets. (Creates a number matrix for the tweets)
def generate_sequence(tweets):
    y_map = {
        '0': 0,
        '1': 1,
        '2': 2
    }

    X, y = [], []
    for tweet in tweets:
        words = tokenize(tweet['text'].lower())
        seq = []
        for word in words:
            seq.append(vocab.get(word, vocab['UNK']))
        X.append(seq)
        y.append(y_map[tweet['label']])
    return X, y


In [None]:
# Build an LSTM model
def lstm_model(sequence_length, embedding_dim):
    model_variation = 'LSTM'
    model = Sequential()
    model.add(Embedding(len(vocab)+1, embedding_dim, input_length=sequence_length))
    model.add(Dropout(0.25))
    model.add(LSTM(50))
    model.add(Dropout(0.5))
    model.add(Dense(3))
    model.add(Activation('softmax'))
    model.compile(loss=LOSS_FUN, optimizer=OPTIMIZER, metrics=['accuracy'])
    print(model.summary())
    return model

### Prepare data

In [None]:
# Get tweets for training and generate sequence
tweets = select_tweets()
generate_vocab()
X, y = generate_sequence(tweets)


      class                                              tweet
1         2  !!! RT @mayasolovely: As a woman you shouldn't...
2         1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...
3         1  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
4         1  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
5         1  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...     ...                                                ...
33095     2  lol at angry men writing me essays through Fac...
33096     2  at least, I'm assuming that's what it was. I r...
33097     2  Oh fuck me hard with a rusty chainsaw, another...
33098     2      OMG SHUT UP DRASKO AND BIANCA #MKR #FINALFIVE
33099     2                                   STFU drasko #MKR

[33099 rows x 2 columns]
Number of tweets selected: 33031


In [None]:
# Pad zeroes to maintain uniform size
MAX_SEQUENCE_LENGTH = max(map(lambda x:len(x), X))
print ("max seq length is %d"%(MAX_SEQUENCE_LENGTH))

data = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
y = np.array(y)
data, y = sklearn.utils.shuffle(data, y)

max seq length is 30


In [None]:
# Get embeddings
W = get_embedding_weights()

In [None]:
# Split data 
X_train, X_testing, y_train, y_testing = train_test_split(data, y, train_size=0.85, test_size=0.15, random_state=SEED)

## Train model

In [None]:
# Print important training statistics
def get_training_stats(y_test, y_pred, precision, recall, f1, m_precision, m_recall, m_f1):
    print("Macro results...")
    print("Avg. Precision = %f" %(precision/NO_OF_FOLDS))
    print("Avg. Recall = %f" %(recall/NO_OF_FOLDS))
    print("Avg. R1 = %f" %(f1/NO_OF_FOLDS))
    print()
    print("Micro results...")
    print("Avg. Precision = %f" %(m_precision/NO_OF_FOLDS))
    print("Avg. Recall = %f" %(m_recall/NO_OF_FOLDS))
    print("Avg. F1 = %f" %(m_f1/NO_OF_FOLDS))

In [None]:
# Trains the model with 10 fold cross validation
def train_LSTM(X, y, model, inp_dim, weights, epochs=EPOCHS, batch_size=BATCH_SIZE):
    cv_object = KFold(n_splits=NO_OF_FOLDS, shuffle=True, random_state=42)
    precision, recall, f1 = 0., 0., 0.
    m_precision, m_recall, m_f1 = 0., 0., 0.
    sentence_len = X.shape[1]
    fold_number = 0
    for train_index, test_index in cv_object.split(X):
        fold_number += 1
        print("Fold number =====>  ", fold_number)
        model.layers[0].set_weights([weights])

        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        y_train = y_train.reshape((len(y_train), 1))
        X_temp = np.hstack((X_train, y_train))

        for epoch in range(epochs):
            for X_batch in generate_batches(X_temp, batch_size):
                x = X_batch[:, :sentence_len]
                y_temp = X_batch[:, sentence_len]

                try:
                    y_temp = np_utils.to_categorical(y_temp, num_classes=3)
                except Exception as e:
                    print (e)
                    print (y_temp)

                loss, acc = model.train_on_batch(x, y_temp)

        y_pred = model.predict_on_batch(X_test)
        y_pred = np.argmax(y_pred, axis=1)

        # Track weighted precison, recall, f1
        precision += precision_score(y_test, y_pred, average='weighted')
        recall += recall_score(y_test, y_pred, average='weighted')
        f1 += f1_score(y_test, y_pred, average='weighted')

        # Track micro precison, recall, f1
        m_precision += precision_score(y_test, y_pred, average='micro')
        m_recall += recall_score(y_test, y_pred, average='micro')
        m_f1 += f1_score(y_test, y_pred, average='micro')

    model.save(mode_save_path)
    get_training_stats(y_test, y_pred, precision, recall, f1, m_precision, m_recall, m_f1)

In [None]:
# Create and train LSTM model
model = lstm_model(data.shape[1], EMBEDDING_DIM)
train_LSTM(X_train, y_train, model, EMBEDDING_DIM, W)


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 25)            757175    
_________________________________________________________________
dropout (Dropout)            (None, 30, 25)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 50)                15200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 3)                 153       
_________________________________________________________________
activation (Activation)      (None, 3)                 0         
Total params: 772,528
Trainable params: 772,528
Non-trainable params: 0
__________________________________________________

## Load Model 

In [None]:
model = load_model(mode_save_path)

## Test model

In [None]:
# Print important testing statistics
def get_testing_stats(y_test, y_pred):
    print("Confusion matrix...")
    print(confusion_matrix(y_test, y_pred))
    print()
    print("Classification report...")
    print(classification_report(y_test, y_pred))
    print()
    print("Accuracy = ", accuracy_score(y_test, y_pred)*100)
    print()
    print("Micro scores...")
    print(precision_recall_fscore_support(y_test, y_pred, average="micro"))
    print()
    print("Micro scores per class")
    print(precision_recall_fscore_support(y_test, y_pred))


In [None]:
def test_model(X_testing, y_testing):
  y_pred = model.predict(X_testing)
  y_pred = np.argmax(y_pred, axis=1)
  get_testing_stats(y_testing, y_pred)


In [None]:
# Test the model
test_model(X_testing, y_testing)


Confusion matrix...
[[  80  124   23]
 [  67 2711  109]
 [   7   86 1748]]

Classification report...
              precision    recall  f1-score   support

           0       0.52      0.35      0.42       227
           1       0.93      0.94      0.93      2887
           2       0.93      0.95      0.94      1841

    accuracy                           0.92      4955
   macro avg       0.79      0.75      0.76      4955
weighted avg       0.91      0.92      0.91      4955


Accuracy =  91.60443995963674

Micro scores...
(0.9160443995963673, 0.9160443995963673, 0.9160443995963673, None)

Micro scores per class
(array([0.51948052, 0.92810681, 0.92978723]), array([0.35242291, 0.93903706, 0.94948398]), array([0.41994751, 0.93353994, 0.93953238]), array([ 227, 2887, 1841]))


## Test single sentence

In [None]:
def test_sentence(text):
  res_map = {
          0: "Hateful",
          1:'Offensive',
          2:'Neutral'
        }
  words = tokenize(text.lower())
  
  seq = []
  for word in words:
      seq.append(vocab.get(word, vocab['UNK']))

  y_predicted = model.predict(np.array([seq]))
  print(y_predicted[0])
  y_predicted = np.argmax(y_predicted, axis=1)

  print(text + "  :  " + res_map[y_predicted[0]]) 

In [None]:
test_sentence("@ard That lady is a bitch!")

[7.5882780e-03 9.9208844e-01 3.2322816e-04]
@ard That lady is a bitch!  :  Offensive


In [None]:
test_sentence("@ard I hate those bastards. Burn in hell!!")

[8.3764172e-01 1.6214721e-01 2.1106601e-04]
@ard I hate those bastards. Burn in hell!!  :  Hateful


In [None]:
test_sentence("@ard He used to be very studious!!")

[0.03756842 0.07198482 0.8904468 ]
@ard He used to be very studious!!  :  Neutral


## Baseline Models

In [None]:
def gen_data():
    y_map = {
            '0': 0,
            '1': 1,
            '2': 2
            }

    X, y = [], []
    for tweet in tweets[:10]:
        words = tokenize(tweet['text'].lower())
        emb = np.zeros(EMBEDDING_DIM)
        print(emb)
        for word in words:
            try:
                emb += word2vec_model[word]
            except:
                pass
        emb /= len(words)
        X.append(emb)
        y.append(y_map[tweet['label']])
    return X, y

In [None]:
X, y = gen_data()

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]


In [None]:
list_of_texts = [tweet['text'] for tweet in tweets]
list_of_labels = [tweet['label'] for tweet in tweets]


base_x_train, base_x_test, base_y_train, base_y_test = train_test_split(list_of_texts, list_of_labels, train_size=0.85, test_size=0.15, random_state=SEED)




In [None]:
def get_vector(X):
  vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = tokenize,    \
                             preprocessor = None, \
                             stop_words = None,
                             max_features = 300
                             ) 
                             
  vectorizer.fit(X)
  vector = vectorizer.transform(X)
  return vector

In [None]:
def test_baseline_bow(model):
  
  train_data_features=get_vector(base_x_train).toarray()
  model.fit(train_data_features, base_y_train)

  test_data_features=get_vector(base_x_test).toarray()
  result = model.predict(test_data_features)

  get_testing_stats(base_y_test, result)


### Random Forest Classifier

In [None]:
model = RandomForestClassifier(n_estimators = 300)
test_baseline_bow(model)




Confusion matrix...
[[   6   87  114]
 [  41 2103  726]
 [  25  383 1470]]

Classification report...
              precision    recall  f1-score   support

           0       0.08      0.03      0.04       207
           1       0.82      0.73      0.77      2870
           2       0.64      0.78      0.70      1878

    accuracy                           0.72      4955
   macro avg       0.51      0.51      0.51      4955
weighted avg       0.72      0.72      0.72      4955


Accuracy =  72.23007063572149

Micro scores...
(0.7223007063572149, 0.7223007063572149, 0.7223007063572149, None)

Micro scores per class
(array([0.08333333, 0.81733385, 0.63636364]), array([0.02898551, 0.73275261, 0.7827476 ]), array([0.04301075, 0.77273562, 0.70200573]), array([ 207, 2870, 1878]))


### Logistic Regression

In [None]:
model = LogisticRegression()
test_baseline_bow(model)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Confusion matrix...
[[   7   63  137]
 [  22 2007  841]
 [   7  250 1621]]

Classification report...
              precision    recall  f1-score   support

           0       0.19      0.03      0.06       207
           1       0.87      0.70      0.77      2870
           2       0.62      0.86      0.72      1878

    accuracy                           0.73      4955
   macro avg       0.56      0.53      0.52      4955
weighted avg       0.75      0.73      0.72      4955


Accuracy =  73.36024217961655

Micro scores...
(0.7336024217961655, 0.7336024217961655, 0.7336024217961654, None)

Micro scores per class
(array([0.19444444, 0.86508621, 0.62370142]), array([0.03381643, 0.69930314, 0.86315229]), array([0.05761317, 0.7734104 , 0.72414563]), array([ 207, 2870, 1878]))


### SVM

In [None]:
model = SVC()
test_baseline_bow(model)



Confusion matrix...
[[   0   74  133]
 [   0 2110  760]
 [   0  295 1583]]

Classification report...
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       207
           1       0.85      0.74      0.79      2870
           2       0.64      0.84      0.73      1878

    accuracy                           0.75      4955
   macro avg       0.50      0.53      0.51      4955
weighted avg       0.74      0.75      0.73      4955


Accuracy =  74.53077699293644

Micro scores...
(0.7453077699293643, 0.7453077699293643, 0.7453077699293643, None)

Micro scores per class
(array([0.        , 0.85114966, 0.63933764]), array([0.        , 0.73519164, 0.842918  ]), array([0.        , 0.78893251, 0.72714745]), array([ 207, 2870, 1878]))


  _warn_prf(average, modifier, msg_start, len(result))


### Linear SVC

In [None]:
model = LinearSVC()
test_baseline_bow(model)



Confusion matrix...
[[   8   51  148]
 [  20 1987  863]
 [  15  186 1677]]

Classification report...
              precision    recall  f1-score   support

           0       0.19      0.04      0.06       207
           1       0.89      0.69      0.78      2870
           2       0.62      0.89      0.73      1878

    accuracy                           0.74      4955
   macro avg       0.57      0.54      0.53      4955
weighted avg       0.76      0.74      0.73      4955


Accuracy =  74.10696266397578

Micro scores...
(0.7410696266397578, 0.7410696266397578, 0.7410696266397577, None)

Weighted scores
(0.7617204477146297, 0.7410696266397578, 0.7329433708464437, None)


### Gradient Boosting

In [45]:
model = LinearSVC()
test_baseline_bow(model)



Confusion matrix...
[[   8   51  148]
 [  20 1987  863]
 [  15  186 1677]]

Classification report...
              precision    recall  f1-score   support

           0       0.19      0.04      0.06       207
           1       0.89      0.69      0.78      2870
           2       0.62      0.89      0.73      1878

    accuracy                           0.74      4955
   macro avg       0.57      0.54      0.53      4955
weighted avg       0.76      0.74      0.73      4955


Accuracy =  74.10696266397578

Micro scores...
(0.7410696266397578, 0.7410696266397578, 0.7410696266397577, None)

Micro scores per class
(array([0.18604651, 0.89343525, 0.62388393]), array([0.03864734, 0.69233449, 0.89297125]), array([0.064     , 0.78013349, 0.73455979]), array([ 207, 2870, 1878]))
