# Davidson Dataset Hate Speech Detection

## Imports and data loading

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

import tensorflow as tf
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
tokenizer = tf.keras.preprocessing.text.Tokenizer()
Input = tf.keras.layers.Input
Embedding = tf.keras.layers.Embedding
Dense = tf.keras.layers.Dense
Concatenate = tf.keras.layers.Concatenate
Model = tf.keras.Model
simpleRNN = tf.keras.layers.SimpleRNN
LSTM = tf.keras.layers.LSTM
GRU = tf.keras.layers.GRU
dropout = tf.keras.layers.Dropout

import nltk
from nltk.corpus import stopwords
""" import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:y

    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context """

nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wnl = WordNetLemmatizer()

import re
import numpy as np

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, roc_curve


Data definition:

count = number of CrowdFlower users who coded each tweet (min is 3, sometimes more users coded a tweet when judgments were determined to be unreliable by CF).

hate_speech = number of CF users who judged the tweet to be hate speech.

offensive_language = number of CF users who judged the tweet to be offensive.

neither = number of CF users who judged the tweet to be neither hate speech nor offensive language.

class = class label assigned by the majority of CF users: 0 = hate speech, 1 = offensive language, 2 = neither.

In [None]:
# Load the Davidson CSV from the relative data/ folder and preview the first rows.
david_df = pd.read_csv('data/davidson.csv')
david_df.head()

## EDA and Preprocess definitions

There are no null values in the dataset

In [None]:
# Count missing values per column.
david_df.isnull().sum()

In [None]:
# Read the stop words through nltk.corpus so that re-running this cell still works
# after the name `stopwords` has been rebound from the corpus reader to a list.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('&amp;') # &amp; means and

# Pre-computed helpers so the per-tweet work does not rebuild them on every call.
_STOPWORD_SET = frozenset(stopwords)
_MENTION_RE = re.compile(r'@\w*')  # \w includes "_" so handles like @some_user are fully replaced
_URL_RE = re.compile(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
_NON_WORD_RE = re.compile(r'[^\w\s]')
_PUNCT_CHARS = frozenset('!"$%&\'()*+,-./:;<=>?[\\]^_{|}~')
_RT_RE = re.compile(r'\brt\b', flags=re.IGNORECASE)  # text is lower-cased first, so match case-insensitively


def _clean_tweet(text):
    """Normalise a single tweet string; see ``clean`` for the list of steps."""
    text = text.lower()
    text = ' '.join(word for word in text.split() if word not in _STOPWORD_SET)
    text = _MENTION_RE.sub('MENTION', text)
    text = _URL_RE.sub('URL', text)
    text = _NON_WORD_RE.sub('', text)
    # The regex above keeps "_" (it is a word character); this pass removes it.
    text = ''.join(ch for ch in text if ch not in _PUNCT_CHARS)
    # Lemmatize in three full passes: adjectives, then verbs, then nouns.
    for pos in ('a', 'v', 'n'):
        text = ' '.join(wnl.lemmatize(word, pos=pos) for word in text.split())
    return _RT_RE.sub('', text)


def clean(df):
    """Return a copy of ``df`` whose 'tweet' column has been normalised.

    Steps applied to every tweet, in order: lower-casing, stop-word removal,
    replacing @mentions with 'MENTION', replacing URLs with 'URL', removing
    punctuation (including underscores), lemmatizing as adjective/verb/noun,
    and removing the retweet marker "rt".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named 'tweet'.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is left unmodified.
    """
    df = df.copy()  # avoid mutating (a slice of) the caller's data
    df['tweet'] = df['tweet'].apply(_clean_tweet)
    return df

In [None]:
MAX_SEQ_LEN = 25
def tokenize(df):
    """Convert the 'tweet' column of ``df`` into a padded integer matrix.

    Uses the module-level ``tokenizer`` (which must already be fitted) and pads
    every sequence at the end ('post') to ``MAX_SEQ_LEN`` entries.

    The input frame is not modified, so the function can safely be called more
    than once on the same data.

    Returns
    -------
    tuple
        ``(padded_tweet, vocab_size)`` where ``padded_tweet`` is the result of
        ``pad_sequences`` and ``vocab_size`` is ``len(tokenizer.word_index) + 1``.
    """
    sequences = tokenizer.texts_to_sequences(df['tweet'])
    # +1 leaves room for index 0, which the padding uses.
    vocab_size = len(tokenizer.word_index) + 1
    padded_tweet = pad_sequences(sequences, padding='post', maxlen=MAX_SEQ_LEN)
    return padded_tweet, vocab_size

def preprocess(df):
    """Run ``clean`` followed by ``tokenize`` on ``df``.

    Returns the ``(padded sequences, vocab_size)`` pair produced by ``tokenize``.
    """
    padded_sequences, vocab_size = tokenize(clean(df))
    return padded_sequences, vocab_size

## Splitting the dataset and calling preprocessing functions

1. The tokenizer is fit on the training data only and is then used to transform both the training and the validation data, so the validation set remains truly unseen. This avoids data leakage.

In [None]:
# Features are every column except the label; the label is the majority-vote class.
X = david_df.drop(columns=['class'])
Y = david_df['class']
# Stratify on the label so the (imbalanced) class proportions are preserved in
# both the training and the validation split.
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, test_size=0.2, random_state=54, stratify=Y
)
print("Training Shape:", X_train.shape)
print("Validation Shape:", X_val.shape)

Even after splitting, the class distribution, although imbalanced, remains more or less consistent across the original, training and validation sets, so the split is representative.

In [None]:
# Pie charts of the class distribution in the full data, the training split and
# the validation split. The figure is created first so the super-title is
# attached to it (calling plt.suptitle before plt.figure titles a stray figure).
class_names = {0: 'Hate', 1: 'Offensive', 2: 'Neither'}
explode = (0, 0, 0.1)  # pull out the third (least frequent) slice
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('Class Distribution')
panels = [
    ('Original', david_df['class']),
    ('Training', Y_train),
    ('Validation', Y_val),
]
for ax, (panel_title, labels) in zip(axes, panels):
    # Map to readable names on the fly instead of adding/dropping a helper
    # column on david_df.
    labels.map(class_names).value_counts().plot(
        kind='pie', ax=ax, title=panel_title, colormap='Accent',
        autopct='%1.1f%%', explode=explode,
    )
    ax.set_ylabel('')
plt.show()

In [None]:
# Clean first, then fit the tokenizer on the *cleaned* training tweets only, so
# the vocabulary matches the text that is actually converted to sequences
# (lemmatized words and the MENTION / URL placeholders). Fitting on the raw
# tweets would build a vocabulary from text the models never see.
X_train_clean = clean(X_train)
X_val_clean = clean(X_val)
tokenizer.fit_on_texts(X_train_clean['tweet'])
X_train, vocab_size = tokenize(X_train_clean)
X_val, _ = tokenize(X_val_clean)

In [None]:
# import os
# import requests
# import zipfile

# # Define the URL for the GloVe embeddings
# glove_url = "http://nlp.stanford.edu/data/glove.twitter.27B.zip"
# glove_zip_file = "glove.twitter.27B.zip"

# # Download the embeddings
# response = requests.get(glove_url, stream=True)
# with open(glove_zip_file, "wb") as file:
#     for chunk in response.iter_content(chunk_size=128):
#         file.write(chunk)

# # Extract the embeddings
# with zipfile.ZipFile(glove_zip_file, "r") as zip_ref:
#     zip_ref.extractall("glove_embeddings")


In [None]:
# Load the 200-dimensional GloVe Twitter vectors and build the embedding matrix
# whose row i holds the vector of the word with tokenizer index i.
word_index = tokenizer.word_index
embedding_dim = 200
embedding_index = {}
with open('glove_embeddings/glove.twitter.27B.200d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Split from the right on single spaces: the last `embedding_dim` fields
        # are the coefficients and everything before them is the token. A plain
        # line.split() would drop or break tokens that are/contain whitespace
        # and shift the coefficients by one position.
        fields = line.rstrip().rsplit(' ', embedding_dim)
        if len(fields) != embedding_dim + 1:
            continue  # malformed line: skip rather than store a wrong-sized vector
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')
# Create embedding matrix; words without a GloVe vector keep an all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# 'balanced' class weights computed from the training labels, stored as a
# {position: weight} dict that train() passes to model.fit.
train_classes = np.unique(Y_train)
balanced_weights = class_weight.compute_class_weight(
    'balanced',
    classes=train_classes,
    y=Y_train.astype(int),
)
class_weights = {position: weight for position, weight in enumerate(balanced_weights)}
# One-hot encode the labels for both splits (three classes).
Y_train_encoded = tf.keras.utils.to_categorical(Y_train, num_classes=3)
Y_val_encoded = tf.keras.utils.to_categorical(Y_val, num_classes=3)

## Baseline Models: Simple_RNN, LSTM and GRU

### Models

In [None]:
# Notebook-level learning rate; model_RNN reads this global when building its optimizer.
learning_rate = 0.00003

In [None]:
def model_LSTM(vocab_size, input_length=MAX_SEQ_LEN, learning_rate=0.001):
    """Build and compile the stacked-LSTM baseline classifier.

    Architecture: trainable 128-d embedding -> LSTM(500, full sequence) ->
    Dropout(0.8) -> LSTM(300) -> Dropout(0.5) -> 3-way softmax.

    Parameters
    ----------
    vocab_size : int
        Vocabulary size; the embedding table gets ``vocab_size + 1`` rows.
    input_length : int
        Length of the (padded) input sequences.
    learning_rate : float
        Adam learning rate. Defaults to 0.001, the value that was previously
        hard-coded, so existing calls behave the same.

    Returns
    -------
    tf.keras.Model
        Model compiled with categorical cross-entropy and an accuracy metric.
    """
    text_input = Input(shape=(input_length,), name='text_input')
    x = Embedding(input_dim=vocab_size+1, output_dim=128, input_length=input_length)(text_input)
    x = LSTM(500, return_sequences=True)(x)  # keep the full sequence for the second LSTM
    x = dropout(0.8)(x)
    x = LSTM(300)(x)
    x = dropout(0.5)(x)
    output = Dense(3, activation='softmax')(x)
    model = Model(inputs=[text_input], outputs=output)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [None]:
def model_RNN(vocab_size, input_length=MAX_SEQ_LEN):
    """Build and compile the SimpleRNN baseline classifier.

    Architecture: trainable 128-d embedding -> SimpleRNN(500) -> Dropout(0.8)
    -> 3-way softmax. The Adam learning rate is read from the notebook-level
    ``learning_rate`` variable at call time.

    Parameters
    ----------
    vocab_size : int
        Vocabulary size; the embedding table gets ``vocab_size + 1`` rows.
    input_length : int
        Length of the (padded) input sequences.

    Returns
    -------
    tf.keras.Model
        Model compiled with categorical cross-entropy and an accuracy metric.
    """
    text_input = Input(shape=(input_length,), name='text_input')
    # Use the input_length argument (not the global MAX_SEQ_LEN) so the
    # embedding always agrees with the Input layer above.
    x = Embedding(input_dim=vocab_size+1, output_dim=128, input_length=input_length)(text_input)
    x = simpleRNN(500)(x)
    x = dropout(0.8)(x)
    output = Dense(3, activation='softmax')(x)
    model = Model(inputs=[text_input], outputs=output)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
def model_GRU(vocab_size, input_length=MAX_SEQ_LEN, learning_rate=0.00001):
    """Build and compile the GRU baseline classifier.

    Architecture: trainable 128-d embedding -> GRU(500) -> Dropout(0.8) ->
    3-way softmax.

    Parameters
    ----------
    vocab_size : int
        Vocabulary size; the embedding table gets ``vocab_size + 1`` rows.
    input_length : int
        Length of the (padded) input sequences.
    learning_rate : float
        Adam learning rate. Defaults to 0.00001, the value previously fixed
        inside the function, so existing calls behave the same.

    Returns
    -------
    tf.keras.Model
        Model compiled with categorical cross-entropy and an accuracy metric.
    """
    text_input = Input(shape=(input_length,), name='text_input')
    # Use the input_length argument (not the global MAX_SEQ_LEN) so the
    # embedding always agrees with the Input layer above.
    x = Embedding(input_dim=vocab_size+1, output_dim=128, input_length=input_length)(text_input)
    x = GRU(500)(x)
    x = dropout(0.8)(x)
    output = Dense(3, activation='softmax')(x)
    model = Model(inputs=[text_input], outputs=output)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [None]:
def novel_model(vocab_size, input_length=MAX_SEQ_LEN, learning_rate=0.00003):
    """Build and compile the hybrid BiGRU / BiLSTM / SimpleRNN classifier.

    The embedding layer is initialised from the module-level
    ``embedding_matrix`` and frozen (``trainable=False``). Every recurrent or
    dense stage before the head is followed by Dropout(0.4).

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding table (passed as ``input_dim``).
    input_length : int
        Length of the (padded) input sequences.
    learning_rate : float
        Learning rate for the Adam optimizer.

    Returns
    -------
    tf.keras.Sequential
        Model compiled with categorical cross-entropy and an accuracy metric.
    """
    Bidirectional = tf.keras.layers.Bidirectional
    layer_stack = [
        # Frozen 200-d embedding seeded with the pre-built embedding matrix.
        Embedding(input_dim=vocab_size,
                  output_dim=200,
                  weights=[embedding_matrix],
                  input_length=input_length,
                  trainable=False),
        # Bidirectional GRU, 40 units per direction, returns the full sequence.
        Bidirectional(GRU(40, activation='relu', return_sequences=True, name='BiDirectionalGRUlayer')),
        dropout(0.4),
        # Bidirectional LSTM, 20 units per direction, returns the full sequence.
        Bidirectional(LSTM(20, recurrent_activation='relu', return_sequences=True)),
        dropout(0.4),
        # Dense layer with 15 units; no activation argument is given.
        Dense(15),
        dropout(0.4),
        # SimpleRNN with 10 units reduces the sequence to a single vector.
        simpleRNN(10),
        dropout(0.4),
        # Dense layer with 10 units; no activation argument is given.
        Dense(10),
        # Output layer: 3 units with softmax.
        Dense(3, activation='softmax'),
    ]
    model = tf.keras.Sequential(layer_stack)
    adam = tf.keras.optimizers.Adam(learning_rate=learning_rate, use_ema=True, ema_momentum=0.9)
    model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

### Train and Test definitions

In [None]:
def train(model, X_train, Y_train, X_val, Y_val, epochs=100, batch_size=64):
    """Fit ``model``, report validation metrics and plot a ROC curve.

    Training uses the module-level ``class_weights`` and early stopping on
    ``val_loss`` (patience 6, best weights restored). After fitting, the
    function prints weighted precision/recall/F1, the AUC and the result of
    ``model.evaluate`` on the validation data, then shows a ROC plot.

    Parameters
    ----------
    model : compiled Keras model
    X_train, X_val : model inputs for training / validation
    Y_train, Y_val : one-hot encoded labels (``Y_val`` is indexed as a 2-D array)
    epochs : int
        Maximum number of epochs.
    batch_size : int
        Batch size used for fit, predict and evaluate.

    Returns
    -------
    The history object returned by ``model.fit``.
    """
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=6,
                                                      restore_best_weights=True)
    history = model.fit(X_train, Y_train,
                        validation_data=(X_val, Y_val),
                        epochs=epochs, batch_size=batch_size,
                        class_weight=class_weights,
                        callbacks=[early_stopping],  # fit() documents a list of callbacks
                        verbose=2)
    print("\n\n****************************\n\n")
    print("Model trained successfully")
    pred = model.predict(X_val, batch_size=batch_size, verbose=1, steps=None)
    y_true = np.argmax(Y_val, axis=1)
    y_pred = np.argmax(pred, axis=1)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')
    # Calculate AUC and the points of the ROC curve
    if Y_val.shape[1] == 2:  # Binary classification case
        auc = roc_auc_score(Y_val[:, 1], pred[:, 1])
        fpr, tpr, _ = roc_curve(Y_val[:, 1], pred[:, 1])
        curve_label = f'ROC curve (AUC = {auc:.2f})'
    else:  # Multi-class case (one-vs-rest approach)
        auc = roc_auc_score(Y_val, pred, multi_class='ovr', average='weighted')
        # Flattening the one-hot labels and scores gives the micro-averaged
        # curve, so the legend shows the AUC of that curve rather than the
        # weighted one-vs-rest value printed below.
        fpr, tpr, _ = roc_curve(Y_val.ravel(), pred.ravel())
        micro_auc = roc_auc_score(Y_val.ravel(), pred.ravel())
        curve_label = f'Micro-average ROC curve (AUC = {micro_auc:.2f})'

    print("AUC: ", auc)
    print("Precision: ", precision, "Recall: ", recall, "F1 Score: ", f1)
    # evaluate() returns the loss together with the compiled metrics; print them
    # by name instead of labelling the whole list as "accuracy".
    val_metrics = model.evaluate(X_val, Y_val, batch_size=batch_size, verbose=1, return_dict=True)
    print("Validation metrics: ", val_metrics)

    # Plot ROC curve
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='blue', lw=2, label=curve_label)
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc='lower right')
    plt.show()

    return history

In [None]:
def evaluate_model(history):
    """Plot training vs. validation accuracy and loss per epoch.

    ``history`` must expose a ``history`` mapping with the keys 'accuracy',
    'val_accuracy', 'loss' and 'val_loss'. Two side-by-side line charts are
    drawn and shown; nothing is returned.
    """
    logs = history.history
    acc_train, acc_val = logs['accuracy'], logs['val_accuracy']
    loss_train, loss_val = logs['loss'], logs['val_loss']

    # (subplot position, training series, validation series,
    #  training label, validation label, title, y-axis label)
    panels = (
        (1, acc_train, acc_val, 'Training Accuracy', 'Validation Accuracy',
         'Training & validation accuracy over epochs', 'Accuracy'),
        (2, loss_train, loss_val, 'Training Loss', 'Validation Loss',
         'Training & validation loss over epochs', 'Loss'),
    )

    plt.figure(figsize=(20, 6))
    for position, train_series, val_series, train_label, val_label, title, y_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(range(len(train_series)), train_series, 'b-', linewidth=2, label=train_label)
        plt.plot(range(len(val_series)), val_series, 'r-', linewidth=2, label=val_label)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend(loc='best')

    plt.show()

### Training of BaseLine Models

In [None]:
# Build the LSTM baseline and train it for up to 100 epochs with batch size 512.
m = model_LSTM(vocab_size)
history_LSTM = train(m, X_train, Y_train_encoded, X_val, Y_val_encoded, epochs=100, batch_size=512) 

In [None]:
# Accuracy and loss curves for the LSTM baseline run.
evaluate_model(history_LSTM)

In [None]:
# Build the SimpleRNN baseline and train it for up to 100 epochs with batch size 512.
m = model_RNN(vocab_size, input_length=MAX_SEQ_LEN)
history_RNN = train(m, X_train, Y_train_encoded, X_val, Y_val_encoded, epochs=100, batch_size=512) 

In [None]:
# Accuracy and loss curves for the SimpleRNN baseline run.
evaluate_model(history_RNN)

In [None]:
# Build the GRU baseline and train it for up to 100 epochs; note the smaller batch size (256).
m = model_GRU(vocab_size, input_length=MAX_SEQ_LEN)
history_GRU = train(m, X_train, Y_train_encoded, X_val, Y_val_encoded, epochs=100, batch_size=256) 

In [None]:
# Accuracy and loss curves for the GRU baseline run.
evaluate_model(history_GRU)

### Training of Novel Model

In [None]:
# Build the novel model and train it for up to 100 epochs with batch size 512.
m = novel_model(vocab_size, input_length=MAX_SEQ_LEN, learning_rate=0.00003)
history = train(m, X_train, Y_train_encoded, X_val, Y_val_encoded, epochs=100, batch_size=512)  

In [None]:
# Accuracy and loss curves for the novel model run.
evaluate_model(history)