# Davidson Dataset Hate Speech Detection

## Imports and data loading

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
tokenizer = tf.keras.preprocessing.text.Tokenizer()
Input = tf.keras.layers.Input
Embedding = tf.keras.layers.Embedding
Dense = tf.keras.layers.Dense
Concatenate = tf.keras.layers.Concatenate
Model = tf.keras.Model
simpleRNN = tf.keras.layers.SimpleRNN
LSTM = tf.keras.layers.LSTM
GRU = tf.keras.layers.GRU
dropout = tf.keras.layers.Dropout

import nltk
from nltk.corpus import stopwords
""" import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context """

nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wnl = WordNetLemmatizer()

import re
import numpy as np

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed
[nltk_data]     (_ssl.c:1007)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed
[nltk_data]     (_ssl.c:1007)>


In [9]:
david_df = pd.read_csv('data/davidson.csv')
david_df.describe()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class
count,24783.0,24783.0,24783.0,24783.0,24783.0,24783.0
mean,12681.192027,3.243473,0.280515,2.413711,0.549247,1.110277
std,7299.553863,0.88306,0.631851,1.399459,1.113299,0.462089
min,0.0,3.0,0.0,0.0,0.0,0.0
25%,6372.5,3.0,0.0,2.0,0.0,1.0
50%,12703.0,3.0,0.0,3.0,0.0,1.0
75%,18995.5,3.0,0.0,3.0,0.0,1.0
max,25296.0,9.0,7.0,9.0,9.0,2.0


In [10]:
david_df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


Data definition:

count = number of CrowdFlower users who coded each tweet (min is 3, sometimes more users coded a tweet when judgments were determined to be unreliable by CF).

hate_speech = number of CF users who judged the tweet to be hate speech.

offensive_language = number of CF users who judged the tweet to be offensive.

neither = number of CF users who judged the tweet to be neither offensive nor non-offensive.

class = class label for majority of CF users. 0 - hate speech 1 - offensive language 2 - neither

## EDA and Preprocess definitions

In [11]:
david_df.isnull().sum()

Unnamed: 0            0
count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
dtype: int64

There are no null values in the dataset

In [12]:
stopwords = stopwords.words('english')
stopwords.append('&amp;') # &amp; means and
def clean(df):
    df['tweet'] = df['tweet'].apply(lambda x: x.lower()) #lowercase
    df['tweet'] = df['tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)])) #stopwords removal
    df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'@[A-Za-z0-9]*', 'MENTION', x)) #replace all @mentions to 'MENTION'
    df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', 'URL', x)) #replace all urls to 'URL'
    df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'[^\w\s]', '', x)) #remove punctuation
    df['tweet'] = df['tweet'].apply(lambda x: ' '.join([wnl.lemmatize(word, pos='a') for word in x.split()])) #lemmatize on the basis of adjectives
    df['tweet'] = df['tweet'].apply(lambda x: ' '.join([wnl.lemmatize(word, pos='v') for word in x.split()])) #lemmatize on the basis of verbs
    df['tweet'] = df['tweet'].apply(lambda x: ' '.join([wnl.lemmatize(word, pos='n') for word in x.split()])) #lemmatize on the basis of noun
    return df

In [13]:
def tokenize(df):
    df['tweet'] = tokenizer.texts_to_sequences(df['tweet'])
    vocab_size = len(tokenizer.word_index) + 1
    return df, vocab_size
def preprocess(df):
    df = clean(df)
    df, vocab_size = tokenize(df)
    return df, vocab_size

## Splitting the dataset and applying preprocessing

1. Tokenizer is fit on the training data only, which is used to transform both the training and test data to maintain the integrity of the val set as truly unseen data. This avoids data leakage.

In [14]:
X = david_df.drop(columns=['class'])
Y = david_df['class']
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=54)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.125, random_state=54) # 0.125 x 0.8 = 0.1
tokenizer.fit_on_texts(X_train['tweet'])
print("Training Shape:", X_train.shape)
print("Validation Shape:", X_val.shape) 
print("Test Shape:", X_test.shape)

Training Shape: (21685, 6)
Validation Shape: (4957, 6)
Test Shape: (3098, 6)


In [15]:
X_train, vocab_size = preprocess(X_train)
X_val, vocab_size = preprocess(X_val)
X_test, vocab_size = preprocess(X_test)
X_train.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,tweet
20016,20459,3,1,2,0,"[4, 2305, 204, 12118, 122, 2]"
5306,5462,3,0,3,0,"[2305, 12119, 626, 2305, 42, 27, 59]"
19285,19714,3,1,2,0,"[4, 2305, 976, 221, 421, 750, 1431, 914, 880, 11]"
7747,7965,3,0,3,0,"[2, 306, 2197, 41, 82, 18, 17699]"
4108,4230,3,0,0,3,"[2305, 2825, 126, 805, 4752, 2366, 652, 9072, ..."


In [16]:
train_padded_tweet = tf.keras.preprocessing.sequence.pad_sequences(X_train['tweet'].tolist(), padding='post', maxlen=50)
val_padded_tweet = tf.keras.preprocessing.sequence.pad_sequences(X_val['tweet'].tolist(), padding='post', maxlen=50)
test_padded_tweet = tf.keras.preprocessing.sequence.pad_sequences(X_test['tweet'].tolist(), padding='post', maxlen=50)

In [17]:
scaler = StandardScaler()
train_add_features = X_train[['count', 'hate_speech', 'offensive_language', 'neither']].values
val_add_features = X_val[['count', 'hate_speech', 'offensive_language', 'neither']].values
test_add_features = X_test[['count', 'hate_speech', 'offensive_language', 'neither']].values

train_add_features_scaled = scaler.fit(train_add_features).transform(train_add_features)
val_add_features_scaled = scaler.fit(val_add_features).transform(val_add_features)
test_add_features_scaled = scaler.fit(test_add_features).transform(test_add_features)

In [18]:
X_train = [train_padded_tweet, train_add_features_scaled]
X_val = [val_padded_tweet, val_add_features_scaled]

In [19]:
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(Y_train), y=Y_train)
print(class_weights)

[5.91516639 0.42954203 1.98853737]


## Baseline Models: Simple_RNN, LSTM and GRU

In [20]:
MAX_SEQ_LEN = 30 #round up value of david_df['tweet'].apply(lambda x : len(x.split(' '))).quantile(0.95)
BATCH_SIZE = 128
learning_rate = 0.0001

In [21]:
def model_LSTM(vocab_size, input_length=50):
    text_input = Input(shape=(input_length,), name='text_input')
    x = Embedding(input_dim=vocab_size+1, output_dim=128, input_length=MAX_SEQ_LEN)(text_input)
    x = LSTM(64)(x)
    add_input = Input(shape=(4,), name='additional_input')
    x = Concatenate()([x, add_input])
    x = Dense(32, activation='relu')(x)
    x = dropout(0.5)(x)
    output = Dense(1, activation='softmax')(x)
    model = Model(inputs=[text_input, add_input], outputs=output)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_logarithmic_error', metrics=['accuracy'])
    return model

In [22]:
def model_RNN(vocab_size, input_length=50):
    text_input = Input(shape=(input_length,), name='text_input')
    x = Embedding(input_dim=vocab_size+1, output_dim=128, input_length=MAX_SEQ_LEN)(text_input)
    x = simpleRNN(128)(x)
    add_input = Input(shape=(4,), name='additional_input')
    x = Concatenate()([x, add_input])
    x = Dense(64, activation='relu')(x)
    output = Dense(1, activation='softmax')(x)
    model = Model(inputs=[text_input, add_input], outputs=output)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_logarithmic_error', metrics=['accuracy'])
    return model

In [23]:
def model_GRU(vocab_size, input_length=50):
    text_input = Input(shape=(input_length,), name='text_input')
    x = Embedding(input_dim=vocab_size+1, output_dim=128, input_length=MAX_SEQ_LEN)(text_input)
    x = GRU(128)(x)
    add_input = Input(shape=(4,), name='additional_input')
    x = Concatenate()([x, add_input])
    x = Dense(64, activation='relu')(x)
    output = Dense(1, activation='softmax')(x)
    model = Model(inputs=[text_input, add_input], outputs=output)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_logarithmic_error', metrics=['accuracy'])
    return model

In [24]:
def train(model, X_train, Y_train, X_val, Y_val, epochs=25, batch_size=BATCH_SIZE):
    model.fit(X_train, Y_train, 
          validation_data=(X_val, Y_val), 
          epochs=epochs, batch_size=64, 
          verbose=1)
    print("\n\n****************************\n\n")
    print("Model trained successfully")
    pred = model.predict(X_val, batch_size=batch_size, verbose=1, steps=None)
    print("Predictions: ", np.round(pred, decimals=2))
    print("Validation Accuracy: ", model.evaluate(X_val, Y_val, batch_size=batch_size, verbose=1))
    return model

In [25]:
m = model_LSTM(vocab_size)
model = train(m, X_train, Y_train, X_val, Y_val, epochs=5, batch_size=128)

Epoch 1/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 42ms/step - accuracy: 0.7738 - loss: 0.0546 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 2/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 41ms/step - accuracy: 0.7744 - loss: 0.0548 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 3/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 38ms/step - accuracy: 0.7774 - loss: 0.0543 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 4/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 39ms/step - accuracy: 0.7770 - loss: 0.0534 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 5/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 38ms/step - accuracy: 0.7745 - loss: 0.0552 - val_accuracy: 0.7678 - val_loss: 0.0579


****************************


Model trained successfully
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step
Predictions:  [[1.]
 [1.]
 [1.]
 ...
 [1.]
 [1.]
 [

In [26]:
m = model_RNN(vocab_size, input_length=50)
model = train(m, X_train, Y_train, X_val, Y_val, epochs=5, batch_size=128)

Epoch 1/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 33ms/step - accuracy: 0.7793 - loss: 0.0537 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 2/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 32ms/step - accuracy: 0.7765 - loss: 0.0549 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 3/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 30ms/step - accuracy: 0.7754 - loss: 0.0550 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 4/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.7808 - loss: 0.0538 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 5/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 31ms/step - accuracy: 0.7759 - loss: 0.0550 - val_accuracy: 0.7678 - val_loss: 0.0579


****************************


Model trained successfully
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
Predictions:  [[1.]
 [1.]
 [1.]
 ...
 [1.]
 [1.]
 [

In [27]:
m = model_GRU(vocab_size, input_length=50)
model = train(m, X_train, Y_train, X_val, Y_val, epochs=5, batch_size=128)

Epoch 1/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 56ms/step - accuracy: 0.7740 - loss: 0.0552 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 2/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 55ms/step - accuracy: 0.7738 - loss: 0.0562 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 3/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 55ms/step - accuracy: 0.7771 - loss: 0.0549 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 4/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 54ms/step - accuracy: 0.7797 - loss: 0.0533 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 5/5
[1m 79/339[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m14s[0m 56ms/step - accuracy: 0.7762 - loss: 0.0557

KeyboardInterrupt: 