# Davidson Dataset Hate Speech Detection

## Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, log_loss
from sklearn.utils import class_weight
from sklearn.preprocessing import StandardScaler

import tensorflow as tf

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wnl = WordNetLemmatizer()

import re
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alkakumari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alkakumari/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
david_df = pd.read_csv('data/davidson.csv')
david_df.describe()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class
count,24783.0,24783.0,24783.0,24783.0,24783.0,24783.0
mean,12681.192027,3.243473,0.280515,2.413711,0.549247,1.110277
std,7299.553863,0.88306,0.631851,1.399459,1.113299,0.462089
min,0.0,3.0,0.0,0.0,0.0,0.0
25%,6372.5,3.0,0.0,2.0,0.0,1.0
50%,12703.0,3.0,0.0,3.0,0.0,1.0
75%,18995.5,3.0,0.0,3.0,0.0,1.0
max,25296.0,9.0,7.0,9.0,9.0,2.0


Data definition:

count = number of CrowdFlower users who coded each tweet (min is 3, sometimes more users coded a tweet when judgments were determined to be unreliable by CF).

hate_speech = number of CF users who judged the tweet to be hate speech.

offensive_language = number of CF users who judged the tweet to be offensive.

neither = number of CF users who judged the tweet to be neither offensive nor non-offensive.

class = class label for majority of CF users. 0 - hate speech 1 - offensive language 2 - neither

In [3]:
david_df.isnull().sum()

Unnamed: 0            0
count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
dtype: int64

There are no null values in the dataset

## Preprocess

In [4]:
stopwords = stopwords.words('english')
stopwords.append('&amp;') # &amp; means and
def clean(df):
    df['tweet'] = df['tweet'].apply(lambda x: x.lower())
    df['tweet'] = df['tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
    df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'@[A-Za-z0-9]*', 'MENTION', x))
    df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', 'URL', x))
    df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
    df['tweet'] = df['tweet'].apply(lambda x: ' '.join([wnl.lemmatize(word, pos='a') for word in x.split()])) #adjectives
    df['tweet'] = df['tweet'].apply(lambda x: ' '.join([wnl.lemmatize(word, pos='v') for word in x.split()])) #verbs
    df['tweet'] = df['tweet'].apply(lambda x: ' '.join([wnl.lemmatize(word, pos='n') for word in x.split()])) #noun
    return df

In [5]:
def tokenize(df):
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(df['tweet'])
    df['tweet'] = tokenizer.texts_to_sequences(df['tweet'])
    vocab_size = len(tokenizer.word_index) + 1
    return df, vocab_size

In [6]:
def preprocess(df):
    df = clean(df)
    df, vocab_size = tokenize(df)
    return df, vocab_size

In [7]:
preprocessed_df, vocab_size = preprocess(david_df)
preprocessed_df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,"[3, 1, 110, 620, 550, 228, 39, 92, 43, 18, 347]"
1,1,3,0,3,0,1,"[3, 1, 100, 79, 8829, 6079, 33, 2146, 79, 4, 7..."
2,2,3,0,3,0,1,"[3, 1, 683, 3, 1, 111, 8, 2, 125, 259, 790, 15]"
3,3,3,0,2,1,1,"[3, 1, 602, 4778, 1, 3411, 24, 7, 592]"
4,4,6,0,6,0,1,"[3, 1, 15, 208, 254, 448, 254, 4779, 2, 41, 47..."


In [8]:
X = preprocessed_df.drop(columns=['class'])
Y = preprocessed_df['class']
#X_train_text, X_test_text, X_train_add, X_test_add, y_train, y_test = train_test_split(
    #tweet_padded, additional_features_scaled, df['class'], test_size=0.2, random_state=42)
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=54)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.125, random_state=54) # 0.125 x 0.8 = 0.1
print(X_train.shape, X_val.shape, X_test.shape)

(21685, 6) (4957, 6) (3098, 6)


In [9]:
train_padded_tweet = tf.keras.preprocessing.sequence.pad_sequences(X_train['tweet'].tolist(), padding='post', maxlen=50)
val_padded_tweet = tf.keras.preprocessing.sequence.pad_sequences(X_val['tweet'].tolist(), padding='post', maxlen=50)
test_padded_tweet = tf.keras.preprocessing.sequence.pad_sequences(X_test['tweet'].tolist(), padding='post', maxlen=50)

In [10]:
train_padded_tweet.shape

(21685, 50)

In [11]:
scaler = StandardScaler()
train_add_features = X_train[['count', 'hate_speech', 'offensive_language', 'neither']].values
val_add_features = X_val[['count', 'hate_speech', 'offensive_language', 'neither']].values
test_add_features = X_test[['count', 'hate_speech', 'offensive_language', 'neither']].values

train_add_features_scaled = scaler.fit(train_add_features).transform(train_add_features)
val_add_features_scaled = scaler.fit(val_add_features).transform(val_add_features)
test_add_features_scaled = scaler.fit(test_add_features).transform(test_add_features)

In [12]:
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(Y_train), y=Y_train)
print(class_weights)

[5.91516639 0.42954203 1.98853737]


In [13]:
print(vocab_size)

23565


In [14]:
print(type(train_padded_tweet), type(train_add_features_scaled))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [15]:
def model_LSTM(vocab_size, input_length=50):
    learning_rate = 0.0001
    Input = tf.keras.layers.Input
    Embedding = tf.keras.layers.Embedding
    LSTM = tf.keras.layers.LSTM
    Dense = tf.keras.layers.Dense
    Dropout = tf.keras.layers.Dropout
    Concatenate = tf.keras.layers.Concatenate
    Model = tf.keras.Model

    text_input = Input(shape=(input_length,), name='text_input')
    x = Embedding(input_dim=vocab_size+1, output_dim=256, input_length=input_length)(text_input)
    x = LSTM(128)(x)
    add_input = Input(shape=(4,), name='additional_input')
    x = Concatenate()([x, add_input])
    x = Dense(64, activation='relu')(x)
    output = Dense(1, activation='softmax')(x)
    model = Model(inputs=[text_input, add_input], outputs=output)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_logarithmic_error', metrics=['accuracy'])
    return model

In [16]:
def model_RNN(vocab_size, input_length=50):
    learning_rate = 0.0001
    Input = tf.keras.layers.Input
    Embedding = tf.keras.layers.Embedding
    Dense = tf.keras.layers.Dense
    Dropout = tf.keras.layers.Dropout
    Concatenate = tf.keras.layers.Concatenate
    Model = tf.keras.Model
    simpleRNN = tf.keras.layers.SimpleRNN

    text_input = Input(shape=(input_length,), name='text_input')
    x = Embedding(input_dim=vocab_size+1, output_dim=256, input_length=input_length)(text_input)
    x = simpleRNN(128)(x)
    add_input = Input(shape=(4,), name='additional_input')
    x = Concatenate()([x, add_input])
    x = Dense(64, activation='relu')(x)
    output = Dense(1, activation='softmax')(x)
    model = Model(inputs=[text_input, add_input], outputs=output)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_logarithmic_error', metrics=['accuracy'])
    return model

In [17]:
def model_GRU(vocab_size, input_length=50):
    learning_rate = 0.0001
    Input = tf.keras.layers.Input
    Embedding = tf.keras.layers.Embedding
    GRU = tf.keras.layers.GRU
    Dense = tf.keras.layers.Dense
    Dropout = tf.keras.layers.Dropout
    Concatenate = tf.keras.layers.Concatenate
    Model = tf.keras.Model

    text_input = Input(shape=(input_length,), name='text_input')
    x = Embedding(input_dim=vocab_size+1, output_dim=256, input_length=input_length)(text_input)
    x = GRU(128)(x)
    add_input = Input(shape=(4,), name='additional_input')
    x = Concatenate()([x, add_input])
    x = Dense(64, activation='relu')(x)
    output = Dense(1, activation='softmax')(x)
    model = Model(inputs=[text_input, add_input], outputs=output)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_logarithmic_error', metrics=['accuracy'])
    return model

In [18]:
def train(model, X_train, Y_train, X_val, Y_val, epochs=25, batch_size=128):
    model.fit(X_train, Y_train, 
          validation_data=(X_val, Y_val), 
          epochs=epochs, batch_size=64, 
          verbose=1)
    print("\n\n****************************\n\n")
    print("Model trained successfully")
    pred = model.predict(X_val, batch_size=batch_size, verbose=1, steps=None)
    print("Predictions: ", np.round(pred, decimals=2))
    print("Validation Accuracy: ", model.evaluate(X_val, Y_val, batch_size=batch_size, verbose=1))
    return model

In [19]:
X_train = [train_padded_tweet, train_add_features_scaled]
X_val = [val_padded_tweet, val_add_features_scaled]
m = model_LSTM(vocab_size, input_length=50)
model = train(m, X_train, Y_train, X_val, Y_val, epochs=5, batch_size=128)

Epoch 1/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 76ms/step - accuracy: 0.7726 - loss: 0.0562 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 2/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 75ms/step - accuracy: 0.7779 - loss: 0.0542 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 3/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 77ms/step - accuracy: 0.7804 - loss: 0.0537 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 4/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 74ms/step - accuracy: 0.7740 - loss: 0.0550 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 5/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 80ms/step - accuracy: 0.7729 - loss: 0.0549 - val_accuracy: 0.7678 - val_loss: 0.0579


****************************


Model trained successfully
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step
Predictions:  [[1.]
 [1.]
 [1.]
 ...
 [1.]
 [1.]
 [

In [20]:
X_train = [train_padded_tweet, train_add_features_scaled]
X_val = [val_padded_tweet, val_add_features_scaled]
m = model_RNN(vocab_size, input_length=50)
model = train(m, X_train, Y_train, X_val, Y_val, epochs=5, batch_size=128)

Epoch 1/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 45ms/step - accuracy: 0.7789 - loss: 0.0536 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 2/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 45ms/step - accuracy: 0.7778 - loss: 0.0542 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 3/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 43ms/step - accuracy: 0.7759 - loss: 0.0548 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 4/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 43ms/step - accuracy: 0.7779 - loss: 0.0542 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 5/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 43ms/step - accuracy: 0.7774 - loss: 0.0542 - val_accuracy: 0.7678 - val_loss: 0.0579


****************************


Model trained successfully
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step
Predictions:  [[1.]
 [1.]
 [1.]
 ...
 [1.]
 [1.]
 [

In [21]:
X_train = [train_padded_tweet, train_add_features_scaled]
X_val = [val_padded_tweet, val_add_features_scaled]
m = model_GRU(vocab_size, input_length=50)
model = train(m, X_train, Y_train, X_val, Y_val, epochs=5, batch_size=128)

Epoch 1/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 73ms/step - accuracy: 0.7771 - loss: 0.0548 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 2/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 74ms/step - accuracy: 0.7684 - loss: 0.0570 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 3/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 74ms/step - accuracy: 0.7721 - loss: 0.0547 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 4/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 78ms/step - accuracy: 0.7786 - loss: 0.0534 - val_accuracy: 0.7678 - val_loss: 0.0579
Epoch 5/5
[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 81ms/step - accuracy: 0.7781 - loss: 0.0537 - val_accuracy: 0.7678 - val_loss: 0.0579


****************************


Model trained successfully
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step
Predictions:  [[1.]
 [1.]
 [1.]
 ...
 [1.]
 [1.]
 [