In [None]:
# Upgrade the standalone `keras` package in the environment of the running kernel.
# NOTE(review): no version is pinned, so a later re-run may install a different
# release than the one that produced the outputs below. The imports in this
# notebook go through `tensorflow.keras` — confirm this upgrade is still required.
%pip install --upgrade keras

In [56]:
import pickle

import numpy as np
from sklearn.metrics import classification_report
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, BatchNormalization, Dropout, Masking
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import set_random_seed
from tensorflow.keras.utils import to_categorical


In [3]:
# Load the pickled dataset. The cells below index it by the string keys listed
# in REQUIRED_KEYS, so fail here with a clear message if any of them is absent
# rather than with a bare KeyError later on.
#
# SECURITY: `pickle.load` can execute arbitrary code embedded in the file.
# Only open pickles that come from a trusted source.
DATA_PATH = 'hatexplain_data.pickle'
REQUIRED_KEYS = ("train_documents", "test_documents", "y_train", "y_test")

with open(DATA_PATH, 'rb') as file:
    data = pickle.load(file)

missing_keys = [key for key in REQUIRED_KEYS if key not in data]
if missing_keys:
    raise KeyError(f"{DATA_PATH} is missing expected entries: {missing_keys}")

In [75]:
# Each stored document is a sequence of string tokens; collapse every one into
# a single space-separated string, which is the form the tokenizer is fed.
train_documents = list(map(" ".join, data["train_documents"]))
test_documents = list(map(" ".join, data["test_documents"]))

# Labels are taken from the pickle unchanged.
y_train, y_test = data["y_train"], data["y_test"]

In [76]:
# Corpus statistics over train + test, printed to help pick the vocabulary cap
# and padding length for the model. This cell deliberately uses its own
# `corpus_*` names so that re-running it can never replace the `tokenizer`,
# `sequences`, `vocab_size` or `max_length` that the modelling cells define.
all_documents = train_documents + test_documents

# No `num_words` cap here: every distinct token receives an id.
corpus_tokenizer = Tokenizer(oov_token="<OOV>")
corpus_tokenizer.fit_on_texts(all_documents)

# Longest document, measured in tokens (0 if the corpus is empty).
corpus_sequences = corpus_tokenizer.texts_to_sequences(all_documents)
corpus_max_length = max((len(seq) for seq in corpus_sequences), default=0)

# `word_index` already includes the OOV token; the extra +1 accounts for
# id 0, which the tokenizer never assigns and which is used for padding.
corpus_vocab_size = len(corpus_tokenizer.word_index) + 1

print(f"Vocabulary Size: {corpus_vocab_size}")
print(f"Maximum Document Length: {corpus_max_length}")

Vocabulary Size: 26804
Maximum Document Length: 88


<h1>Bi-LSTM Hate Speech Model</h1>

In [89]:
# Build the padded training matrix and the (untrained) Bi-LSTM classifier.
#
# Labels: the model is compiled with `sparse_categorical_crossentropy`, so
# `y_train` must hold integer class ids (the evaluation report further down
# lists classes 0, 1 and 2), NOT one-hot vectors.

# Parameters
seed = 42  # seeds the Python, NumPy and backend RNGs so a re-run is repeatable
vocab_size = 26800  # cap passed to the tokenizer as num_words; also the Embedding input size
embedding_dim = 300  # Size of the embedding vector
max_length = 70  # every document is padded / truncated to this many tokens
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"  # for out-of-vocabulary tokens
dropout_rate = 0.5
learning_rate = 0.0001

# Seed before any weights are created so initialisation, dropout and batch
# shuffling are reproducible when the notebook is run top to bottom.
set_random_seed(seed)

# Tokenize the documents. The tokenizer is fitted on the training text only,
# so test-set vocabulary does not leak into the model inputs.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_documents)
sequences = tokenizer.texts_to_sequences(train_documents)
X_train = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Build the model
# NOTE(review): sequences are post-padded with id 0, but the Embedding layer is
# not created with mask_zero=True, so both LSTMs also step over the padding.
# Enabling masking may help; verify the effect (and that no document is
# entirely padding) before changing it.
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    Dropout(dropout_rate),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(dropout_rate),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax')  # one probability per class
])

optimizer = Adam(learning_rate=learning_rate)

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])



In [78]:
# Sanity check on the padded training matrix: its overall shape, then one
# example row to confirm that padding zeros sit at the end of the sequence.
sample_row = 10000  # arbitrary row picked for inspection
print(X_train.shape)
print(X_train[sample_row])

(15383, 70)
[219  59  16 640 243   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]


In [90]:
# Validation run: train for 5 epochs while holding out 20% of the training
# rows, so the per-epoch log reports val_loss / val_accuracy.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 25ms/step - accuracy: 0.3965 - loss: 1.0891 - val_accuracy: 0.4053 - val_loss: 1.0755
Epoch 2/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.4403 - loss: 1.0528 - val_accuracy: 0.5678 - val_loss: 0.9218
Epoch 3/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.5896 - loss: 0.8817 - val_accuracy: 0.5964 - val_loss: 0.8457
Epoch 4/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - accuracy: 0.6453 - loss: 0.7745 - val_accuracy: 0.6139 - val_loss: 0.8215
Epoch 5/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.6745 - loss: 0.7243 - val_accuracy: 0.6220 - val_loss: 0.8314


<keras.src.callbacks.history.History at 0x7892c6f057e0>

In [83]:
# Fit on the complete training set (no validation hold-out).
# NOTE(review): `model` is not rebuilt between the previous fit cell and this
# one, so when the notebook is run top to bottom this call continues training
# from the weights the validation run left behind. Re-run the model-building
# cell first if a fresh 5-epoch fit is intended.
model.fit(x=X_train, y=y_train, batch_size=128, epochs=5)

Epoch 1/5
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 23ms/step - accuracy: 0.4061 - loss: 1.0863
Epoch 2/5
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 19ms/step - accuracy: 0.4939 - loss: 1.0024
Epoch 3/5
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.6141 - loss: 0.8182
Epoch 4/5
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.6600 - loss: 0.7402
Epoch 5/5
[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.6937 - loss: 0.6815


<keras.src.callbacks.history.History at 0x7892c03f2950>

In [84]:
# Encode the test documents with the tokenizer that was fitted on the training
# text, then pad / truncate them with the same settings used for X_train.
test_sequences = tokenizer.texts_to_sequences(test_documents)
X_test = pad_sequences(
    test_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [86]:
# Softmax output for every test document: one row per document, one column
# per class (the final layer is Dense(3, activation='softmax')).
pred = model.predict(x=X_test)

[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step


In [87]:
# Turn each row of class probabilities into the id of its most likely class.
y_pred = pred.argmax(axis=1)

In [88]:
# Per-class precision, recall and F1 of the predictions on the test set.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.72      0.73      0.72      1187
           1       0.70      0.64      0.67      1563
           2       0.43      0.47      0.45      1096

    accuracy                           0.62      3846
   macro avg       0.61      0.61      0.61      3846
weighted avg       0.63      0.62      0.62      3846

