### Step 1: Import necessary libraries

In [1]:
import os
import pickle
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

### Step 2: Download NLTK data

In [2]:
# Fetch the NLTK stop-word corpus that the cleaning step reads later on.
# The call's return value is the cell's last expression, so it is echoed as the cell output.
nltk.download(
    'stopwords'
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Step 3: Load dataset

In [10]:
# Location of the labelled-tweet CSV.
# The original notebook hard-coded a path on the author's E: drive, which only
# works on that machine. Set the HATE_SPEECH_DATA_PATH environment variable to
# point at the file elsewhere; when it is unset the original path is used, so
# existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    "HATE_SPEECH_DATA_PATH",
    "E:/STUDY/Projects/hatespeechclassification/labeled_data.csv",
)

# Raw dataset; later cells drop columns and clean the 'tweet' text.
df = pd.read_csv(DATA_PATH)

### Inspect columns

In [11]:
# Sanity-check the load: print the column index first, then a preview of the
# leading rows, in that order.
for summary in (df.columns, df.head()):
    print(summary)

Index(['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither',
       'class', 'tweet'],
      dtype='object')
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


### Drop unnecessary columns and preprocess the data

In [12]:
# Columns that are not used by the rest of the notebook.
columns_to_drop = ['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither']

# Only ask pandas to drop the candidates that are really present, so a missing
# column cannot make the drop fail.
present_columns = [name for name in columns_to_drop if name in df.columns]
df = df.drop(columns=present_columns)

# Guard clause: everything below needs the 'class' column, so stop here
# (message printed, then KeyError raised) when it is absent.
if 'class' not in df.columns:
    print("Error: 'class' column not found in the DataFrame.")
    raise KeyError("'class' column not found in the DataFrame")

# Collapse the three class codes into a binary target: 0 becomes 1, 2 becomes 0,
# and 1 is left as it is.
# NOTE(review): presumably 0 = hate speech, 1 = offensive, 2 = neither (going by
# the dropped column names) -- confirm against the dataset documentation.
binary_classes = df['class'].replace({0: 1, 2: 0})

# Store the binary codes and expose them under the name 'label'.
df = df.assign(**{'class': binary_classes}).rename(columns={'class': 'label'})

# Shared resources for the data_cleaning function defined below.
# Both are built once at module level so they are not recreated per tweet.

# English stop words held in a set, which is what the membership test in
# data_cleaning is run against.
stopword = set(stopwords.words('english'))

# Snowball stemmer configured for English.
stemmer = nltk.SnowballStemmer("english")

def data_cleaning(words):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Steps, in order:
      1. convert the input to ``str`` and lower-case it;
      2. delete square-bracketed spans, URLs (``http(s)://...`` / ``www. ...``)
         and angle-bracketed tags;
      3. delete every ``string.punctuation`` character and every newline;
      4. delete any word that contains a digit;
      5. split on single spaces, drop tokens found in the module-level
         ``stopword`` set, and stem the rest with the module-level ``stemmer``.

    Deleted spans are replaced by the empty string, not by a space, so text on
    either side of a removed punctuation mark or newline is joined together,
    and runs of spaces left behind by removals are preserved in the result.

    Args:
        words: The raw tweet. Any object is accepted because it is passed
            through ``str()`` first.

    Returns:
        The cleaned text as a single string joined with single spaces.
    """
    text = str(words).lower()

    # Raw-string patterns: the previous non-raw literals relied on invalid
    # escape sequences (\[, \S, \w, \d), which trigger a SyntaxWarning today
    # and are scheduled to become an error in a future Python release.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = text.replace('\n', '')
    text = re.sub(r'\w*\d\w*', '', text)

    # Filter and stem in one pass; the earlier join-then-split round trip
    # between these two steps produced exactly the same token list.
    kept = [token for token in text.split(' ') if token not in stopword]
    return " ".join(stemmer.stem(token) for token in kept)

# Run every tweet through data_cleaning and store the result back in the
# 'tweet' column.
df = df.assign(tweet=df['tweet'].apply(data_cleaning))

  words = re.sub('\[.*?\]', '', words)
  words = re.sub('https?://\S+|www\.\S+', '', words)
  words = re.sub('\w*\d\w*', '', words)


### Split data into training and testing sets

In [13]:
# Features are the cleaned tweet texts; the target is the binary label column.
x, y = df['tweet'], df['label']

# Fixed random_state so the same rows land in each split on every run.
# The split proportions are left at train_test_split's defaults.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
)

### Tokenize the data

In [14]:
# Vocabulary cap passed to the tokenizer, and the fixed length every sequence
# is padded to; both are reused by the model and evaluation cells.
max_words, max_len = 50000, 300

# The tokenizer is fitted on the training texts only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)

# Encode the training texts as integer sequences, then pad them to max_len.
sequences = tokenizer.texts_to_sequences(x_train)
sequences_matrix = pad_sequences(sequences, maxlen=max_len)

### Create the model

In [15]:
# Binary text classifier: embedding -> spatial dropout -> LSTM -> sigmoid unit.
# The layers are passed as a list, in the same order they were previously added.
model = Sequential([
    Embedding(max_words, 100, input_length=max_len),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

# Single sigmoid output paired with binary cross-entropy; accuracy is tracked
# as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer=RMSprop(),
    metrics=['accuracy'],
)



### Train the model

In [16]:
# Train for 5 epochs in batches of 128, with 20% of the training matrix set
# aside for validation. The returned object is kept in `history`.
history = model.fit(
    sequences_matrix,
    y_train,
    batch_size=128,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 613ms/step - accuracy: 0.8220 - loss: 0.4344 - val_accuracy: 0.9333 - val_loss: 0.1657
Epoch 2/5
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 650ms/step - accuracy: 0.9375 - loss: 0.1660 - val_accuracy: 0.9494 - val_loss: 0.1268
Epoch 3/5
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 694ms/step - accuracy: 0.9582 - loss: 0.1126 - val_accuracy: 0.9519 - val_loss: 0.1235
Epoch 4/5
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 721ms/step - accuracy: 0.9691 - loss: 0.0854 - val_accuracy: 0.9556 - val_loss: 0.1207
Epoch 5/5
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 713ms/step - accuracy: 0.9743 - loss: 0.0748 - val_accuracy: 0.9435 - val_loss: 0.1470


### Evaluate the model

In [17]:
# Encode the held-out texts with the tokenizer fitted on the training split and
# pad them to the same max_len the model was trained with.
test_sequences = tokenizer.texts_to_sequences(x_test)
test_sequences_matrix = pad_sequences(test_sequences, maxlen=max_len)

# Evaluate the compiled loss and metric on the test split.
accr = model.evaluate(
    test_sequences_matrix,
    y_test,
)

[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 47ms/step - accuracy: 0.9414 - loss: 0.1600


### Save the model and tokenizer

In [18]:
# Persist the two artefacts produced by this notebook: the trained network and
# the fitted tokenizer.
model.save("model.h5")

# The tokenizer is pickled with the highest available protocol.
# NOTE(review): a pickle file should only ever be loaded from a trusted source.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

