### Step 1: Install and import necessary libraries

In [1]:
!pip install numpy 
!pip install pandas 
!pip install matplotlib 
!pip install seaborn 
!pip install nltk 
!pip install scikit-learn 
!pip install tensorflow

Collecting numpy<2,>=1.26.0 (from pandas)
  Using cached numpy-1.26.4-cp312-cp312-win_amd64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-win_amd64.whl (15.5 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.1
    Uninstalling numpy-2.0.1:
      Successfully uninstalled numpy-2.0.1
Successfully installed numpy-1.26.4


  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 2.18.0 requires fsspec[http]<=2024.2.0,>=2023.1.0, but you have fsspec 2024.6.0 which is incompatible.




In [2]:
import os
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
import pickle
import string  

### Step 2: Download NLTK data

In [3]:
# Fetch the NLTK resources into the local nltk_data directory. Packages that
# are already present are reported as up-to-date and not downloaded again.
# The value of the last call is what the cell displays as its result.
nltk.download('stopwords')  # backs stopwords.words('english') used during cleaning
# NOTE(review): 'punkt' and 'wordnet' are not referenced by the code visible in
# this notebook section -- confirm they are still needed before keeping them.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Step 3: Load dataset

In [4]:
# Location of the labelled tweets CSV. The default is the original local path;
# set the HATE_SPEECH_DATA_PATH environment variable to run the notebook on a
# machine with a different directory layout without editing this cell.
DATA_PATH = os.environ.get(
    "HATE_SPEECH_DATA_PATH",
    "E:/STUDY/Projects/hatespeechclassification/labeled_data.csv",
)
df = pd.read_csv(DATA_PATH)

### Inspect columns

In [5]:
# Sanity check after loading: the column index first, then the first five
# rows, each printed on its own line(s) exactly as two separate prints would.
print(df.columns, df.head(), sep='\n')

Index(['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither',
       'class', 'tweet'],
      dtype='object')
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  


### Drop unnecessary columns and preprocess the data

In [6]:
# Keep only the target and the tweet text. errors='ignore' skips any listed
# column that is already absent, so re-running the cell is harmless.
columns_to_drop = ['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither']
df = df.drop(columns=columns_to_drop, errors='ignore')

if 'class' in df.columns:
    # Collapse the three-way 'class' code into a binary label: 0 -> 1 and
    # 2 -> 0, while 1 is left unchanged. In the preview rows class 2 coincides
    # with 'neither' votes and class 1 with 'offensive_language' votes;
    # presumably 0 is hate speech -- confirm against the dataset description.
    df['class'] = df['class'].replace({0: 1, 2: 0})
    df = df.rename(columns={'class': 'label'})
elif 'label' not in df.columns:
    # Neither the raw target nor the already-renamed one exists: real failure.
    print("Error: 'class' column not found in the DataFrame.")
    raise KeyError("'class' column not found in the DataFrame")
# Otherwise 'label' already exists because this cell ran before; remapping
# again would corrupt the labels, so nothing is done.

# Shared by data_cleaning(): English Snowball stemmer and a set of English
# stopwords (a set gives constant-time membership tests per token).
stemmer = nltk.SnowballStemmer("english")
stopword = set(stopwords.words('english'))

def data_cleaning(words):
    """Normalize raw text into a space-joined string of stemmed tokens.

    Steps, in order: coerce to ``str`` and lowercase; remove square-bracketed
    spans, URLs, angle-bracket tags, ASCII punctuation, newline characters and
    every token that contains a digit; drop tokens found in the module-level
    ``stopword`` set; stem what is left with the module-level ``stemmer``.

    Args:
        words: Raw text. Any value is accepted because it is passed through
            ``str()`` first.

    Returns:
        str: The cleaned text. Tokens are split and joined on single spaces,
        so runs of spaces in the intermediate text survive as empty tokens.
    """
    words = str(words).lower()
    # Raw strings keep the regex escapes out of Python's own string-escape
    # handling, which otherwise reports them as invalid escape sequences.
    words = re.sub(r'\[.*?\]', '', words)
    words = re.sub(r'https?://\S+|www\.\S+', '', words)
    words = re.sub(r'<.*?>+', '', words)
    words = re.sub('[%s]' % re.escape(string.punctuation), '', words)
    # Newlines are deleted outright (not replaced by a space), so text on
    # adjacent lines is concatenated.
    words = re.sub('\n', '', words)
    words = re.sub(r'\w*\d\w*', '', words)
    # Filter and stem in one pass; re-joining and re-splitting the filtered
    # tokens on the same separator would yield the same token list.
    kept = [word for word in words.split(' ') if word not in stopword]
    return " ".join(stemmer.stem(word) for word in kept)

# Clean every tweet element-wise, overwriting the raw text in place.
df['tweet'] = df['tweet'].map(data_cleaning)

  words = re.sub('\[.*?\]', '', words)
  words = re.sub('https?://\S+|www\.\S+', '', words)
  words = re.sub('\w*\d\w*', '', words)


### Split data into training and testing sets

In [7]:
# Features are the cleaned tweets; the target is the binary label column.
x, y = df['tweet'], df['label']

# Fixed seed so the same rows end up in each partition on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
)

### Tokenize the data

In [8]:
# Vocabulary cap for the tokenizer and the fixed length of every padded sequence.
max_words, max_len = 50000, 300

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)

# Encode the training tweets as integer sequences, then bring every sequence
# to exactly max_len entries.
sequences = tokenizer.texts_to_sequences(x_train)
sequences_matrix = pad_sequences(sequences, maxlen=max_len)

### Create the model

In [9]:
# Binary text classifier: embedding -> spatial dropout -> LSTM -> sigmoid unit.
model = Sequential()
# Vocabulary of max_words ids mapped to 100-dimensional vectors. The former
# `input_length` argument is omitted: Keras 3 deprecates and ignores it, and
# the sequence length is inferred from the data on the first fit/predict call.
model.add(Embedding(max_words, 100))
# Drops whole embedding channels rather than individual elements.
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid output paired with binary cross-entropy for the 0/1 label.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])



### Train the model

In [10]:
# Train on the padded training matrix; 20% of it is held out for validation.
# The returned History object keeps the per-epoch metrics.
history = model.fit(
    x=sequences_matrix,
    y=y_train,
    batch_size=128,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 615ms/step - accuracy: 0.8192 - loss: 0.4353 - val_accuracy: 0.9333 - val_loss: 0.1616
Epoch 2/5
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 625ms/step - accuracy: 0.9406 - loss: 0.1562 - val_accuracy: 0.9489 - val_loss: 0.1244
Epoch 3/5
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 663ms/step - accuracy: 0.9604 - loss: 0.1080 - val_accuracy: 0.9521 - val_loss: 0.1196
Epoch 4/5
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 798ms/step - accuracy: 0.9720 - loss: 0.0825 - val_accuracy: 0.9516 - val_loss: 0.1227
Epoch 5/5
[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 721ms/step - accuracy: 0.9778 - loss: 0.0704 - val_accuracy: 0.9524 - val_loss: 0.1255


### Evaluate the model

In [11]:
# Encode and pad the held-out tweets with the same tokenizer and max_len used
# for the training matrix.
test_sequences = tokenizer.texts_to_sequences(x_test)
test_sequences_matrix = pad_sequences(test_sequences, maxlen=max_len)

# With metrics=['accuracy'] at compile time, the result is [loss, accuracy].
accr = model.evaluate(x=test_sequences_matrix, y=y_test)

[1m194/194[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 56ms/step - accuracy: 0.9436 - loss: 0.1438


### Save the model and tokenizer

In [12]:
# Persist both artifacts needed for inference: the trained network and the
# fitted tokenizer that maps text to the integer ids the network was trained on.
# NOTE(review): the ".h5" suffix selects Keras' HDF5 format, which recent Keras
# versions treat as legacy -- the name is kept because loaders elsewhere
# presumably expect "model.h5"; confirm before switching formats.
model.save("model.h5")
# Pickle files can execute arbitrary code when loaded; only unpickle this
# tokenizer from a trusted location.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

