In [27]:

#general packages for data manipulation
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer


In [28]:
# Load the dataset
df = pd.read_csv('TwitterHate.csv')
df.head(10)


Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
5,6,0,[2/2] huge fan fare and big talking before the...
6,7,0,@user camping tomorrow @user @user @user @use...
7,8,0,the next school year is the year for exams.ð...
8,9,0,we won!!! love the land!!! #allin #cavs #champ...
9,10,0,@user @user welcome here ! i'm it's so #gr...


In [29]:
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

def clean_text(text):
    # Remove mentions, links and hashtags
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # remove @mentions
    text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)  # remove URLs
    text = re.sub(r'#[A-Za-z0-9]+', '', text)  # remove hashtags
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # remove non-letters
    #text = text.lower()  # convert to lowercase
    return text

# Apply text cleaning function
df['tweet'] = df['tweet'].apply(clean_text)

# Tokenize text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['tweet'])
sequences = tokenizer.texts_to_sequences(df['tweet'])

# Pad sequences
max_seq_length = max(len(x) for x in sequences)
X = pad_sequences(sequences, maxlen=max_seq_length)
y = df['label'].values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [30]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_seq_length))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()



In [32]:
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)


Epoch 1/5
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 140ms/step - accuracy: 0.9213 - loss: 0.2692 - val_accuracy: 0.9443 - val_loss: 0.1770
Epoch 2/5
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 133ms/step - accuracy: 0.9502 - loss: 0.1421 - val_accuracy: 0.9458 - val_loss: 0.1648
Epoch 3/5
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 134ms/step - accuracy: 0.9623 - loss: 0.1067 - val_accuracy: 0.9406 - val_loss: 0.1791
Epoch 4/5
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 132ms/step - accuracy: 0.9734 - loss: 0.0769 - val_accuracy: 0.9456 - val_loss: 0.1901
Epoch 5/5
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 123ms/step - accuracy: 0.9789 - loss: 0.0634 - val_accuracy: 0.9417 - val_loss: 0.2068


In [33]:
# Predict classes
y_pred = (model.predict(X_test) > 0.5).astype("int32")

# Generate classification report
report = classification_report(y_test, y_pred, target_names=['Class 0', 'Class 1'])
print(report)

[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 36ms/step
              precision    recall  f1-score   support

     Class 0       0.96      0.98      0.97      5937
     Class 1       0.63      0.48      0.54       456

    accuracy                           0.94      6393
   macro avg       0.79      0.73      0.75      6393
weighted avg       0.94      0.94      0.94      6393



In [34]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.2f}')


[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 36ms/step - accuracy: 0.9457 - loss: 0.1745
Test Accuracy: 0.94


In [35]:
import os
import pandas as pd
import numpy as np
import pickle
import re
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from textblob import TextBlob

In [36]:
# Save the model
model.save('lstm_model.h5')  # Saves the model to disk in HDF5 format


import pickle
# Save tokenizer
with open('lstm_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)




In [37]:
# Load the model
model = load_model('lstm_model.h5')
# Assuming the tokenizer was saved using pickle
import pickle
with open('lstm_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)




In [38]:
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove @mentions
    text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)  # Remove URLs
    text = re.sub(r'#[A-Za-z0-9]+', '', text)  # Remove hashtags
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove non-letters
    text = text.lower()  # Convert to lowercase
    return text


In [39]:
def predict_hate_speech(text):
    # Clean and preprocess the text
    cleaned_text = clean_text(text)
    
    # Convert text to sequence of integers
    sequences = tokenizer.texts_to_sequences([cleaned_text])
    
    # Pad sequences to the same length as model's input
    max_seq_length = model.input_shape[1]  # assuming model's input shape is known
    padded_sequence = pad_sequences(sequences, maxlen=max_seq_length)
    
    # Predict using the LSTM model
    prediction = model.predict(padded_sequence)[0][0]
    
    # Return the prediction probability and class
    pred_class = 'Hate Speech' if prediction > 0.5 else 'Not Hate Speech'
    return prediction, pred_class


In [41]:
# Example text
test_text = "sweet girl"
prediction, pred_class = predict_hate_speech(test_text)

print(f'Prediction Probability: {prediction:.4f}')
print(f'Predicted Class: {pred_class}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
Prediction Probability: 0.0069
Predicted Class: Not Hate Speech
