In [3]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import re
import string

# Function to clean the text data
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Load the dataset
file_path = './archive/train.csv'  # Update with the correct file path
data = pd.read_csv(file_path)

# Clean the tweets
data['tweet'] = data['tweet'].apply(lambda x: clean_text(x))

# Preparing the Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['tweet'])

# Convert the text to sequences
X = tokenizer.texts_to_sequences(data['tweet'])

# Pad the sequences
X = pad_sequences(X, maxlen=50)

data['offensive_language_count_normalized'] = data['offensive_language_count'] / data['count']
data['offensive_language_count_normalized'] = data['offensive_language_count_normalized'].apply(lambda x: 1 if x >= 0.9 else 0)

# Prepare the target variable
y = data['offensive_language_count_normalized'].values

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

# Splitting the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model parameters
vocab_size = len(tokenizer.word_index) + 1  # Plus 1 for padding token
embed_size = 16

# Building the model
model = Sequential()
model.add(Embedding(vocab_size, embed_size, input_length=50))
model.add(LSTM(60, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(60))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # 'sigmoid' for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(class_weights))

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
checkpoint = ModelCheckpoint('./models/model-1.h5', monitor='val_accuracy', save_best_only=True)

# Train the model with class weights
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=64, class_weight=class_weights, callbacks=[early_stopping, checkpoint])


Epoch 1/5
Epoch 2/5
  2/310 [..............................] - ETA: 18s - loss: 0.3987 - accuracy: 0.8438

  saving_api.save_model(


Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f427ad6fb50>

In [5]:
def predict_hateness(message):
    # Clean and preprocess the message
    cleaned_message = clean_text(message)
    sequence = tokenizer.texts_to_sequences([cleaned_message])
    padded_sequence = pad_sequences(sequence, maxlen=50)

    # Predict
    prediction = model.predict(padded_sequence, verbose=0)
    return prediction[0][0]*100

In [16]:
test = [
    "This restaurant is not worth your time or money.",
    "Your work has been consistently outstanding.",
    "He has a very controversial opinion on that.",
    "I'm offended by your generalizations about people.",
    "The team's performance was embarrassingly poor this season.",
    "Your statement is ignorant and deeply offensive.",
    "It's irresponsible to spread such misinformation.",
    "The book was a groundbreaking piece of literature.",
    "Their approach to problem-solving is quite innovative.",
    "That comment was uncalled for and hurtful.",
    "Your dedication to this project is truly inspiring.",
    "I disagree with your methods and results.",
    "The movie's plot was predictable and unoriginal.",
    "Your assistance has been invaluable to our success.",
    "Their political stance is quite extreme and divisive.",
    "I've never experienced such terrible customer service.",
    "Your creativity and vision are truly exceptional.",
    "That joke was in poor taste and offensive.",
    "The way they handle issues is highly professional.",
    "I find your lack of empathy disturbing.",
    "This new policy could be detrimental to our progress.",
    "Your leadership has brought us to new heights.",
    "Such behavior is unacceptable and disappointing.",
    "Their perspective on this matter is refreshingly honest.",
    "I can't believe how narrow-minded that statement was.",
    "The quality of this product exceeded my expectations.",
    "That speech was inflammatory and reckless.",
    "Your contribution to the field is groundbreaking.",
    "They have a tendency to make baseless accusations.",
    "Her insight into the issue was incredibly enlightening.",
    "That remark was sexist and inappropriate.",
    "Your performance has consistently improved.",
    "Their lack of responsibility in this matter is alarming.",
    "I've never seen such a well-executed plan.",
    "That's an outdated and prejudiced viewpoint.",
    "Your innovative approach is exactly what we needed.",
    "His comments were racially insensitive.",
    "The service at this hotel is top-notch.",
    "Such ignorance about the topic is astounding.",
    "You have a talent for making complex topics accessible.",
    "Their accusations were baseless and harmful.",
    "The artwork you created was deeply moving.",
    "That kind of language is unacceptable and harmful.",
    "Your understanding of the subject is impressive.",
    "Their stance on the issue is misguided and naive.",
    "The attention to detail in this project is remarkable.",
    "Your statement was biased and unfounded.",
    "The innovation in this design is revolutionary.",
    "That kind of behavior should not be tolerated.",
    "You have an exceptional ability to motivate people.",
    "Their argument is flawed and poorly constructed.",
    "The craftsmanship of this product is exceptional.",
    "Such a disrespectful comment is unacceptable.",
    "Your ability to adapt to changes is commendable.",
    "They have a habit of spreading rumors.",
    "This painting is a masterpiece of modern art.",
    "That approach is overly simplistic and ineffective.",
    "Your problem-solving skills are extraordinary.",
    "Their remarks were homophobic and unacceptable.",
    "The level of skill in your work is astounding.",
    "That sort of attitude is counterproductive and harmful.",
    "Your vision for the project is innovative and exciting.",
    "They often make discriminatory remarks.",
    "This novel is a brilliant exploration of complex themes.",
    "That methodology is outdated and inefficient.",
    "Your insights have been crucial to our understanding.",
    "Their commentary is often sexist and degrading.",
    "The elegance of this design is unparalleled.",
    "That kind of talk is divisive and unhelpful.",
    "You have an impressive knack for leadership.",
    "They display a concerning lack of judgment.",
    "This sculpture is a stunning representation of abstract art.",
    "That opinion is short-sighted and uneducated.",
    "Your strategic planning has been a game-changer.",
    "Their views are often extremist and polarizing.",
    "The complexity of this issue has been well-handled.",
    "That statement is a gross oversimplification.",
    "You have a profound understanding of the topic.",
    "They frequently engage in unethical practices.",
    "This film is a remarkable achievement in cinema.",
    "That assumption is both arrogant and incorrect.",
    "Your ability to overcome obstacles is inspiring.",
    "They tend to be dismissive of important issues.",
    "The texture and color in this artwork are extraordinary.",
    "That kind of statement is fear-mongering.",
    "You've shown remarkable growth and development.",
    "Their handling of the situation was inept and careless.",
    "This song is an anthem for a generation.",
    "That view is narrow-minded and regressive.",
    "Your resilience in the face of adversity is admirable.",
    "They often resort to personal attacks.",
    "The innovation in this technology is groundbreaking.",
    "That kind of rhetoric is dangerous and misleading.",
    "You have a unique ability to connect with people.",
    "Their interpretation of the data is misleading.",
    "This artwork is a vivid portrayal of contemporary issues.",
    "That strategy is unethical and irresponsible.",
    "Your work ethic is unmatched and commendable.",
    "They frequently display a lack of empathy.",
    "This piece is a thought-provoking exploration of identity."
    ]

In [17]:
for t in test:
    print(f"\"{t}\":\t{predict_hateness(t):2.2f} ")

"This restaurant is not worth your time or money.":	87.71 
"Your work has been consistently outstanding.":	6.14 
"He has a very controversial opinion on that.":	8.94 
"I'm offended by your generalizations about people.":	97.97 
"The team's performance was embarrassingly poor this season.":	25.23 
"Your statement is ignorant and deeply offensive.":	0.24 
"It's irresponsible to spread such misinformation.":	97.30 
"The book was a groundbreaking piece of literature.":	89.35 
"Their approach to problem-solving is quite innovative.":	6.13 
"That comment was uncalled for and hurtful.":	66.97 
"Your dedication to this project is truly inspiring.":	96.03 
"I disagree with your methods and results.":	13.59 
"The movie's plot was predictable and unoriginal.":	95.26 
"Your assistance has been invaluable to our success.":	78.13 
"Their political stance is quite extreme and divisive.":	0.13 
"I've never experienced such terrible customer service.":	96.31 
"Your creativity and vision are truly excep