In [78]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import string

# Function to clean the text data
def clean_text(text):
    """Normalize a raw tweet into lowercase text without markup or punctuation.

    Steps, in order: lowercase; drop square-bracketed spans; drop URLs
    (``http(s)://...`` and ``www....``); drop ``<...>`` tags; strip ASCII
    punctuation; turn newlines into spaces; drop every word that contains
    a digit.

    Args:
        text: The raw message. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for missing cells) is
            treated as an empty message.

    Returns:
        The cleaned string, or ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\[.*?\]', '', text)  # [bracketed] spans
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # URLs
    text = re.sub(r'<.*?>', '', text)  # <tags>
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    # Newlines become a space, not an empty string: otherwise the words on
    # either side of a line break are glued into one token (and a digit on
    # one side would then get the whole merged token deleted below).
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)  # words containing a digit
    return text

# Load the dataset
file_path = './archive/train.csv'  # Update with the correct file path
data = pd.read_csv(file_path)

# Clean the tweets. pandas reads an empty cell as NaN (a float), which would
# crash a string-only cleaner, so missing tweets are replaced by '' first.
data['tweet'] = data['tweet'].fillna('').astype(str).apply(clean_text)

# Preparing the Tokenizer (vocabulary is learned from every cleaned tweet)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['tweet'])

# Convert the text to sequences of word indices
X = tokenizer.texts_to_sequences(data['tweet'])

# Pad the sequences to a fixed length of 50.
# NOTE: the same 50 is hard-coded as the Embedding input_length and in
# predict_hateness; keep the three values in sync.
X = pad_sequences(X, maxlen=50)

# Binary label: 1 when at least 90% of `count` is `offensive_language_count`.
# (presumably these columns are annotator vote counts -- verify against the
# dataset description). The comparison is vectorized; a NaN ratio compares
# False and therefore maps to 0, exactly as a per-row `x >= 0.9` check would.
offensive_ratio = data['offensive_language_count'] / data['count']
data['offensive_language_count_normalized'] = (offensive_ratio >= 0.9).astype(int)

# Prepare the target variable
y = data['offensive_language_count_normalized'].values

In [79]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling repeat from run to run (some GPU kernels may still vary).
tf.keras.utils.set_random_seed(42)

# Splitting the dataset into training and validation sets (80% / 20%)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model parameters
vocab_size = len(tokenizer.word_index) + 1  # Plus 1 for padding token
embed_size = 128

# Building the model: embedding -> two stacked LSTMs with dropout -> sigmoid
model = Sequential()
model.add(Embedding(vocab_size, embed_size, input_length=50))
model.add(LSTM(60, return_sequences=True))  # full sequence feeds the next LSTM
model.add(Dropout(0.2))
model.add(LSTM(60))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # 'sigmoid' for binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Compute class weights. compute_class_weight returns one weight per entry of
# `classes`, in that order, so each weight is keyed by its actual label rather
# than by its position (the two only coincide when the labels present in
# y_train are exactly 0, 1, ..., k-1).
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes.tolist(), weights.tolist()))

# Train the model with class weights; keep the History so the per-epoch
# metrics stay available instead of being shown as a bare object repr.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=64,
    class_weight=class_weights,
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7f136ff06470>

In [80]:
def predict_hateness(message):
    """Score one message with the trained model.

    The message is run through the same steps as the training tweets:
    ``clean_text``, the fitted ``tokenizer``, then padding/truncation to
    50 token ids.

    Args:
        message: Raw text to score.

    Returns:
        The model's sigmoid output for the message multiplied by 100,
        i.e. a percentage-style score.
    """
    token_ids = tokenizer.texts_to_sequences([clean_text(message)])
    model_input = pad_sequences(token_ids, maxlen=50)

    # A single message was passed in, so the result holds one row with one score.
    scores = model.predict(model_input, verbose=0)
    return scores[0][0] * 100

Prediction: 32.776 %


In [89]:
# Ad-hoc sample phrases used to spot-check predict_hateness.
test = [
    "", "miaou", "hello",
    "I'm not your friend", "I'm your friend",
    "Don't stop",
    "You're a poop", "You're a big big poop",
    "just go", "lmao", "ig", "tv",
    "fuck you", "love you", "thank you",
    "snorkel", "phillipins",
    "dog", "cat",
    "just finish it",
]

In [88]:
# Print one line per sample phrase: the quoted phrase, a tab, then its score
# with two decimals followed by " %".
for phrase in test:
    score = predict_hateness(phrase)
    print(f'"{phrase}":\t{score:2.2f} %')

"":	32.78 %
"miaou":	32.78 %
"hello":	11.01 %
"I'm not your friend":	23.46 %
"I'm your friend":	10.13 %
"Don't stop":	81.62 %
"You're a poop":	1.39 %
"You're a big big poop":	8.40 %
"just go":	64.71 %
"lmao":	33.10 %
"ig":	12.20 %
"tv":	40.81 %
"fuck you":	45.10 %
"love you":	9.68 %
"snorkel":	32.78 %
"phillipins":	32.78 %
"dog":	23.77 %
"cat":	77.14 %
"just finish it":	33.13 %
