In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Conv2D, MaxPooling2D, concatenate, Dense, Dropout, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sn

# LOAD DATASET

In [3]:
# The training annotations are stored as JSON Lines (one record per line).
train_json_path = '/Users/comfortuji/Desktop/Cyberbullying/data/train.json'
df = pd.read_json(train_json_path, lines=True)

In [4]:
# Display the loaded frame as the cell output (columns: id, img, label, text).
df

Unnamed: 0,id,img,label,text
0,42953,img/42953.png,0,its their character not their color that matters
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...
2,13894,img/13894.png,0,putting bows on your pet
3,37408,img/37408.png,0,i love everything and everybody! except for sq...
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h..."
...,...,...,...,...
8495,10423,img/10423.png,1,nobody wants to hang auschwitz me
8496,98203,img/98203.png,1,when god grants you a child after 20 years of ...
8497,36947,img/36947.png,1,gays on social media: equality! body positivit...
8498,16492,img/16492.png,1,having a bad day? you could be a siamese twin ...


# TEXT DATA

In [10]:
# Text-modality view of the dataset: everything except the image path column.
text_data = df.drop(columns=['img'])

In [11]:
# Display the text-only frame (id, label, text).
text_data

Unnamed: 0,id,label,text
0,42953,0,its their character not their color that matters
1,23058,0,don't be afraid to love again everyone is not ...
2,13894,0,putting bows on your pet
3,37408,0,i love everything and everybody! except for sq...
4,82403,0,"everybody loves chocolate chip cookies, even h..."
...,...,...,...
8495,10423,1,nobody wants to hang auschwitz me
8496,98203,1,when god grants you a child after 20 years of ...
8497,36947,1,gays on social media: equality! body positivit...
8498,16492,1,having a bad day? you could be a siamese twin ...


In [12]:
# Caption strings as a NumPy array; this is what the tokenizer is later fitted on.
texts = text_data['text'].values

In [13]:
# Display the caption array.
texts

array(['its their character not their color that matters',
       "don't be afraid to love again everyone is not like your ex",
       'putting bows on your pet', ...,
       'gays on social media: equality! body positivity! love love love! gays in real life: there is no place for her in our social circle',
       "having a bad day? you could be a siamese twin attached to a gay brother who has a date and you're the only one with an ass",
       'i hate muslims too they take their religion too seriously pathetic weaklings'],
      dtype=object)

In [16]:
# Integer class labels (0 / 1) as a NumPy array, in the same row order as `texts`.
labels = text_data['label'].values

In [17]:
# Display the label array.
labels

array([0, 0, 0, ..., 1, 1, 1])

# IMAGE DATA

In [14]:
# Image-modality view of the dataset: everything except the caption column.
image_data = df.drop(columns=['text'])

In [15]:
# Display the image-only frame (id, img, label).
image_data

Unnamed: 0,id,img,label
0,42953,img/42953.png,0
1,23058,img/23058.png,0
2,13894,img/13894.png,0
3,37408,img/37408.png,0
4,82403,img/82403.png,0
...,...,...,...
8495,10423,img/10423.png,1
8496,98203,img/98203.png,1
8497,36947,img/36947.png,1
8498,16492,img/16492.png,1


In [76]:
# Keep only the rows whose label is 1 and print them as plain text.
cyberbullying_image = df.loc[df['label'].eq(1)]
print(cyberbullying_image)

         id            img  label  \
10    79351  img/79351.png      1   
12    25489  img/25489.png      1   
27    72640  img/72640.png      1   
30    93547  img/93547.png      1   
48    74386  img/74386.png      1   
...     ...            ...    ...   
8495  10423  img/10423.png      1   
8496  98203  img/98203.png      1   
8497  36947  img/36947.png      1   
8498  16492  img/16492.png      1   
8499  15937  img/15937.png      1   

                                                   text  
10                             jew mad? get fuhrerious!  
12     brother... a day without a blast is a day wasted  
27    is bribing muslims for liberal votes justin tr...  
30    d.j. osama spin laden droppin' beats like the ...  
48    we said we would never forget why are you voti...  
...                                                 ...  
8495                  nobody wants to hang auschwitz me  
8496  when god grants you a child after 20 years of ...  
8497  gays on social media: equal

In [77]:
# Keep only the rows whose label is 0 and print them as plain text.
notcyberbullying_image = df.loc[df['label'].eq(0)]
print(notcyberbullying_image)

         id            img  label  \
0     42953  img/42953.png      0   
1     23058  img/23058.png      0   
2     13894  img/13894.png      0   
3     37408  img/37408.png      0   
4     82403  img/82403.png      0   
...     ...            ...    ...   
8340   5743  img/05743.png      0   
8341  74183  img/74183.png      0   
8343  10834  img/10834.png      0   
8345  56480  img/56480.png      0   
8349  89540  img/89540.png      0   

                                                   text  
0      its their character not their color that matters  
1     don't be afraid to love again everyone is not ...  
2                              putting bows on your pet  
3     i love everything and everybody! except for sq...  
4     everybody loves chocolate chip cookies, even h...  
...                                                 ...  
8340     how your booty feels after the taco bell drops  
8341  when you hear monsanto lost its 1st round up c...  
8343  when my autistic son finds 

In [26]:
    def load_images(image_paths, target_size=(100, 100)):
        """Load image files into one stacked NumPy array.

        Each path in ``image_paths`` is opened with ``load_img`` (resized to
        ``target_size``) and converted with ``img_to_array``; the per-image
        arrays are then combined with ``np.array``, one entry per path, in
        input order. Pixel values are returned as loaded (no scaling).
        """
        return np.array([
            img_to_array(load_img(image_file, target_size=target_size))
            for image_file in image_paths
        ])

# Replace with your image directory
image_dir = "/Users/comfortuji/Desktop/Cyberbullying/data/"
# The 'img' column holds paths relative to the data directory (e.g. img/42953.png).
image_paths = [image_dir + relative_path for relative_path in image_data['img'].values]
images = load_images(image_paths)

In [27]:
# Display the loaded pixel array (raw values as loaded; scaling to [0, 1] happens in the preprocessing cell).
images

array([[[[  0.,   0.,   0.],
         [  0.,   0.,   0.],
         [  0.,   0.,   0.],
         ...,
         [  0.,   0.,   0.],
         [  0.,   0.,   0.],
         [  0.,   0.,   0.]],

        [[255., 255., 255.],
         [255., 255., 255.],
         [ 44.,  44.,  35.],
         ...,
         [ 42.,  43.,  34.],
         [255., 255., 254.],
         [255., 255., 255.]],

        [[255., 255., 255.],
         [255., 255., 255.],
         [ 44.,  45.,  36.],
         ...,
         [ 43.,  43.,  35.],
         [255., 255., 254.],
         [255., 255., 255.]],

        ...,

        [[  3.,   2.,   3.],
         [  3.,   2.,   3.],
         [  3.,   2.,   3.],
         ...,
         [121.,  69.,  48.],
         [125.,  74.,  56.],
         [131.,  80.,  61.]],

        [[  2.,   2.,   2.],
         [  2.,   2.,   2.],
         [  2.,   2.,   2.],
         ...,
         [124.,  71.,  54.],
         [125.,  72.,  50.],
         [126.,  73.,  51.]],

        [[  0.,   0.,   0.],
       

# PREPROCESS THE DATASET

# TEXT

In [28]:
# Preprocess text data
# max_words is passed to the Tokenizer as num_words and is reused later as the
# Embedding input dimension; max_sequence_length is the maxlen given to
# pad_sequences and the length of the model's text input.
max_words = 1000
max_sequence_length = 20

# NOTE(review): the tokenizer is fitted on all `texts`, before the train/test
# split, so the vocabulary also sees captions that end up in the test set.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# Word -> integer index mapping learned by the tokenizer (not referenced again in the visible cells).
word_index = tokenizer.word_index

# Fixed-length integer matrix fed to the text branch of the model.
x_text = pad_sequences(sequences, maxlen=max_sequence_length)


# IMAGE

In [29]:
# Preprocess image data
# `images` holds raw pixel values as returned by load_images; the same /255.0
# scaling is applied again to every image passed to model.predict below.
x_image = images / 255.0  # Normalize pixel values to be between 0 and 1

In [30]:
# Preprocess labels: integer-encode the classes, then one-hot encode them to
# match the 2-unit softmax output of the model.
label_encoder = LabelEncoder()
y = to_categorical(label_encoder.fit_transform(labels))

# SPLIT THE DATASET 

In [31]:
# Split data into training and testing sets (80/20, fixed seed). The text
# matrix, image tensor and labels are passed together so they share one
# row partition.
(
    x_text_train, x_text_test,
    x_image_train, x_image_test,
    y_train, y_test,
) = train_test_split(x_text, x_image, y, random_state=42, test_size=0.2)


# MODEL BUILDING

In [32]:
# Define text model (LSTM): token ids -> 50-d embeddings -> 50-unit LSTM summary vector
text_input = Input(shape=(max_sequence_length,))
embedding_layer = Embedding(
    input_dim=max_words,
    output_dim=50,
    input_length=max_sequence_length,
)(text_input)
lstm_layer = LSTM(units=50)(embedding_layer)

In [33]:
# Define image model (CNN): two conv + max-pool stages, then flatten to a feature vector
image_input = Input(shape=(100, 100, 3))
conv1 = Conv2D(filters=32, kernel_size=(3, 3), activation='relu')(image_input)
pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
conv2 = Conv2D(filters=64, kernel_size=(3, 3), activation='relu')(pool1)
pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)
flatten = Flatten()(pool2)

# MERGE THE MODELS

In [34]:
# Concatenate text and image features along the feature (last) axis
merged = concatenate([lstm_layer, flatten], axis=-1)

In [35]:
# Common dense layers for combined features, ending in a 2-way softmax
dense1 = Dense(units=64, activation='relu')(merged)
dropout = Dropout(rate=0.5)(dense1)
output = Dense(units=2, activation='softmax')(dropout)

In [36]:
# Build and compile the model. Input order is [text, image]; every later
# fit / evaluate / predict call must pass its inputs in that order.
model = Model(inputs=[text_input, image_input], outputs=output)
model.compile(
    optimizer=Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [37]:
# Print the layer-by-layer architecture of the two-branch model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 100, 100, 3)]        0         []                            
                                                                                                  
 conv2d (Conv2D)             (None, 98, 98, 32)           896       ['input_2[0][0]']             
                                                                                                  
 max_pooling2d (MaxPooling2  (None, 49, 49, 32)           0         ['conv2d[0][0]']              
 D)                                                                                               
                                                                                                  
 input_1 (InputLayer)        [(None, 20)]                 0         []                        

In [38]:
# Train the model with early stopping: stop after 3 epochs without a
# validation-loss improvement and restore the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
model.fit(
    x=[x_text_train, x_image_train],
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.2,
    callbacks=[early_stopping],
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


<keras.src.callbacks.History at 0x109d8ea90>

In [39]:
# Score the held-out split; evaluate returns the loss followed by the compiled metrics.
loss, accuracy = model.evaluate(x=[x_text_test, x_image_test], y=y_test)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

Test Loss: 0.6038, Test Accuracy: 0.6847


In [92]:
# Classification Report
# Both label vectors are derived here so the cell runs on a fresh kernel:
# y_test is one-hot and the model outputs softmax probabilities, so argmax
# over the class axis recovers integer class ids for each test row.
y_true_labels = np.argmax(y_test, axis=1)
y_pred_labels = np.argmax(
    model.predict([x_text_test, x_image_test], verbose=0),  # verbose=0 keeps the output to the report only
    axis=1,
)
print("Classification Report:\n", classification_report(y_true_labels, y_pred_labels))

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.93      0.79      1075
           1       0.69      0.26      0.38       625

    accuracy                           0.68      1700
   macro avg       0.69      0.60      0.58      1700
weighted avg       0.69      0.68      0.64      1700



# PREDICTION

In [48]:
# Trying out the model for prediction: a hand-written caption paired with one
# image file from the dataset's img/ folder.
sample_text = "Spread Love"
sample_image_path = "/Users/comfortuji/Desktop/Cyberbullying/data/img/93875.png"  

In [49]:
# Preprocess the input
# Mirrors the training pipeline: fitted tokenizer -> pad_sequences with the
# same maxlen for the text, load_images -> /255.0 scaling for the image.
# Each input is wrapped in a one-element list, so both form a batch of one.
sample_sequence = tokenizer.texts_to_sequences([sample_text])
sample_text_input = pad_sequences(sample_sequence, maxlen=max_sequence_length)
sample_image = load_images([sample_image_path]) / 255.0

In [50]:
# Make the prediction
# Inputs are passed in the same [text, image] order used when the Model was built.
prediction = model.predict([sample_text_input, sample_image])



In [51]:
# Decode the prediction: take the index of the highest softmax score and map
# it back to the original label value through the fitted encoder.
predicted_class = label_encoder.inverse_transform([prediction.argmax()])
print(f"Predicted class: {predicted_class}")

Predicted class: [0]


In [52]:
# Trying out the model for prediction: second hand-written caption.
# NOTE(review): img/15937.png appears in the loaded dataset with label 1, so
# this image is not an unseen example for the model.
sample_text = "I hate Muslims"
sample_image_path = "/Users/comfortuji/Desktop/Cyberbullying/data/img/15937.png"  

In [53]:
# Preprocess the input
# Mirrors the training pipeline: fitted tokenizer -> pad_sequences with the
# same maxlen for the text, load_images -> /255.0 scaling for the image.
# Each input is wrapped in a one-element list, so both form a batch of one.
sample_sequence = tokenizer.texts_to_sequences([sample_text])
sample_text_input = pad_sequences(sample_sequence, maxlen=max_sequence_length)
sample_image = load_images([sample_image_path]) / 255.0

In [54]:
# Make the prediction
# Inputs are passed in the same [text, image] order used when the Model was built.
prediction = model.predict([sample_text_input, sample_image])



In [55]:
# Decode the prediction: take the index of the highest softmax score and map
# it back to the original label value through the fitted encoder.
predicted_class = label_encoder.inverse_transform([prediction.argmax()])
print(f"Predicted class: {predicted_class}")

Predicted class: [1]


# DEPLOY MODEL TO GRADIO FOR INTERACTIVE INTERFACE

In [80]:
# Deploy the model using Gradio
def predict_hateful_content(text, image):
    """Classify one caption + image pair with the trained multimodal model.

    Parameters
    ----------
    text : str
        Caption entered in the Gradio text box. ``None`` or an empty value is
        treated as an empty caption.
    image : numpy.ndarray, PIL image, or str
        Image supplied by the Gradio image input. A string is treated as a
        file path and read with ``load_images``; anything else is converted
        with ``np.asarray`` and resized here.
        Assumes an RGB (height, width, 3) array, which is what Gradio's
        default "image" input is documented to provide -- TODO confirm if the
        component is configured differently.

    Returns
    -------
    str
        A message containing the predicted label (0 or 1), or a prompt to
        upload an image when none was provided.
    """
    # Previously the `image` argument was ignored and a fixed dataset file was
    # always classified; refuse to guess when no image was supplied.
    if image is None:
        return "Please provide an image to classify."

    # Must match the size the model was built for (Input(shape=(100, 100, 3))
    # and the load_images default).
    target_size = (100, 100)

    # Preprocess the text with the same tokenizer / padding used for training.
    sample_sequence = tokenizer.texts_to_sequences([text or ""])
    sample_text_input = pad_sequences(sample_sequence, maxlen=max_sequence_length)

    # Preprocess the image: resize to the training resolution, add a batch
    # axis, and scale pixel values by 1/255 as done for the training images.
    if isinstance(image, str):
        sample_image = load_images([image], target_size=target_size) / 255.0
    else:
        pixels = np.asarray(image)
        resized = tf.image.resize(pixels, target_size, method="nearest")
        sample_image = np.expand_dims(np.asarray(resized, dtype="float32"), axis=0) / 255.0

    # Make the prediction (inputs in the model's [text, image] order).
    prediction = model.predict([sample_text_input, sample_image])

    # Decode the prediction
    predicted_class = label_encoder.inverse_transform([np.argmax(prediction)])
    return f" 0 for Not cyberbullying and 1 for Cyberbullying. Detected content is labelled: {predicted_class}"


In [57]:
import gradio as gr

In [81]:
# Wire the prediction function to a text box + image upload, with a plain-text
# result, and start the local Gradio server.
iface = gr.Interface(predict_hateful_content, ["text", "image"], "text")
iface.launch()

Running on local URL:  http://127.0.0.1:7873

To create a public link, set `share=True` in `launch()`.




