In [1]:
import tensorflow as tf
from tensorflow import keras
from keras.datasets import imdb
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.python.keras.layers import Embedding
from sklearn.model_selection import train_test_split
from keras_preprocessing import sequence
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from collections import Counter
import numpy as np
import pandas as pd
# fix random seed for reproducibility
tf.random.set_seed(7)

In [7]:
df = pd.read_csv('../../data/normalized_tweets.csv')

df = df[df['cyberbullying_type'] != 'other_cyberbullying']
# Reset index after filtering out the class
df.reset_index(drop=True, inplace=True)

In [8]:
df["cyberbullying_type"].value_counts()

religion             7979
age                  7974
ethnicity            7931
gender               7639
not_cyberbullying    6868
Name: cyberbullying_type, dtype: int64

In [9]:
df['text_len'] = [len(text.split()) for text in df.tweet_text]

max_len = np.max(df['text_len'])
max_len 

181

In [10]:
def Tokenize(column, seq_len):
    ##Create vocabulary of words from column
    corpus = [word for text in column for word in text.split()]
    count_words = Counter(corpus)
    sorted_words = count_words.most_common()
    vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}

    ##Tokenize the columns text using the vocabulary
    text_int = []
    for text in column:
        r = [vocab_to_int[word] for word in text.split()]
        text_int.append(r)
    ##Add padding to tokens
    features = np.zeros((len(text_int), seq_len), dtype = int)
    for i, review in enumerate(text_int):
        if len(review) <= seq_len:
            zeros = list(np.zeros(seq_len - len(review)))
            new = zeros + review
        else:
            new = review[: seq_len]
        features[i, :] = np.array(new)

    return sorted_words, features

In [None]:
def Tokenize2(column, seq_len, vocab_size):
    ##Create vocabulary of words from column
    corpus = [word for text in column for word in text.split()]
    count_words = Counter(corpus)
    sorted_words = count_words.most_common(vocab_size)
    vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}

    ##Tokenize the columns text using the vocabulary
    text_int = []
    for text in column:
        r = [vocab_to_int.get(word, 0) for word in text.split()]  # Use 0 for out-of-vocabulary words
        text_int.append(r)
    ##Add padding to tokens
    features = np.zeros((len(text_int), seq_len), dtype = int)
    for i, review in enumerate(text_int):
        if len(review) <= seq_len:
            zeros = list(np.zeros(seq_len - len(review)))
            new = zeros + review
        else:
            new = review[: seq_len]
        features[i, :] = np.array(new)

    return sorted_words, features

# added vocab size parameter
vocabulary, tokenized_column = Tokenize2(df["tweet_text"], max_len, 1000)

In [11]:
vocabulary, tokenized_column = Tokenize(df["tweet_text"], max_len)

In [12]:
X,y = tokenized_column,df["cyberbullying_type"].values
x_train,x_test,y_train,y_test = train_test_split(X,y, test_size=0.2)

In [13]:
# truncate and pad input sequences
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)

In [14]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform label encoder on the target variable
y_train_encoded = label_encoder.fit_transform(y_train)

In [19]:
# create the model
embedding_vecor_length = 200
num_classes = len(np.unique(y_train))
model = Sequential()
model.add(Embedding(len(vocabulary), embedding_vecor_length, input_length=max_len))
model.add(Dropout(0.5))
model.add(LSTM(100))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(x_train, y_train_encoded, validation_split=0.2, epochs=10, batch_size=64)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 181, 200)          4684400   
_________________________________________________________________
module_wrapper_4 (ModuleWrap (None, 181, 200)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               120400    
_________________________________________________________________
module_wrapper_5 (ModuleWrap (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 202       
Total params: 4,805,002
Trainable params: 4,805,002
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10


2024-04-26 18:25:59.050802: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


## Binary classification

In [2]:
df = pd.read_csv('../../data/normalized_tweets.csv')

# if we drop 'other_cyberbullyng', n changes
df = df[df['cyberbullying_type'] != 'other_cyberbullying']
df.reset_index(drop=True, inplace=True)
n = 1953

#n = 1563

# Create an empty dataframe for the results
result_df = pd.DataFrame(columns=['tweet_text', 'label'])


for category in df['cyberbullying_type'].unique():
    if category != 'not_cyberbullying':
        sampled_df = df[df['cyberbullying_type'] == category].sample(n=n, replace=False)
        sampled_df.loc[:, 'label'] = 0  
        result_df = pd.concat([result_df, sampled_df[['tweet_text', 'label']]], ignore_index=True)
    else:
        other_cyberbullying_df = df[df['cyberbullying_type'] == category].copy() 
        other_cyberbullying_df.loc[:, 'label'] = 1 
        result_df = pd.concat([result_df, other_cyberbullying_df[['tweet_text', 'label']]], ignore_index=True)


In [3]:
df['text_len'] = [len(text.split()) for text in df.tweet_text]

max_len = np.max(df['text_len'])
max_len 

181

In [4]:
def Tokenize(column, seq_len):
    ##Create vocabulary of words from column
    corpus = [word for text in column for word in text.split()]
    count_words = Counter(corpus)
    sorted_words = count_words.most_common()
    vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}

    ##Tokenize the columns text using the vocabulary
    text_int = []
    for text in column:
        r = [vocab_to_int[word] for word in text.split()]
        text_int.append(r)
    ##Add padding to tokens
    features = np.zeros((len(text_int), seq_len), dtype = int)
    for i, review in enumerate(text_int):
        if len(review) <= seq_len:
            zeros = list(np.zeros(seq_len - len(review)))
            new = zeros + review
        else:
            new = review[: seq_len]
        features[i, :] = np.array(new)

    return sorted_words, features

In [5]:
vocabulary, tokenized_column = Tokenize(result_df["tweet_text"], max_len)

In [6]:
X,y = tokenized_column,result_df["label"].values
x_train,x_test,y_train,y_test = train_test_split(X,y, test_size=0.2)

In [7]:
# truncate and pad input sequences
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)

In [8]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform label encoder on the target variable
y_train_encoded = label_encoder.fit_transform(y_train)

In [9]:
# create the model
embedding_vecor_length = 100

model = Sequential()
model.add(Embedding(len(vocabulary), embedding_vecor_length, input_length=max_len))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(x_train, y_train_encoded, validation_split=0.2, epochs=10, batch_size=64)

Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 181, 100)          2336500   
_________________________________________________________________
module_wrapper (ModuleWrappe (None, 181, 100)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
module_wrapper_1 (ModuleWrap (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 2,417,001
Trainable params: 2,417,001
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10


2024-04-26 18:26:24.308483: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-04-26 18:26:24.308590: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2024-04-26 18:26:24.491852: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2024-04-26 18:26:25.220982: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2024-04-26 21:21:43.987604: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10

In [None]:
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))