In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import re
import tensorflow as tf
from keras.layers import LSTM , Bidirectional , Embedding, Dense , Input
from keras.preprocessing.text import Tokenizer
from keras.models import Model

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Awais\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Awais\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!





In [2]:
df=pd.read_csv('data\cyberbullying_tweets.csv')

In [3]:
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [4]:
df.columns= ['text','target']

In [5]:
df.head()

Unnamed: 0,text,target
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [6]:
df['target'].unique()

array(['not_cyberbullying', 'gender', 'religion', 'other_cyberbullying',
       'age', 'ethnicity'], dtype=object)

In [7]:
df.shape

(47692, 2)

In [8]:
df.isna().sum()

text      0
target    0
dtype: int64

In [9]:
df.duplicated().sum()

36

In [10]:
df.drop_duplicates(inplace=True)

# Text Preprocessing

In [11]:
stemmer=PorterStemmer()

In [12]:
from nltk.tokenize import word_tokenize , sent_tokenize

In [13]:
def preprocess(text):
    text = text.lower()
    text = word_tokenize(text)
    text = [word for word in text if not re.match(r'#\w+', word)]
    translator = str.maketrans('', '', string.punctuation)
    text = [word.translate(translator) for word in text]
    stop_words = set(stopwords.words('english'))
    text = [word for word in text if word not in stop_words]
    text=[stemmer.stem(word) for word in text]
    cleaned_text = " ".join(text)
    return cleaned_text


In [14]:
preprocess(df['text'][1])

' aussietv white   mkr  theblock  imacelebrityau  today  sunris  studio10  neighbour  wonderlandten  etc'

In [15]:
df['clean_text']=df['text'].apply(preprocess)

In [16]:
df.head()

Unnamed: 0,text,target,clean_text
0,"In other words #katandandre, your food was cra...",not_cyberbullying,word katandandr food crapilici mkr
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying,aussietv white mkr theblock imacelebritya...
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying,xochitlsuckkk classi whore red velvet cupcak
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying,jasongio meh p thank head concern anoth an...
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying,rudhoeenglish isi account pretend kurdish acc...


# Encoding the target categories

In [17]:
for i,value in enumerate(df['target'].unique()): 
    df['target'].replace({value:i},inplace=True)

In [18]:
df.head()

Unnamed: 0,text,target,clean_text
0,"In other words #katandandre, your food was cra...",0,word katandandr food crapilici mkr
1,Why is #aussietv so white? #MKR #theblock #ImA...,0,aussietv white mkr theblock imacelebritya...
2,@XochitlSuckkks a classy whore? Or more red ve...,0,xochitlsuckkk classi whore red velvet cupcak
3,"@Jason_Gio meh. :P thanks for the heads up, b...",0,jasongio meh p thank head concern anoth an...
4,@RudhoeEnglish This is an ISIS account pretend...,0,rudhoeenglish isi account pretend kurdish acc...


# Tokenizing the text

In [19]:
tokenizer=Tokenizer()

In [20]:
tokenizer.fit_on_texts(df['clean_text'].values)

In [21]:
sequences = tokenizer.texts_to_sequences(df['clean_text'].values)

In [22]:
len(tokenizer.word_index)

53268

In [23]:
sequences[:2] # Preview of sequences

[[99, 2158, 638, 18494, 35],
 [18495, 36, 35, 12413, 7350, 168, 12414, 18496, 6015, 18497, 236]]

# Padding the sequences

In [24]:
from keras.preprocessing.sequence import pad_sequences

In [25]:
max_len=40

In [26]:
padded_sequences=pad_sequences(sequences, maxlen=max_len, padding='post')

In [27]:
padded_sequences.shape

(47656, 40)

In [28]:
from keras.utils import to_categorical

In [29]:
x=padded_sequences
y=to_categorical(df['target'].values)

In [30]:
print(f'x has shape: {x.shape} and y has shape :{y.shape}')

x has shape: (47656, 40) and y has shape :(47656, 6)


In [31]:
from sklearn.model_selection import train_test_split

In [32]:
x_train, x_test, y_train, y_test =  train_test_split(x, y , test_size=0.1, random_state=42)

In [33]:
print(f'Traning has samples: {x_train.shape[0]} , Testing has samples: {x_test.shape[0]}')

Traning has samples: 42890 , Testing has samples: 4766


# Model Preparation

In [34]:
def build_model(embedding_dim , latent_dim , vocab_size , max_len, target_values):
    inp=Input(max_len, )
    emb=Embedding(vocab_size, embedding_dim , input_length=max_len)(inp)
    lstm_1=Bidirectional(LSTM(latent_dim, return_sequences=True))(emb)
    lstm_2=Bidirectional(LSTM(latent_dim))(lstm_1)
    out=Dense(target_values, activation='softmax')(lstm_2)
    model=Model(inp , out)
    return model

In [35]:
!pip install wandb



In [36]:
import wandb
wandb.login(key="ba7560e300da0402f0be06a309b33c370c62acfe")
wandb.init(project='Cyberbullying_Classification_v2')

[34m[1mwandb[0m: Currently logged in as: [33mawaistahseen009[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\Awais\.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

In [37]:
wandb.config={
'batch_size':256,
'epochs':30,
'latent_dim':150,
'embedding_dim':300,
'optimizer':'rmsprop',
}
configuration=wandb.config
from wandb.keras import WandbCallback

In [38]:
vocab_size=len(tokenizer.word_index)
target_values=len(df['target'].unique())

In [39]:
target_values

6

In [40]:
model=build_model(configuration['embedding_dim'], configuration['latent_dim'], vocab_size+1 ,max_len, target_values )




In [41]:
model.compile(optimizer=configuration['optimizer'], loss='categorical_crossentropy' , metrics=['accuracy'] )




In [42]:
from tensorflow.keras.callbacks import EarlyStopping
early_s = EarlyStopping(monitor='val_loss',verbose=1,patience=10)

In [43]:
model.fit(x_train, y_train,
          epochs=configuration['epochs'] , 
          batch_size=configuration['batch_size'],
          validation_split=0.2,
          callbacks=[early_s, WandbCallback()]
         )



Epoch 1/30



  saving_api.save_model(


INFO:tensorflow:Assets written to: C:\Users\Awais\Desktop\Machine_learning_ETEP\Cyberbullying_Classification\cyber-bullying-classification\wandb\run-20240204_124259-5vfpyhdr\files\model-best\assets


INFO:tensorflow:Assets written to: C:\Users\Awais\Desktop\Machine_learning_ETEP\Cyberbullying_Classification\cyber-bullying-classification\wandb\run-20240204_124259-5vfpyhdr\files\model-best\assets
[34m[1mwandb[0m: Adding directory to artifact (C:\Users\Awais\Desktop\Machine_learning_ETEP\Cyberbullying_Classification\cyber-bullying-classification\wandb\run-20240204_124259-5vfpyhdr\files\model-best)... Done. 0.6s


Epoch 2/30

  saving_api.save_model(


INFO:tensorflow:Assets written to: C:\Users\Awais\Desktop\Machine_learning_ETEP\Cyberbullying_Classification\cyber-bullying-classification\wandb\run-20240204_124259-5vfpyhdr\files\model-best\assets


INFO:tensorflow:Assets written to: C:\Users\Awais\Desktop\Machine_learning_ETEP\Cyberbullying_Classification\cyber-bullying-classification\wandb\run-20240204_124259-5vfpyhdr\files\model-best\assets
[34m[1mwandb[0m: Adding directory to artifact (C:\Users\Awais\Desktop\Machine_learning_ETEP\Cyberbullying_Classification\cyber-bullying-classification\wandb\run-20240204_124259-5vfpyhdr\files\model-best)... Done. 0.4s


Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 12: early stopping


<keras.src.callbacks.History at 0x2d79a0aa050>

In [44]:
evaluation_result = model.evaluate(x_test, y_test)

# Printing the evaluation results
print("Evaluation Loss:", evaluation_result[0])
print("Evaluation Accuracy:", evaluation_result[1])

Evaluation Loss: 0.6704307794570923
Evaluation Accuracy: 0.8168275356292725


In [45]:
model.save('cyber_model.keras')

In [46]:
wandb.finish()

VBox(children=(Label(value='43.612 MB of 398.380 MB uploaded\r'), FloatProgress(value=0.10947414705059802, max…

0,1
accuracy,▁▅▆▇▇▇██████
epoch,▁▂▂▃▄▄▅▅▆▇▇█
loss,█▄▃▃▂▂▂▁▁▁▁▁
val_accuracy,▁▆▅▅█▇▅▆▄▅▃▇
val_loss,▅▁▃▃▂▄▄▄▇▄█▆

0,1
accuracy,0.95255
best_epoch,1.0
best_val_loss,0.43506
epoch,11.0
loss,0.11034
val_accuracy,0.8101
val_loss,0.71156
