In [1]:
import pandas
from nltk import word_tokenize
from gensim.models import Word2Vec

In [2]:
twitter_df = pandas.read_csv('processed_cyberbullying_tweets.csv')
tweets = twitter_df['processed_tweet_text'].astype(str).tolist()

In [3]:
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in tweets]
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

In [4]:
model.wv['food']

array([-0.16507632,  0.378619  ,  0.42582017,  0.09427286,  0.27047658,
       -0.616987  , -0.03579869,  0.5923699 , -0.2402382 , -0.6410478 ,
       -0.5465586 , -0.7373149 ,  0.17662947,  0.5866825 ,  0.31872606,
       -0.49406663,  0.7272476 , -1.1522293 , -0.03014559, -0.3891347 ,
        0.1614495 ,  0.48566505, -0.20668092, -0.00683245, -0.31284314,
        0.93129146, -0.33948398,  0.29610497, -0.6009659 , -0.6264338 ,
        0.38470724,  0.29574358, -0.7474082 , -0.13250948, -0.00336032,
        0.02354942,  0.5641148 , -0.04870421, -0.14749198, -0.23952669,
       -0.27287707,  0.01315786,  0.5902684 , -1.0756754 , -0.04096899,
       -0.3709021 , -0.6761695 ,  0.06842796,  0.48521775,  0.24698234,
        0.13484697, -0.18559156, -0.05258599, -0.3982558 , -0.5025895 ,
       -0.01351647,  0.8634048 ,  0.06611503, -0.17563474, -0.06008829,
       -0.22269532, -0.40564618, -0.10571008,  0.40170297, -1.109385  ,
        0.20466815,  0.30048367, -0.34155652,  0.14841117,  1.36

In [5]:
vocab_size = len(model.wv.key_to_index)
vocab_size

52456

In [6]:
twitter_df['word_embeddings'] = twitter_df.apply(lambda x : list(), axis=1)

for index, row in twitter_df.iterrows():
    for word in word_tokenize(str(row['processed_tweet_text']).lower()):
        row['word_embeddings'].append(model.wv[word])
    while len(row['word_embeddings']) < 50:
        row['word_embeddings'].append([0]*100)
    

In [7]:
len(twitter_df.loc[0, 'word_embeddings'])

50

In [8]:
len(twitter_df.loc[0, 'word_embeddings'][0])

100

In [9]:
model.wv['words']

array([ 0.01396496,  0.03674521,  0.33667824, -0.18725915,  0.05866458,
       -0.64042145,  0.18733108,  0.5918964 , -0.15871692, -0.8563191 ,
       -0.17198688, -1.1738398 , -0.00602487,  0.27594268,  0.44476044,
       -0.07357663,  0.5433722 , -1.3748239 , -0.23401612, -0.87440807,
        0.15573582,  0.34856936,  0.07077159, -0.22229286, -0.08623464,
        1.1153108 , -0.7064376 ,  0.34752527, -0.5274295 , -0.43344584,
        0.4718638 ,  0.2722596 , -0.0817119 , -0.14780602,  0.00902986,
        0.3234242 ,  0.6845311 , -0.2234101 , -0.20537469, -0.59433836,
       -0.48786896, -0.12891798,  0.64297706, -0.96116126,  0.13271472,
       -0.51004785, -0.32792   ,  0.36148915,  0.9425134 ,  0.65676916,
        0.38488388, -0.38646173,  0.07120061, -0.07105207, -0.57854754,
       -0.12074959,  0.9603054 , -0.06652799, -0.62223744, -0.21678413,
       -0.33073157, -0.16109386,  0.4395191 ,  0.81728804, -1.2463111 ,
        0.22458728,  0.40275994,  0.18307427, -0.32781166,  1.68

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X = twitter_df['word_embeddings'].tolist()
y = twitter_df['cyberbullying_type'].tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

In [13]:
# Define the model
bilstm = Sequential()

# Embedding layer
bilstm.add(Embedding(input_dim=52456, output_dim=100, input_length=50))

# Bidirectional LSTM layer
bilstm.add(Bidirectional(LSTM(64)))

# Additional Dense layer for more complex transformations
bilstm.add(Dense(64, activation='relu'))  # Added intermediate Dense layer with 64 units

# Output layer for binary classification
bilstm.add(Dense(1, activation='sigmoid'))

# Compile the model
bilstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary to see the updated architecture
bilstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 100)           5245600   
                                                                 
 bidirectional (Bidirection  (None, 128)               84480     
 al)                                                             
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5338401 (20.36 MB)
Trainable params: 5338401 (20.36 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [14]:
bilstm.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

: 