In [1]:
import pandas as pd
from nltk import word_tokenize
from gensim.models import Word2Vec

twitter_df = pd.read_csv('processed_cyberbullying_tweets.csv')
tweets = twitter_df['processed_tweet_text'].astype(str).tolist()

tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in tweets]
model_w2v = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

model_w2v.wv['food']


vocab_size = len(model_w2v.wv.key_to_index)
vocab_size

twitter_df['word_embeddings'] = twitter_df.apply(lambda x : list(), axis=1)

for index, row in twitter_df.iterrows():
    for word in word_tokenize(str(row['processed_tweet_text']).lower()):
        row['word_embeddings'].append(model_w2v.wv[word])
    while len(row['word_embeddings']) < 50:
        row['word_embeddings'].append([0]*100)
    

len(twitter_df.loc[0, 'word_embeddings'])
len(twitter_df.loc[0, 'word_embeddings'][0])


model_w2v.wv['words']

from sklearn.model_selection import train_test_split

X = twitter_df['word_embeddings'].tolist()
y = twitter_df['cyberbullying_type'].tolist()

data_0 = twitter_df[twitter_df['cyberbullying_type'] == 0]
data_1 = twitter_df[twitter_df['cyberbullying_type'] == 1]

sampled_data_0 = data_0.sample(n=7000, random_state=42)
sampled_data_1 = data_1.sample(n=20000, random_state=42)

balanced_df = pd.concat([sampled_data_0, sampled_data_1])

X_dec = balanced_df['word_embeddings'].tolist()
y_dec = balanced_df['cyberbullying_type'].tolist()

# Shuffle the resulting DataFrame to mix the examples of 0s and 1s
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

X_train, X_test, y_train, y_test = train_test_split(X_dec, y_dec, test_size=0.3, random_state=42)



In [2]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming X_train, X_test are lists of lists of word embeddings, and y_train, y_test are labels
# Since the input data is already in the form of embeddings, we don't need an embedding layer


In [3]:

# Padding sequences to ensure uniform input size
max_length = max(max(len(seq) for seq in X_train), max(len(seq) for seq in X_test))
X_train_padded = pad_sequences(X_train, maxlen=max_length, padding='post', dtype='float32')
X_test_padded = pad_sequences(X_test, maxlen=max_length, padding='post', dtype='float32')




In [4]:

model = Sequential()
model.add(Bidirectional(LSTM(64, return_sequences=True), input_shape=(max_length, 100)))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1, activation='sigmoid'))  # Use 'softmax' for multi-class classification

# Compiling the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])  # Use 'categorical_crossentropy' for multi-class classification

# Training the model
model.fit(X_train_padded, np.array(y_train), epochs=5, batch_size=64, validation_split=0.2)

# Evaluating the model
loss, accuracy = model.evaluate(X_test_padded, np.array(y_test))
print(f'Test Accuracy: {accuracy}')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.8239505887031555


In [16]:
def predict_sentiment(sentence, model, word2vec_model):
    # Tokenize and convert to embeddings
    tokens = word_tokenize(sentence.lower())
    print(tokens)
    embeddings = [word2vec_model.wv[token] if token in word2vec_model.wv else np.zeros(100) for token in tokens]
    
    # Pad the sequence
    padded_embeddings = pad_sequences([embeddings], maxlen=max_length, padding='post', dtype='float32')
    print(padded_embeddings)
    # Predict
    prediction = model.predict(padded_embeddings)
    print(prediction)
    # Assuming binary classification with a sigmoid output layer
    predicted_label = 'Positive' if prediction[0][0] < 0.5 else 'Negative'
    
    return predicted_label, prediction[0][0]

# Example usage
test_sentence = "You are fucking ugly as fuck you motherfucker"
predicted_label, confidence = predict_sentiment(test_sentence, model, model_w2v)  # 'model' refers to your trained LSTM model, and 'model' is your Word2Vec model

print(f"Sentence: '{test_sentence}'")
print(f"Predicted Sentiment: {predicted_label} (Confidence: {confidence:.2f})")

['you', 'are', 'fucking', 'ugly', 'as', 'fuck', 'you', 'motherfucker']
[[[ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.34628174 -0.38973066  0.37605566 ... -1.5988     -0.1268468
    0.05593154]
  ...
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]]]
[[0.96600205]]
Sentence: 'You are fucking ugly as fuck you motherfucker'
Predicted Sentiment: Negative (Confidence: 0.97)
