In [25]:
import tensorflow as tf
from tensorflow import keras
from scipy.stats import uniform
from keras.models import Sequential
from keras.layers import Embedding, Dropout, LSTM, Dense, Attention, Bidirectional
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from scikeras.wrappers import KerasClassifier
from keras import regularizers
from sklearn.model_selection import  RandomizedSearchCV
from collections import Counter
from sklearn.metrics import make_scorer, f1_score
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer

# define attention layer
from keras.layers import Layer
from keras import backend as K

class AttentionLayer(Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        self.W = self.add_weight(name='attention_weights', 
                                 shape=(input_shape[-1], 1),
                                 initializer='uniform',
                                 trainable=True)
        super(AttentionLayer, self).build(input_shape)

    def call(self, x):
        # Compute attention scores
        attention_scores = K.dot(x, self.W)
        attention_scores = K.squeeze(attention_scores, axis=-1)
        attention_weights = K.softmax(attention_scores)

        # Apply attention weights
        weighted_input = x * K.expand_dims(attention_weights)

        # Sum over timesteps
        context_vector = K.sum(weighted_input, axis=1)

        return context_vector

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

# Define early stopping
from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=4)

#f1_scorer = make_scorer(f1_score, average='micro')

# define the model
def create_lstm_model(units, vocab_length, embedding_matrix, max_len, dropout):
    lstm_model = Sequential()
    embedding_layer = Embedding(vocab_length, 200, weights=[embedding_matrix], input_length=max_len, trainable=False)
    lstm_model.add(embedding_layer)
    lstm_model.add(Dropout(dropout))
    lstm_model.add(Bidirectional(LSTM(units, return_sequences=True)))
    lstm_model.add(AttentionLayer())
    lstm_model.add(Dense(5, activation='softmax', kernel_initializer='ones'))
    lstm_model.compile(optimizer='adam', loss="sparse_categorical_crossentropy")
    return lstm_model


# fix random seed for reproducibility
#tf.random.set_seed(7)

df = pd.read_csv('../../data/normalized_tweets.csv')
df = df[df['cyberbullying_type'] != 'other_cyberbullying']
# Reset index after filtering out the class
df.reset_index(drop=True, inplace=True)
#df["cyberbullying_type"].value_counts()

### try different length based on tweet lentgh
df['text_len'] = [len(text.split()) for text in df.tweet_text]
#max_len = np.max(df['text_len'])
#avg_len = np.mean(df['text_len'])
#avg_len = int(avg_len)
#print(avg_len)
avg_len = 35
# checks on tweets length
count = (df['text_len'] >= 35).sum()
print("Number of values greater than or equal to 35:", count)


X, y = df["tweet_text"], df["cyberbullying_type"]

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform label encoder on the target variable
y = label_encoder.fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#Each word in input used as a key, while a unique index is used as the value of the key 
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(x_train)

X_train = word_tokenizer.texts_to_sequences(x_train)
X_test = word_tokenizer.texts_to_sequences(x_test)

vocab_length = len(word_tokenizer.word_index) + 1

print(vocab_length)

####

X_train = pad_sequences(X_train, padding = 'pre', maxlen = avg_len)
X_test = pad_sequences(X_test, padding = 'pre', maxlen = avg_len)

# Load GloVe word embeddings and create a dictionary that willl contain words as keys, and their corresponging embedding list as values. 
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()
glove_file = open('../../glove_embeddings/glove.twitter.27B.200d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

embedding_matrix = zeros((vocab_length, 200)) ## change if the dimention of embedding changes above
i = 0
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
    else:
        i = i + 1
print(i)

ImportError: cannot import name 'F1Score' from 'sklearn.metrics' (/opt/anaconda3/envs/tf_env4/lib/python3.8/site-packages/sklearn/metrics/__init__.py)

In [None]:
X_train[1]

array([    0,     0,     0,     0,     0,     0,  5109,  1040,   625,
          80,   385,  3475,  1540,  1119,  1362,    33,   170,    94,
          46,  2691,  2589,    33,   110,  2090,  1441,   928,  1541,
        3943,   957, 13338,   110,    94,    14,   102,   302],
      dtype=int32)

In [None]:
#import warnings
#warnings.filterwarnings("ignore")

model = KerasClassifier(model=create_lstm_model, units=256, batch_size=64, dropout=0.2, validation_split=0.2,optimizer__learning_rate=0.1, vocab_length=vocab_length, embedding_matrix=embedding_matrix, max_len=avg_len)

# Define the grid search parameters
param_grid = dict(optimizer__learning_rate=[0.001, 0.01, 0.1],
                dropout=[0.1, 0.5], epochs=[20],
                units=[64, 128], batch_size=[32, 64])

# Perform grid search
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid,refit=True,cv=3)
grid_result = grid.fit(X_train, y_train, callbacks=[early_stopping])

ValueError: 
All the 30 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/tf_env4/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/tf_env4/lib/python3.8/site-packages/scikeras/wrappers.py", line 1491, in fit
    super().fit(X=X, y=y, sample_weight=sample_weight, **kwargs)
  File "/opt/anaconda3/envs/tf_env4/lib/python3.8/site-packages/scikeras/wrappers.py", line 760, in fit
    self._fit(
  File "/opt/anaconda3/envs/tf_env4/lib/python3.8/site-packages/scikeras/wrappers.py", line 915, in _fit
    X, y = self._initialize(X, y)
  File "/opt/anaconda3/envs/tf_env4/lib/python3.8/site-packages/scikeras/wrappers.py", line 852, in _initialize
    self.model_ = self._build_keras_model()
  File "/opt/anaconda3/envs/tf_env4/lib/python3.8/site-packages/scikeras/wrappers.py", line 429, in _build_keras_model
    model = final_build_fn(**build_params)
  File "/var/folders/tz/9k5bcs_502z2svd2fr_k2w680000gn/T/ipykernel_927/4019297405.py", line 66, in create_lstm_model
    lstm_model.compile(optimizer='adam', loss="sparse_categorical_crossentropy", scoring="f1")
  File "/opt/anaconda3/envs/tf_env4/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/opt/anaconda3/envs/tf_env4/lib/python3.8/site-packages/keras/src/engine/training.py", line 3787, in _validate_compile
    raise TypeError(
TypeError: Invalid keyword argument(s) in `compile()`: ({'scoring'},). Valid keyword arguments include "cloning", "experimental_run_tf_function", "distribute", "target_tensors", or "sample_weight_mode".


In [None]:
# Summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
#means = grid_result.cv_results_['mean_test_score']
#stds = grid_result.cv_results_['std_test_score']
#params = grid_result.cv_results_['params']
#for mean, stdev, param in zip(means, stds, params):
    #print("%f (%f) wit h: %r" % (mean, stdev, param))

Best: 0.932100 using {'units': 128, 'optimizer__learning_rate': 0.005, 'epochs': 6, 'dropout': 0.5, 'batch_size': 16}


In [None]:
#from joblib import dump, load

# save model
estimator = grid_result.best_estimator_
#dump(estimator, "model_093.joblib")
# Somewhere else
#estimator = load("your-model.joblib")
estimator