In [42]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [43]:
def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])

In [29]:
df = pd.read_csv("https://raw.githubusercontent.com/Youssef3082004/Tawasul/nlp/Cleaned_Emotions_2.csv")
df

Unnamed: 0,CleanedText,Emotion
0,feel really helpless heavy hearted,Fear
1,enjoy able slouch relax unwind frankly need la...,Sadness
2,give internship dmrg feel distraught,Fear
3,know feel lose,Sadness
4,kindergarten teacher thoroughly weary job take...,Fear
...,...,...
416103,feel like tell horny devil find site suit sort...,Love
416104,begin realize feel agitate restless would thin...,Anger
416105,feel curious previous early dawn time seek tro...,Surprise
416106,feel becuase tyranical nature government el sa...,Anger


In [30]:
max_len = len(df["CleanedText"].max().split(" "))
print(f"Longest Sentence in the dataset is has lenght = {max_len} word")

Longest Sentence in the dataset is has lenght = 11 word


In [31]:
X = df["CleanedText"]
Y = df["Emotion"]
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [32]:
VOCAB_SIZE = 20000
MAX_LEN = 100
NUM_CLASSES = 6

In [33]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

In [34]:
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [35]:
X_train_padded = pad_sequences(X_train_sequences, maxlen=MAX_LEN, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=MAX_LEN, padding='post', truncating='post')


In [37]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_labels = label_encoder.fit_transform(y_train)
y_test_labels = label_encoder.transform(y_test)

NUM_CLASSES = len(label_encoder.classes_)

In [None]:
y_train_encoded = to_categorical(y_train_labels, num_classes=NUM_CLASSES)
y_test_encoded = to_categorical(y_test_labels, num_classes=NUM_CLASSES)

In [41]:
weights = class_weight.compute_class_weight(class_weight='balanced',classes=np.unique(y_train),y=y_train.values)
class_weights = dict(enumerate(weights))
print(class_weights)

{0: np.float64(1.2110053695376959), 1: np.float64(1.4563471230575389), 2: np.float64(0.4928183764289965), 3: np.float64(2.002851882603516), 4: np.float64(0.5731389847317204), 5: np.float64(4.664620817218766)}


In [45]:
VOCAB_SIZE = 20000
MAX_LEN = 100
NUM_CLASSES = 6

model = Sequential([
    Embedding(VOCAB_SIZE, 256),
    Bidirectional(LSTM(128, return_sequences=True, use_bias=True)),
    Dropout(0.5),
    Bidirectional(LSTM(64, use_bias=True)),
    Dropout(0.5),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])


early_stopping = EarlyStopping(monitor='val_loss', patience = 6, restore_best_weights=True)

In [None]:
history = model.fit(X_train_padded,y_train_encoded,epochs=20,batch_size=64,validation_data=(X_test_padded, y_test_encoded),callbacks=[early_stopping],class_weight=class_weights)

Epoch 1/20
[1m 320/5202[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1:14:59[0m 922ms/step - accuracy: 0.3281 - loss: 1.4813

In [None]:
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(None, 1)
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None)