# GloVe

In [None]:
# Glove

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, Attention, GlobalAveragePooling1D
from tensorflow.keras.callbacks import EarlyStopping

# Load the corpus. The columns read below are 'excerpt_value_cleaned' (input
# text) and 'plutchik_emotion' (target label).
df = pd.read_csv('english_only.csv') 

# Split configuration. TEST_SIZE is a fraction of the full dataset, while
# VALIDATE_SIZE is a fraction of what remains after the test split:
# 0.1765 * 0.85 ~= 0.15, i.e. roughly a 70/15/15 train/validation/test split.
TEST_SIZE = 0.15
VALIDATE_SIZE = 0.1765
# Fixed seed shared by both train_test_split calls so the splits are repeatable.
RANDOM_STATE_INT = 14988828
batch_size = 32
epochs = 10  

# Convert every text into a sequence of integer word ids and pad/truncate it to
# max_len entries. num_words=None keeps the full vocabulary.
# NOTE(review): the tokenizer is fitted on all rows, including those that end up
# in the validation/test splits — confirm this is intended.
max_len = 128
tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(df['excerpt_value_cleaned'])
sequences = tokenizer.texts_to_sequences(df['excerpt_value_cleaned'])
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Map each distinct emotion label to an integer id; label_encoder.classes_ is
# reused later for the confusion-matrix tick labels.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['plutchik_emotion'])

# First carve out the test set, then split the remainder into train/validation.
# NOTE(review): neither split is stratified — consider stratify= if the emotion
# classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=TEST_SIZE, random_state=RANDOM_STATE_INT)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=VALIDATE_SIZE, random_state=RANDOM_STATE_INT)

def load_glove_embeddings(glove_file_path, embedding_dim, tokenizer_word_index):
    """Build an embedding matrix for the tokenizer vocabulary from a GloVe text file.

    The file is streamed line by line and only vectors whose word occurs in
    ``tokenizer_word_index`` are parsed, so the full pretrained vocabulary is
    never held in memory.

    Args:
        glove_file_path: Path to a whitespace-separated text file with one
            ``word v1 v2 ... vN`` entry per line.
        embedding_dim: Number of vector components expected per word.
        tokenizer_word_index: Mapping ``word -> row index`` (indices start at 1;
            row 0 is left as zeros).

    Returns:
        A ``(len(tokenizer_word_index) + 1, embedding_dim)`` array. Rows of
        words that are missing from the file stay all-zero.

    Raises:
        ValueError: If a vocabulary word is found with a numeric vector whose
            length differs from ``embedding_dim``.
    """
    embedding_matrix = np.zeros((len(tokenizer_word_index) + 1, embedding_dim))

    with open(glove_file_path, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue

            row = tokenizer_word_index.get(values[0])
            if row is None:
                # Out-of-vocabulary word: skip it without parsing its vector.
                continue

            try:
                vector = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Non-numeric field: the token itself contained whitespace, so
                # the first field is not really this vocabulary word.
                continue

            if vector.shape[0] != embedding_dim:
                raise ValueError(
                    f"Vector for {values[0]!r} has {vector.shape[0]} dimensions, "
                    f"expected {embedding_dim}."
                )
            # A later duplicate entry overwrites an earlier one.
            embedding_matrix[row] = vector

    return embedding_matrix

# Build the embedding matrix for the tokenizer vocabulary from the GloVe file.
embedding_dim = 300
glove_file_path = 'glove.6B.300d.txt'
embedding_matrix = load_glove_embeddings(glove_file_path, embedding_dim, tokenizer.word_index)

dropout_rate = 0.5
# Size the classifier from the encoded labels instead of hard-coding 8, so the
# softmax width always matches the ids produced by label_encoder.
num_classes = len(label_encoder.classes_)

# Frozen pretrained embeddings -> BiLSTM over the whole sequence.
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(len(tokenizer.word_index) + 1, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input_layer)
bi_lstm = Bidirectional(LSTM(128, return_sequences=True))(embedding_layer)

# Query and value are both the BiLSTM output; the attended sequence is then
# averaged into one vector per sample.
attention = Attention()([bi_lstm, bi_lstm])
context_vector = GlobalAveragePooling1D()(attention)

# Classification head with dropout before and after the hidden layer.
dropout_layer = Dropout(dropout_rate)(context_vector)
dense_layer_1 = Dense(64, activation='relu')(dropout_layer)
dropout_layer_2 = Dropout(dropout_rate)(dense_layer_1)
output_layer = Dense(num_classes, activation='softmax')(dropout_layer_2)

model = Model(inputs=input_layer, outputs=output_layer)

# Labels are integer ids, hence the sparse variant of categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Stop once val_loss has not improved for 3 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=batch_size, epochs=epochs, callbacks=[early_stopping])

# Evaluate on the held-out test split.
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# zero_division=0 yields the same value scikit-learn falls back to (0) for
# classes without predictions/samples, but without emitting a warning per call.
accuracy = accuracy_score(y_test, y_pred_classes)
f1_w = f1_score(y_test, y_pred_classes, average='weighted', zero_division=0)
recall_w = recall_score(y_test, y_pred_classes, average='weighted', zero_division=0)
precision_w = precision_score(y_test, y_pred_classes, average='weighted', zero_division=0)
f1_m = f1_score(y_test, y_pred_classes, average='macro', zero_division=0)
recall_m = recall_score(y_test, y_pred_classes, average='macro', zero_division=0)
precision_m = precision_score(y_test, y_pred_classes, average='macro', zero_division=0)

print("Accuracy:", accuracy)
print("F1 Score (Weighted):", f1_w)
print("F1 Score (Macro):", f1_m)
print("Recall (Weighted):", recall_w)
print("Recall (Macro):", recall_m)
print("Precision (Weighted):", precision_w)
print("Precision (Macro):", precision_m)

# Pin the label set so the matrix always has one row/column per encoded class,
# even if a class is absent from both y_test and the predictions; otherwise the
# matrix could be smaller than the tick-label list used below.
cm = confusion_matrix(y_test, y_pred_classes, labels=np.arange(num_classes))
print(cm)

emotion_categories = label_encoder.classes_

plt.figure(figsize=(10,7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=emotion_categories, yticklabels=emotion_categories)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

# HistWords

In [None]:
# HistWords

import pandas as pd
import numpy as np
import pickle
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, Attention, GlobalAveragePooling1D
from tensorflow.keras.callbacks import EarlyStopping

# Load the corpus. The columns read below are 'excerpt_value_cleaned' (input
# text) and 'plutchik_emotion' (target label).
df = pd.read_csv('english_only.csv')

# Split configuration. TEST_SIZE is a fraction of the full dataset, while
# VALIDATE_SIZE is a fraction of what remains after the test split:
# 0.1765 * 0.85 ~= 0.15, i.e. roughly a 70/15/15 train/validation/test split.
TEST_SIZE = 0.15
VALIDATE_SIZE = 0.1765
# Fixed seed shared by both train_test_split calls so the splits are repeatable.
RANDOM_STATE_INT = 14988828
batch_size = 32
# NOTE(review): this cell allows up to 20 epochs — confirm that an epoch budget
# different from other experiments on this dataset is intentional.
epochs = 20

# Convert every text into a sequence of integer word ids and pad/truncate it to
# max_len entries. num_words=None keeps the full vocabulary.
# NOTE(review): the tokenizer is fitted on all rows, including those that end up
# in the validation/test splits — confirm this is intended.
max_len = 128
tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(df['excerpt_value_cleaned'])
sequences = tokenizer.texts_to_sequences(df['excerpt_value_cleaned'])
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Map each distinct emotion label to an integer id; label_encoder.classes_ is
# reused later for the confusion-matrix tick labels.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['plutchik_emotion'])

# First carve out the test set, then split the remainder into train/validation.
# NOTE(review): neither split is stratified — consider stratify= if the emotion
# classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=TEST_SIZE, random_state=RANDOM_STATE_INT)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=VALIDATE_SIZE, random_state=RANDOM_STATE_INT)

def load_multiple_histwords_embeddings(embedding_files_path):
    """Merge per-decade HistWords embeddings into one ``word -> vector`` dict.

    The directory is scanned for ``<decade>-vocab.pkl`` files, each paired with
    a ``<decade>-w.npy`` matrix whose row ``i`` belongs to vocabulary entry
    ``i``. Files are processed in sorted name order, and for every word the
    first non-zero vector encountered is kept.

    Args:
        embedding_files_path: Directory containing the vocab/vector file pairs.

    Returns:
        Dict mapping each word to a 1-D vector (a row of the loaded matrix).
        Words whose vector is all-zero in every processed file are omitted.
    """
    vocab_suffix = '-vocab.pkl'
    embedding_index = {}

    # os.listdir returns entries in arbitrary order; sorting makes "first seen
    # wins" deterministic across machines and runs.
    # NOTE(review): ascending order means the earliest decade wins — iterate in
    # reverse if the most recent usage should take precedence.
    for file in sorted(os.listdir(embedding_files_path)):
        if not file.endswith(vocab_suffix):
            continue

        decade = file[:-len(vocab_suffix)]
        vocab_path = os.path.join(embedding_files_path, file)
        vectors_path = os.path.join(embedding_files_path, f"{decade}-w.npy")

        if not os.path.exists(vectors_path):
            print(f"File {vectors_path} not found, skipping.")
            continue

        # SECURITY: pickle.load can execute arbitrary code; only point this at
        # embedding files obtained from a trusted source.
        with open(vocab_path, 'rb') as f:
            vocab = pickle.load(f)

        word_vectors = np.load(vectors_path)

        for i, word in enumerate(vocab):
            if word in embedding_index:
                continue
            vector = word_vectors[i]
            # An all-zero row carries no information; presumably HistWords uses
            # it as a placeholder for words absent from a decade (confirm
            # against the dataset README). Skip it so a real vector from
            # another decade can be used instead.
            if not vector.any():
                continue
            embedding_index[word] = vector

    return embedding_index

# Load the merged HistWords vectors found under sgns/.
embedding_dim = 300
embedding_files_path = 'sgns/' 
embedding_index = load_multiple_histwords_embeddings(embedding_files_path)

# Row i holds the vector of the word with tokenizer id i; row 0 and the rows of
# words without a pretrained vector stay all-zero.
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

dropout_rate = 0.5
# Size the classifier from the encoded labels instead of hard-coding 8, so the
# softmax width always matches the ids produced by label_encoder.
num_classes = len(label_encoder.classes_)

# Frozen pretrained embeddings -> BiLSTM over the whole sequence.
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(len(tokenizer.word_index) + 1, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input_layer)
bi_lstm = Bidirectional(LSTM(128, return_sequences=True))(embedding_layer)

# Query and value are both the BiLSTM output; the attended sequence is then
# averaged into one vector per sample.
attention = Attention()([bi_lstm, bi_lstm])
context_vector = GlobalAveragePooling1D()(attention)

# Classification head with dropout before and after the hidden layer.
dropout_layer = Dropout(dropout_rate)(context_vector)
dense_layer_1 = Dense(64, activation='relu')(dropout_layer)
dropout_layer_2 = Dropout(dropout_rate)(dense_layer_1)
output_layer = Dense(num_classes, activation='softmax')(dropout_layer_2)

model = Model(inputs=input_layer, outputs=output_layer)

# Labels are integer ids, hence the sparse variant of categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Stop once val_loss has not improved for 3 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=batch_size, epochs=epochs, callbacks=[early_stopping])

# Evaluate on the held-out test split.
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# zero_division=0 yields the same value scikit-learn falls back to (0) for
# classes without predictions/samples, but without emitting a warning per call.
accuracy = accuracy_score(y_test, y_pred_classes)
f1_w = f1_score(y_test, y_pred_classes, average='weighted', zero_division=0)
recall_w = recall_score(y_test, y_pred_classes, average='weighted', zero_division=0)
precision_w = precision_score(y_test, y_pred_classes, average='weighted', zero_division=0)
f1_m = f1_score(y_test, y_pred_classes, average='macro', zero_division=0)
recall_m = recall_score(y_test, y_pred_classes, average='macro', zero_division=0)
precision_m = precision_score(y_test, y_pred_classes, average='macro', zero_division=0)

print("Accuracy:", accuracy)
print("F1 Score (Weighted):", f1_w)
print("F1 Score (Macro):", f1_m)
print("Recall (Weighted):", recall_w)
print("Recall (Macro):", recall_m)
print("Precision (Weighted):", precision_w)
print("Precision (Macro):", precision_m)

# Pin the label set so the matrix always has one row/column per encoded class,
# even if a class is absent from both y_test and the predictions; otherwise the
# matrix could be smaller than the tick-label list used below.
cm = confusion_matrix(y_test, y_pred_classes, labels=np.arange(num_classes))
print(cm)

emotion_categories = label_encoder.classes_

plt.figure(figsize=(10,7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=emotion_categories, yticklabels=emotion_categories)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()


# FastText

In [None]:
# FastText

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, Attention, GlobalAveragePooling1D
from tensorflow.keras.callbacks import EarlyStopping

# Load the corpus. The columns read below are 'excerpt_value_cleaned' (input
# text) and 'plutchik_emotion' (target label).
df = pd.read_csv('english_only.csv')

# Split configuration. TEST_SIZE is a fraction of the full dataset, while
# VALIDATE_SIZE is a fraction of what remains after the test split:
# 0.1765 * 0.85 ~= 0.15, i.e. roughly a 70/15/15 train/validation/test split.
TEST_SIZE = 0.15
VALIDATE_SIZE = 0.1765
# Fixed seed shared by both train_test_split calls so the splits are repeatable.
RANDOM_STATE_INT = 14988828
batch_size = 32
epochs = 10 

# Convert every text into a sequence of integer word ids and pad/truncate it to
# max_len entries. num_words=None keeps the full vocabulary.
# NOTE(review): the tokenizer is fitted on all rows, including those that end up
# in the validation/test splits — confirm this is intended.
max_len = 128
tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(df['excerpt_value_cleaned'])
sequences = tokenizer.texts_to_sequences(df['excerpt_value_cleaned'])
padded_sequences = pad_sequences(sequences, maxlen=max_len)

# Map each distinct emotion label to an integer id; label_encoder.classes_ is
# reused later for the confusion-matrix tick labels.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['plutchik_emotion'])

# First carve out the test set, then split the remainder into train/validation.
# NOTE(review): neither split is stratified — consider stratify= if the emotion
# classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=TEST_SIZE, random_state=RANDOM_STATE_INT)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=VALIDATE_SIZE, random_state=RANDOM_STATE_INT)

def load_fasttext_embeddings(fasttext_file_path, embedding_dim, tokenizer_word_index):
    """Build an embedding matrix for the tokenizer vocabulary from a fastText ``.vec`` file.

    The first line of the file is treated as a header and skipped. The
    remaining lines are streamed, and only vectors whose word occurs in
    ``tokenizer_word_index`` are parsed, so the full pretrained vocabulary is
    never held in memory.

    Args:
        fasttext_file_path: Path to a space-separated text file: one header
            line followed by one ``word v1 v2 ... vN`` entry per line.
        embedding_dim: Number of vector components expected per word.
        tokenizer_word_index: Mapping ``word -> row index`` (indices start at 1;
            row 0 is left as zeros).

    Returns:
        A ``(len(tokenizer_word_index) + 1, embedding_dim)`` array. Rows of
        words that are missing from the file stay all-zero.

    Raises:
        ValueError: If a vocabulary word is found with a numeric vector whose
            length differs from ``embedding_dim``.
    """
    embedding_matrix = np.zeros((len(tokenizer_word_index) + 1, embedding_dim))

    with open(fasttext_file_path, 'r', encoding='utf8') as f:
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            values = line.rstrip().split(' ')

            row = tokenizer_word_index.get(values[0])
            if row is None:
                # Out-of-vocabulary word: skip it without parsing its vector.
                continue

            try:
                vector = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Non-numeric field: malformed entry, ignore it.
                continue

            if vector.shape[0] != embedding_dim:
                raise ValueError(
                    f"Vector for {values[0]!r} has {vector.shape[0]} dimensions, "
                    f"expected {embedding_dim}."
                )
            # A later duplicate entry overwrites an earlier one.
            embedding_matrix[row] = vector

    return embedding_matrix

# Build the embedding matrix for the tokenizer vocabulary from the fastText file.
embedding_dim = 300 
fasttext_file_path = 'crawl-300d-2M.vec'
embedding_matrix = load_fasttext_embeddings(fasttext_file_path, embedding_dim, tokenizer.word_index)

dropout_rate = 0.5
# Size the classifier from the encoded labels instead of hard-coding 8 (the
# number of emotions in Plutchik's wheel), so the softmax width always matches
# the ids produced by label_encoder.
num_classes = len(label_encoder.classes_)

# Frozen pretrained embeddings -> BiLSTM over the whole sequence.
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(len(tokenizer.word_index) + 1, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)(input_layer)
bi_lstm = Bidirectional(LSTM(128, return_sequences=True))(embedding_layer)

# Query and value are both the BiLSTM output; the attended sequence is then
# averaged into one vector per sample.
attention = Attention()([bi_lstm, bi_lstm])
context_vector = GlobalAveragePooling1D()(attention)

# Classification head with dropout before and after the hidden layer.
dropout_layer = Dropout(dropout_rate)(context_vector)
dense_layer_1 = Dense(64, activation='relu')(dropout_layer)
dropout_layer_2 = Dropout(dropout_rate)(dense_layer_1)
output_layer = Dense(num_classes, activation='softmax')(dropout_layer_2)

model = Model(inputs=input_layer, outputs=output_layer)

# Labels are integer ids, hence the sparse variant of categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Stop once val_loss has not improved for 3 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=batch_size, epochs=epochs, callbacks=[early_stopping])

# Evaluate on the held-out test split.
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# zero_division=0 yields the same value scikit-learn falls back to (0) for
# classes without predictions/samples, but without emitting a warning per call.
accuracy = accuracy_score(y_test, y_pred_classes)
f1_w = f1_score(y_test, y_pred_classes, average='weighted', zero_division=0)
recall_w = recall_score(y_test, y_pred_classes, average='weighted', zero_division=0)
precision_w = precision_score(y_test, y_pred_classes, average='weighted', zero_division=0)
f1_m = f1_score(y_test, y_pred_classes, average='macro', zero_division=0)
recall_m = recall_score(y_test, y_pred_classes, average='macro', zero_division=0)
precision_m = precision_score(y_test, y_pred_classes, average='macro', zero_division=0)

print("Accuracy:", accuracy)
print("F1 Score (Weighted):", f1_w)
print("F1 Score (Macro):", f1_m)
print("Recall (Weighted):", recall_w)
print("Recall (Macro):", recall_m)
print("Precision (Weighted):", precision_w)
print("Precision (Macro):", precision_m)

# Pin the label set so the matrix always has one row/column per encoded class,
# even if a class is absent from both y_test and the predictions; otherwise the
# matrix could be smaller than the tick-label list used below.
cm = confusion_matrix(y_test, y_pred_classes, labels=np.arange(num_classes))
print(cm)

emotion_categories = label_encoder.classes_

plt.figure(figsize=(10,7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=emotion_categories, yticklabels=emotion_categories)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()


# BERT or MacBERTh

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Bidirectional, LSTM, Dense, Dropout, Attention, GlobalAveragePooling1D, Lambda
from tensorflow.keras.callbacks import EarlyStopping
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf

# Enable memory growth on every visible GPU before any model is created, and
# report which devices were found.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    for device in physical_devices:
        tf.config.experimental.set_memory_growth(device, True)
    print(f'{len(physical_devices)} GPU(s) detected: {[device.name for device in physical_devices]}')
else:
    print('No GPU detected, using CPU.')

# Load the corpus. The columns read below are 'excerpt_value_cleaned' (input
# text) and 'plutchik_emotion' (target label).
df = pd.read_csv('/kaggle/input/msc-thesis-dataset/english_only.csv') 

# Split configuration. TEST_SIZE is a fraction of the full dataset, while
# VALIDATE_SIZE is a fraction of what remains after the test split:
# 0.1765 * 0.85 ~= 0.15, i.e. roughly a 70/15/15 train/validation/test split.
TEST_SIZE = 0.15
VALIDATE_SIZE = 0.1765
# Fixed seed shared by both train_test_split calls so the splits are repeatable.
RANDOM_STATE_INT = 14988828
batch_size = 32
epochs = 10  

# Maximum number of token ids per text passed to the tokenizer below.
max_len = 256
# NOTE(review): the cell title mentions MacBERTh, but the checkpoint loaded
# here is 'bert-base-uncased' — swap the name to run the other variant.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_texts(texts, tokenizer, max_len):
    """Tokenize texts into a fixed-width array of input ids.

    Every text is truncated and padded to exactly ``max_len`` ids.
    ``padding='max_length'`` is used instead of ``padding=True`` because the
    latter only pads to the longest text in the batch, so the array width could
    end up smaller than ``max_len`` and no longer match a model input declared
    with shape ``(max_len,)``.

    Args:
        texts: Sequence of strings exposing ``.tolist()`` (e.g. a pandas Series).
        tokenizer: Callable tokenizer returning a mapping with an
            ``'input_ids'`` entry (e.g. a Hugging Face tokenizer).
        max_len: Number of ids per encoded text.

    Returns:
        ``np.ndarray`` of shape ``(len(texts), max_len)`` holding the token ids.
    """
    encodings = tokenizer(
        texts.tolist(),
        truncation=True,
        padding='max_length',
        max_length=max_len,
    )
    return np.array(encodings['input_ids'])

encoded_sequences = encode_texts(df['excerpt_value_cleaned'], tokenizer, max_len)
# Use the width the tokenizer actually produced, so the model input always
# matches the data even if the encoded array is narrower than max_len.
seq_len = encoded_sequences.shape[1]

# Map each distinct emotion label to an integer id; label_encoder.classes_ is
# reused later for the confusion-matrix tick labels.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df['plutchik_emotion'])
num_classes = len(label_encoder.classes_)

# First carve out the test set, then split the remainder into train/validation.
X_train, X_test, y_train, y_test = train_test_split(encoded_sequences, labels, test_size=TEST_SIZE, random_state=RANDOM_STATE_INT)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=VALIDATE_SIZE, random_state=RANDOM_STATE_INT)

bert_model = TFBertModel.from_pretrained('bert-base-uncased')
# Read the encoder width and padding id from the loaded checkpoint/tokenizer
# instead of hard-coding them, so another BERT-style checkpoint can be swapped in.
hidden_size = bert_model.config.hidden_size
pad_token_id = tokenizer.pad_token_id

dropout_rate = 0.5


def bert_sequence_output(input_ids):
    """Return BERT's per-token hidden states for a batch of input ids.

    The attention mask is rebuilt from the ids (1 for real tokens, 0 for
    padding) so that BERT does not attend to padding positions; without it
    every position is treated as a real token.
    """
    attention_mask = tf.cast(tf.not_equal(input_ids, pad_token_id), tf.int32)
    # Index 0 of the model output is the sequence of last hidden states.
    return bert_model(input_ids, attention_mask=attention_mask)[0]


# BERT runs inside a Lambda layer as a feature extractor feeding the BiLSTM.
input_layer = Input(shape=(seq_len,), dtype=tf.int32, name='input_ids')
bert_outputs = Lambda(bert_sequence_output, output_shape=(seq_len, hidden_size))(input_layer)
bi_lstm = Bidirectional(LSTM(128, return_sequences=True))(bert_outputs)

# Query and value are both the BiLSTM output; the attended sequence is then
# averaged into one vector per sample.
attention = Attention()([bi_lstm, bi_lstm])
context_vector = GlobalAveragePooling1D()(attention)

# Classification head with dropout before and after the hidden layer.
dropout_layer = Dropout(dropout_rate)(context_vector)
dense_layer_1 = Dense(64, activation='relu')(dropout_layer)
dropout_layer_2 = Dropout(dropout_rate)(dense_layer_1)
output_layer = Dense(num_classes, activation='softmax')(dropout_layer_2)
model = Model(inputs=input_layer, outputs=output_layer)

# Labels are integer ids, hence the sparse variant of categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Stop once val_loss has not improved for 3 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=batch_size, epochs=epochs, callbacks=[early_stopping])

# Evaluate on the held-out test split.
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)

# zero_division=0 yields the same value scikit-learn falls back to (0) for
# classes without predictions/samples, but without emitting a warning per call.
accuracy = accuracy_score(y_test, y_pred_classes)
f1_w = f1_score(y_test, y_pred_classes, average='weighted', zero_division=0)
recall_w = recall_score(y_test, y_pred_classes, average='weighted', zero_division=0)
precision_w = precision_score(y_test, y_pred_classes, average='weighted', zero_division=0)
f1_m = f1_score(y_test, y_pred_classes, average='macro', zero_division=0)
recall_m = recall_score(y_test, y_pred_classes, average='macro', zero_division=0)
precision_m = precision_score(y_test, y_pred_classes, average='macro', zero_division=0)

print("Accuracy:", accuracy)
print("F1 Score (Weighted):", f1_w)
print("F1 Score (Macro):", f1_m)
print("Recall (Weighted):", recall_w)
print("Recall (Macro):", recall_m)
print("Precision (Weighted):", precision_w)
print("Precision (Macro):", precision_m)

# Pin the label set so the matrix always has one row/column per encoded class,
# even if a class is absent from both y_test and the predictions; otherwise the
# matrix could be smaller than the tick-label list used below.
cm = confusion_matrix(y_test, y_pred_classes, labels=np.arange(num_classes))
print(cm)

emotion_categories = label_encoder.classes_

plt.figure(figsize=(10,7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=emotion_categories, yticklabels=emotion_categories)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()
