In [1]:
import datetime
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from transformers import BertTokenizer
from nltk.util import ngrams
import spacy
import re

In [2]:
# Load Data
train = pd.read_csv('../public_data/train/track_a/eng.csv')
val = pd.read_csv('../public_data/dev/track_a/eng_a.csv')
emotions = ['Joy', 'Sadness', 'Surprise', 'Fear', 'Anger']
emolex_path = "../EmoLex/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"

In [3]:
# Load EmoLex Lexicon
def load_emolex(emolex_path):
    emolex = pd.read_csv(emolex_path, sep='\t', header=None, names=["Word", "Emotion", "Association"])
    emotion_dict = {}
    for _, row in emolex.iterrows():
        if row["Association"] == 1:
            word = row["Word"]
            emotion = row["Emotion"]
            if word not in emotion_dict:
                emotion_dict[word] = []
            emotion_dict[word].append(emotion)
    return emotion_dict

emotion_dict = load_emolex(emolex_path)

# Preprocessing Config
config = {'sep_pn': True, 'rm_pn': False, 'apply_lemmatization': True, 'apply_stemming': True, 'add_bigrams': True, 'rm_sw': False}

# Preprocessing Functions
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
nlp = spacy.load("en_core_web_sm")

In [4]:
def pre_process(text, config, target_emotion=None, emotion_dict=None):
    def separate_punctuation(text):
        text = re.sub(r"(\w)([.,;:!?\'\"”\)])", r"\1 \2", text)
        text = re.sub(r"([.,;:!?\'\"“\(\)])(\w)", r"\1 \2", text)
        return text

    def remove_punctuation(text):
        text = re.sub(r"[.,;:!?\'\"“”\(\)]", "", text)
        return text

    def tokenize_text(text):
        encoded_input = tokenizer(text, return_tensors='pt', add_special_tokens=True)
        tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][0])
        return tokens

    def apply_stemming(tokens):
        stemmer = PorterStemmer()
        return [stemmer.stem(token) for token in tokens]

    def apply_lemmatization(tokens):
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(token) for token in tokens]

    def generate_ngrams_from_tokens(tokens, n):
        return [" ".join(gram) for gram in ngrams(tokens, n)]

    # Apply config options
    if config['sep_pn'] and not config['rm_pn']:
        text = separate_punctuation(text)
    if config['rm_pn'] and not config['sep_pn']:
        text = remove_punctuation(text)

    tokens = tokenize_text(text)
    if config['apply_stemming']:
        tokens = apply_stemming(tokens)
    if config['apply_lemmatization']:
        tokens = apply_lemmatization(tokens)
    if config['add_bigrams']:
        tokens += generate_ngrams_from_tokens(tokens, 2)
    if config['rm_sw']:
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word.lower() not in stop_words]

    processed_text = " ".join(tokens)

    if target_emotion and emotion_dict:
        relevant_keywords = [word for word in tokens if target_emotion in emotion_dict.get(word, [])]
        if relevant_keywords:
            processed_text += f" [SEP] {' '.join(relevant_keywords)}"
        else:
            processed_text += " [SEP]"

    return processed_text

# Preprocess and Extract Features
vectorizer = CountVectorizer()

def preprocess_dataset_with_emotions(dataset, emotions, config, emotion_dict):
    augmented_data = {}
    for emotion in emotions:
        augmented_data[emotion] = [
            pre_process(text, config, target_emotion=emotion, emotion_dict=emotion_dict)
            for text in dataset
        ]
    return augmented_data

train_augmented = preprocess_dataset_with_emotions(train["text"], emotions, config, emotion_dict)
val_augmented = preprocess_dataset_with_emotions(val["text"], emotions, config, emotion_dict)

X_train = {emotion: vectorizer.fit_transform(train_augmented[emotion]).toarray() for emotion in emotions}
X_val = {emotion: vectorizer.transform(val_augmented[emotion]).toarray() for emotion in emotions}

# POS Tagging
def extract_pos_tags(texts):
    return [[token.pos_ for token in nlp(text)] for text in texts]

train_pos_tags = extract_pos_tags(train["text"])
val_pos_tags = extract_pos_tags(val["text"])

# POS Encoding
max_length = max(max(len(tags) for tags in train_pos_tags), max(len(tags) for tags in val_pos_tags))
train_pos_tags = [tags + ['PAD'] * (max_length - len(tags)) for tags in train_pos_tags]
val_pos_tags = [tags + ['PAD'] * (max_length - len(tags)) for tags in val_pos_tags]

encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
train_pos_encoded = encoder.fit_transform(train_pos_tags)
val_pos_encoded = encoder.transform(val_pos_tags)

# Combine Features
combined_features = {
    emotion: np.concatenate((X_train[emotion], train_pos_encoded), axis=1) for emotion in emotions
}
validation_combined_features = {
    emotion: np.concatenate((X_val[emotion], val_pos_encoded), axis=1) for emotion in emotions
}

# Logistic Regression for Enhanced Features
y_train = train[emotions].values
lr_models = {}
lr_features = {}
val_lr_features = {}

for emotion in emotions:
    lr = LogisticRegression(max_iter=1000)
    lr.fit(combined_features[emotion], y_train[:, emotions.index(emotion)])
    lr_models[emotion] = lr
    lr_features[emotion] = lr.predict_proba(combined_features[emotion])
    val_lr_features[emotion] = lr.predict_proba(validation_combined_features[emotion])

final_train_features = {
    emotion: np.concatenate((combined_features[emotion], lr_features[emotion]), axis=1) for emotion in emotions
}
final_val_features = {
    emotion: np.concatenate((validation_combined_features[emotion], val_lr_features[emotion]), axis=1) for emotion in emotions
}

In [5]:
# Neural Network
model = nn.Sequential(
    nn.Linear(final_train_features[emotions[0]].shape[1], 128),
    nn.BatchNorm1d(128),
    nn.ReLU(),
    nn.Dropout(0.4),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 1)
)

In [6]:
# DataLoader
for emotion in emotions:
    features_tensor = torch.tensor(final_train_features[emotion], dtype=torch.float32)
    labels_tensor = torch.tensor(y_train[:, emotions.index(emotion)], dtype=torch.float32).unsqueeze(1)
    dataset = TensorDataset(features_tensor, labels_tensor)
    data_loader = DataLoader(dataset, batch_size=16, shuffle=True)

    # Calculate class weights
    class_count = y_train[:, emotions.index(emotion)].sum()
    total_count = y_train.shape[0]
    weights = total_count / class_count

    # Loss and Optimizer
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([weights], dtype=torch.float32))
    optimizer = optim.SGD(model.parameters(), lr=1e-4, weight_decay=1e-4)

    # Training Loop
    losses = []
    for epoch in tqdm(range(51), desc=f"Training Loop ({emotion})"):
        for features, labels in data_loader:
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        if epoch % 10 == 0:
            print(f"Epoch {epoch} ({emotion}): Loss: {round(loss.item(),3)}")
            torch.save(model.state_dict(), f'./15-1-25/{emotion}_net_epoch_{epoch}.pth')
            losses.append(round(loss.item(),3))
        if epoch == 50:
            print(f"Epoch {epoch} ({emotion}): Loss: {round(loss.item(),3)}")
            torch.save(model.state_dict(), f'./15-1-25/{emotion}_net_epoch_{epoch}.pth')
            losses.append(round(loss.item(),3))

Training Loop (Joy):   2%|▏         | 1/51 [00:04<03:32,  4.25s/it]

Epoch 0 (Joy): Loss: 0.815


Training Loop (Joy):  22%|██▏       | 11/51 [00:34<01:58,  2.96s/it]

Epoch 10 (Joy): Loss: 0.958


Training Loop (Joy):  41%|████      | 21/51 [01:03<01:29,  2.98s/it]

Epoch 20 (Joy): Loss: 1.356


Training Loop (Joy):  61%|██████    | 31/51 [01:36<01:07,  3.39s/it]

Epoch 30 (Joy): Loss: 1.014


Training Loop (Joy):  80%|████████  | 41/51 [01:58<00:20,  2.07s/it]

Epoch 40 (Joy): Loss: 1.043


Training Loop (Joy): 100%|██████████| 51/51 [02:18<00:00,  2.71s/it]


Epoch 50 (Joy): Loss: 0.665
Epoch 50 (Joy): Loss: 0.665


Training Loop (Sadness):   2%|▏         | 1/51 [00:02<01:44,  2.08s/it]

Epoch 0 (Sadness): Loss: 0.761


Training Loop (Sadness):  22%|██▏       | 11/51 [00:24<01:41,  2.53s/it]

Epoch 10 (Sadness): Loss: 0.502


Training Loop (Sadness):  41%|████      | 21/51 [00:48<01:04,  2.14s/it]

Epoch 20 (Sadness): Loss: 0.404


Training Loop (Sadness):  61%|██████    | 31/51 [01:09<00:40,  2.02s/it]

Epoch 30 (Sadness): Loss: 0.31


Training Loop (Sadness):  80%|████████  | 41/51 [01:29<00:21,  2.14s/it]

Epoch 40 (Sadness): Loss: 0.219


Training Loop (Sadness): 100%|██████████| 51/51 [01:49<00:00,  2.16s/it]


Epoch 50 (Sadness): Loss: 0.457
Epoch 50 (Sadness): Loss: 0.457


Training Loop (Surprise):   2%|▏         | 1/51 [00:02<01:42,  2.05s/it]

Epoch 0 (Surprise): Loss: 0.187


Training Loop (Surprise):  22%|██▏       | 11/51 [00:31<01:59,  3.00s/it]

Epoch 10 (Surprise): Loss: 0.219


Training Loop (Surprise):  41%|████      | 21/51 [00:50<01:00,  2.02s/it]

Epoch 20 (Surprise): Loss: 0.117


Training Loop (Surprise):  61%|██████    | 31/51 [01:15<00:55,  2.78s/it]

Epoch 30 (Surprise): Loss: 0.128


Training Loop (Surprise):  80%|████████  | 41/51 [01:45<00:32,  3.24s/it]

Epoch 40 (Surprise): Loss: 0.101


Training Loop (Surprise): 100%|██████████| 51/51 [02:05<00:00,  2.46s/it]


Epoch 50 (Surprise): Loss: 0.067
Epoch 50 (Surprise): Loss: 0.067


Training Loop (Fear):   2%|▏         | 1/51 [00:01<01:35,  1.91s/it]

Epoch 0 (Fear): Loss: 0.072


Training Loop (Fear):  22%|██▏       | 11/51 [00:21<01:17,  1.95s/it]

Epoch 10 (Fear): Loss: 0.11


Training Loop (Fear):  41%|████      | 21/51 [00:41<01:01,  2.06s/it]

Epoch 20 (Fear): Loss: 0.101


Training Loop (Fear):  61%|██████    | 31/51 [01:01<00:39,  1.98s/it]

Epoch 30 (Fear): Loss: 0.075


Training Loop (Fear):  80%|████████  | 41/51 [01:29<00:30,  3.07s/it]

Epoch 40 (Fear): Loss: 0.068


Training Loop (Fear): 100%|██████████| 51/51 [02:01<00:00,  2.38s/it]


Epoch 50 (Fear): Loss: 0.17
Epoch 50 (Fear): Loss: 0.17


Training Loop (Anger):   2%|▏         | 1/51 [00:03<02:33,  3.07s/it]

Epoch 0 (Anger): Loss: 0.138


Training Loop (Anger):  22%|██▏       | 11/51 [00:33<02:00,  3.01s/it]

Epoch 10 (Anger): Loss: 0.053


Training Loop (Anger):  41%|████      | 21/51 [01:05<01:36,  3.22s/it]

Epoch 20 (Anger): Loss: 0.051


Training Loop (Anger):  61%|██████    | 31/51 [01:38<01:06,  3.34s/it]

Epoch 30 (Anger): Loss: 0.087


Training Loop (Anger):  80%|████████  | 41/51 [02:13<00:34,  3.48s/it]

Epoch 40 (Anger): Loss: 0.02


Training Loop (Anger): 100%|██████████| 51/51 [02:45<00:00,  3.24s/it]

Epoch 50 (Anger): Loss: 0.05
Epoch 50 (Anger): Loss: 0.05





In [8]:
def get_predictions(X_val, model, threshold=0.5):
    sig = nn.Sigmoid()
    yhat = sig(model(X_val)).detach().numpy()
    y_pred = yhat > threshold
    return y_pred

for emotion in emotions:
    for i in range(5):
        epoch = i*10
        model.load_state_dict(torch.load(f'./15-1-25/{emotion}_net_epoch_{epoch}.pth', weights_only=True))
        y_pred = get_predictions(torch.Tensor(final_val_features[emotion]), model, 0.45)

        val_data_with_pred = pd.DataFrame(y_pred, columns=[emotion])
        val_data_with_pred["id"] = val["id"]

        val_data_with_pred = val_data_with_pred[["id", emotion]]

        current_time = datetime.datetime.now()
        formatted_time = current_time.strftime('%Y-%m-%d_%H_%M_%S')

        val_data_with_pred.to_csv(f'../results/alt4_emolex/{emotion}_epoch_{epoch}_pred_eng_a_{formatted_time}.csv', index=False)

        print(val_data_with_pred)

                        id    Joy
0    eng_dev_track_a_00001  False
1    eng_dev_track_a_00002   True
2    eng_dev_track_a_00003  False
3    eng_dev_track_a_00004   True
4    eng_dev_track_a_00005   True
..                     ...    ...
111  eng_dev_track_a_00112   True
112  eng_dev_track_a_00113   True
113  eng_dev_track_a_00114   True
114  eng_dev_track_a_00115   True
115  eng_dev_track_a_00116   True

[116 rows x 2 columns]
                        id   Joy
0    eng_dev_track_a_00001  True
1    eng_dev_track_a_00002  True
2    eng_dev_track_a_00003  True
3    eng_dev_track_a_00004  True
4    eng_dev_track_a_00005  True
..                     ...   ...
111  eng_dev_track_a_00112  True
112  eng_dev_track_a_00113  True
113  eng_dev_track_a_00114  True
114  eng_dev_track_a_00115  True
115  eng_dev_track_a_00116  True

[116 rows x 2 columns]
                        id   Joy
0    eng_dev_track_a_00001  True
1    eng_dev_track_a_00002  True
2    eng_dev_track_a_00003  True
3    eng_dev_tra