In [None]:
!pip install torch 

In [None]:
!pip install nltk

In [None]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import recall_score, precision_score, f1_score

import torch 
from torch import nn
from torch import optim

import datetime

In [2]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import ngrams
from sklearn import metrics
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
import nltk

# NLTK resources fetched for this notebook.
for resource in ("wordnet", "punkt", "omw-1.4"):
    nltk.download(resource)
# Left as the cell's final expression so its return value is still displayed.
nltk.download("stopwords")

In [None]:
# Load the training and validation data
TRAIN_CSV = '../public_data/train/track_a/eng.csv'
VAL_CSV = '../public_data/dev/track_a/eng_a.csv'

train, val = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, VAL_CSV))

# Quick look at the first training rows
train.head()

In [5]:
def pre_process(text, punctuation="remove", normalizer="lemmatize", max_ngram=2, remove_stop_words=False):
    """
    Normalise one line of text and return it as a single space-joined string.

    Pipeline: handle punctuation -> lower-case and split on whitespace ->
    lemmatize or stem -> append n-grams -> optionally drop stop words.
    Called with only `text`, it removes punctuation, lemmatizes and appends bigrams.

    Parameters:
    text (string): a line of text (assume sentence segmentation has already been done)
    punctuation (string): "remove" (default) deletes punctuation that touches a word
        character, "separate" inserts a space between the two instead.
    normalizer (string or None): "lemmatize" (default) applies WordNetLemmatizer,
        "stem" applies PorterStemmer, None leaves the tokens untouched.
    max_ngram (int): largest n-gram size appended after the unigrams. The default 2
        appends bigrams only; any value below 2 appends nothing.
    remove_stop_words (bool): when True, English stop words are dropped from the
        unigrams (n-grams are built beforehand and keep them). Default False.

    Returns:
    string: the unigrams followed by the requested n-grams, joined with single
        spaces. Note that this is one string, not a list of tokens.

    Raises:
    ValueError: if punctuation or normalizer is not one of the values listed above.
    """
    if punctuation not in ("remove", "separate"):
        raise ValueError(f"punctuation must be 'remove' or 'separate', got {punctuation!r}")
    if normalizer not in ("lemmatize", "stem", None):
        raise ValueError(f"normalizer must be 'lemmatize', 'stem' or None, got {normalizer!r}")

    # The "+" makes each pattern swallow a whole run of punctuation ("!!!", ")."),
    # not only the single character that touches the word.
    punct_after_word = r"(\w)([.,;:!?'\"”\)]+)"
    punct_before_word = r"([.,;:!?'\"“\(\)]+)(\w)"

    if punctuation == "remove":
        text = re.sub(punct_after_word, r"\1", text)
        text = re.sub(punct_before_word, r"\2", text)
    else:
        text = re.sub(punct_after_word, r"\1 \2", text)
        text = re.sub(punct_before_word, r"\1 \2", text)

    # str.split() without arguments never yields empty tokens, even when the
    # text starts or ends with whitespace.
    tokens = [t.lower() for t in text.split()]

    if normalizer == "lemmatize":
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
    elif normalizer == "stem":
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(t) for t in tokens]

    # Every n-gram size is built from the unigrams only, never from a list that
    # already contains shorter n-grams.
    grams = []
    for n in range(2, max_ngram + 1):
        grams.extend(" ".join(gram) for gram in ngrams(tokens, n))

    if remove_stop_words:
        stop_words = set(stopwords.words('english'))
        tokens = [w for w in tokens if w not in stop_words]

    return " ".join(tokens + grams)

In [None]:
# Spot-check the preprocessing on a few rows instead of dumping (and
# re-processing) the whole training set into the cell output.
[pre_process(text) for text in train["text"].head()]

In [None]:
# Bag-of-words features. Both splits must go through the SAME preprocessing:
# otherwise the validation documents are tokenised differently from the
# documents the vocabulary (and the model) were fitted on.
vectorizer = CountVectorizer(ngram_range=(1,2))
X_train = vectorizer.fit_transform([pre_process(i) for i in train["text"]]).toarray()
X_val = vectorizer.transform([pre_process(i) for i in val["text"]]).toarray()

# Column order of the label matrices; the model's output units follow this order.
emotions = ['Joy','Sadness','Surprise','Fear','Anger']
y_train = train[emotions].values
y_val = val[emotions].values

val.head()

In [8]:
# Wrap the numpy feature / label matrices as torch tensors (same constructor for all four).
X_train_t, y_train_t, X_val_t, y_val_t = map(
    torch.Tensor, (X_train, y_train, X_val, y_val)
)

In [None]:
# Size of the design matrix and label matrix
print(f'Shape of X: {X_train.shape}')
print(f'Shape of y: {y_train.shape}')

# Positive count per emotion, with its share of the training rows
print(f'Number of positives per emotion class:')
for e, v in zip(emotions, y_train.sum(axis=0)):
    print(f' - {e}: {v} ({round(100*v/len(y_train))}%)')

In [10]:
# Fix the RNG so the weight initialisation (and the dropout masks drawn while
# training) are identical on every Restart & Run All.
torch.manual_seed(42)

# One-hidden-layer MLP: bag-of-words counts -> 100 ReLU units -> one logit per emotion.
model = nn.Sequential(
          nn.Linear(X_train.shape[1], 100),
          nn.ReLU(),
          nn.Dropout(0.3),
          nn.Linear(100, y_train.shape[1])
        )

In [11]:
# Multi-label objective: binary cross-entropy computed directly on the logits.
criterion = nn.BCEWithLogitsLoss()
# Plain SGD; weight_decay adds an L2 penalty on the parameters.
optimizer = optim.SGD(params=model.parameters(), lr=0.1, weight_decay=0.01)

In [None]:
# Full-batch training: each epoch is one gradient step over the whole training set.
for epoch in range(1000):
    optimizer.zero_grad()                  # drop gradients from the previous step
    output = model(X_train_t)              # logits, one column per emotion
    loss = criterion(output, y_train_t)
    loss.backward()
    optimizer.step()
    if epoch % 100:
        continue                           # report only every 100th epoch
    print(f'Epoch {epoch}: Loss: {round(loss.item(),3)}')

In [13]:
def get_predictions(X_val, model, threshold=0.5):
    """
    Turn the model's logits for X_val into boolean multi-label predictions.

    Parameters:
    X_val (torch.Tensor): feature rows to score.
    model (nn.Module): network that maps X_val to one logit per label.
    threshold (float): a label is predicted when its sigmoid probability is
        strictly greater than this value.

    Returns:
    numpy.ndarray: boolean array with the same shape as the model output.
    """
    # Inference has to run in eval mode: in training mode Dropout keeps zeroing
    # random units, so the predictions would change from call to call. The
    # previous mode is restored afterwards so a model that is still being
    # trained is handed back exactly as it came in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():  # scoring needs no autograd graph
            yhat = torch.sigmoid(model(X_val)).numpy()
    finally:
        model.train(was_training)

    return yhat > threshold

In [None]:
y_pred = get_predictions(X_val_t, model, 0.45)

# The columns of y_pred follow `emotions` (the order used to build y_train), NOT
# the alphabetical order of the submission file. Label them with the real order
# first and reorder afterwards; labelling them alphabetically would silently
# put every prediction under the wrong emotion.
val_data_with_pred = pd.DataFrame(y_pred, columns=emotions).astype(int)

# .values: copy the ids positionally, independent of either frame's index
val_data_with_pred['id'] = val['id'].values

val_data_with_pred = val_data_with_pred[['id', 'Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']]

# Save to CSV. The timestamp avoids ':' and spaces, which are not valid in
# file names on every platform.
current_time = datetime.datetime.now()
formatted_time = current_time.strftime('%Y-%m-%d_%H-%M-%S')

val_data_with_pred.to_csv(f'../results/pred_eng_a_{formatted_time}.csv', index=False)

val_data_with_pred.head()

In [15]:
def evaluate(y_val, y_pred):
    """
    Print micro- and macro-averaged recall, precision and F1.

    Parameters:
    y_val: true labels, passed straight to the sklearn scorers.
    y_pred: predicted labels, same layout as y_val.

    Returns:
    None: one summary line is printed per averaging mode.
    """
    scorers = (recall_score, precision_score, f1_score)
    for average in ('micro', 'macro'):
        # zero_division=0: an undefined ratio is scored as 0 rather than warned about
        recall, precision, f1 = (
            scorer(y_val, y_pred, average=average, zero_division=0) for scorer in scorers
        )

        print(f'{average.upper()} recall: {round(recall, 4)}, precision: {round(precision, 4)}, f1: {round(f1, 4)}')

In [16]:
# evaluate(y_val, y_pred) EVALS WON'T WORK HERE 

In [17]:
def evaluate_per_class(y_val, y_pred):
    """
    Print recall, precision and F1 separately for every entry of the global `emotions` list.

    Column i of both arrays is scored under the name emotions[i].

    Parameters:
    y_val: 2-D array of true labels, indexed as [:, i].
    y_pred: 2-D array of predicted labels, same layout as y_val.

    Returns:
    None: a header line and a score line are printed per emotion.
    """
    scorers = (recall_score, precision_score, f1_score)
    for column, emotion in enumerate(emotions):
        print(f'*** {emotion} ***')

        truth = y_val[:, column]
        guess = y_pred[:, column]
        recall, precision, f1 = (scorer(truth, guess, zero_division=0) for scorer in scorers)

        print(f'recall: {round(recall, 4)}, precision: {round(precision, 4)}, f1: {round(f1, 4)}\n')

In [18]:
# evaluate_per_class(y_val, y_pred) EVALS WON'T WORK HERE 

In [19]:
# Share of all positive labels that belongs to each emotion
class_share = y_train.sum(axis=0) / y_train.sum()
# Inverse-frequency weights: the most frequent emotion gets 1, rarer ones more
weights = class_share.max() / class_share

In [None]:
# Fix the RNG so the weight initialisation and the dropout masks drawn while
# training are identical on every run of this cell.
torch.manual_seed(42)

# Define model 
model = nn.Sequential(
          nn.Linear(X_train.shape[1], 100),
          nn.ReLU(),
          nn.Dropout(0.3),
          nn.Linear(100, y_train.shape[1])
        )

# Define training parameters
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.Tensor(weights)) # <-- per-emotion weights applied to the positive term of the loss
optimizer = optim.SGD(model.parameters(), lr=1e-1, weight_decay=1e-2)

# Train for a number of epochs
for epoch in range(1000):
    optimizer.zero_grad()
    output = model(X_train_t)
    loss = criterion(output, y_train_t)
    loss.backward()
    optimizer.step()
    if epoch % 100 == 0:
        print(f'Epoch {epoch}: Loss: {round(loss.item(),3)}')

# Get predictions
y_pred = get_predictions(X_val_t, model, 0.45)

# The columns of y_pred follow `emotions` (the order used to build y_train), NOT
# the alphabetical order of the submission file. Label them with the real order
# first and reorder afterwards; labelling them alphabetically would silently
# put every prediction under the wrong emotion.
val_data_with_pred = pd.DataFrame(y_pred, columns=emotions).astype(int)

# .values: copy the ids positionally, independent of either frame's index
val_data_with_pred['id'] = val['id'].values

val_data_with_pred = val_data_with_pred[['id', 'Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']]

# Save to CSV. The timestamp avoids ':' and spaces, which are not valid in
# file names on every platform.
current_time = datetime.datetime.now()
formatted_time = current_time.strftime('%Y-%m-%d_%H-%M-%S')

val_data_with_pred.to_csv(f'../results/pred_eng_a_{formatted_time}.csv', index=False)

val_data_with_pred.head()

# Evaluate: EVALS WON'T WORK HERE 
# print('\n\nEVALUATION\n')
# evaluate(y_val, y_pred)

# print('\nPER CLASS BREAKDOWN\n')
# evaluate_per_class(y_val, y_pred)

In [None]:
# Display the prediction frame (id + one 0/1 column per emotion) built above.
val_data_with_pred