In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

import custom_dataloader
import embeddings
import models

In [2]:
# Load the labelled sentences and encode the string labels as ordinal integers
# (negative < neutral < positive) so that an absolute-error metric is meaningful.
training_data = pd.read_csv('data/training.csv', index_col=0)
label_mapping = {'negative': -1, 'neutral': 0, 'positive': 1}
training_data['label_encoded'] = training_data['label'].map(label_mapping)

# Series.map yields NaN for any value that is not a key of the mapping. Fail
# fast here instead of letting NaN labels leak into the stratified split and
# the training targets.
unmapped_labels = training_data.loc[training_data['label_encoded'].isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected label values in data/training.csv: {unmapped_labels.tolist()}"
    )

sentences = training_data['sentence']
labels = training_data['label_encoded']

# 90/10 split, stratified on the encoded label so both parts keep the same
# class proportions; the fixed random_state makes the split repeatable.
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    sentences, labels, test_size=0.1, stratify=labels, random_state=42)

In [None]:
embedder = embeddings.BertTokenEmbedder('FacebookAI/roberta-base')  # example model

n_samples = 10  # set to -1 to get ALL samples
# Validation uses a tenth of the training sample count, but never fewer than
# one sample (n_samples < 10 would otherwise give an empty validation set).
n_val_samples = max(1, n_samples // 10) if n_samples != -1 else -1

# -1 is only a sentinel meaning "no limit". It must not be used as a slice
# bound: seq[:-1] silently drops the last element. None is the bound that
# keeps everything.
train_limit = None if n_samples == -1 else n_samples
val_limit = None if n_val_samples == -1 else n_val_samples

X_train = embedder.fit_transform(list(train_sentences)[:train_limit])
X_val = embedder.transform(list(val_sentences)[:val_limit])
# .iloc makes the positional slicing explicit, independent of the index labels
# that survive the train/validation split.
Y_train = np.array(train_labels.iloc[:train_limit])
Y_val = np.array(val_labels.iloc[:val_limit])

In [4]:
# Pair each feature set with its label array in the project's dataset wrapper.
make_dataset = custom_dataloader.EmbeddingDataset
dataset_train = make_dataset(X_train, Y_train)
dataset_val = make_dataset(X_val, Y_val)

In [5]:
# Both loaders batch with the project's collate function.
collate = custom_dataloader.collate_fn

# The order in which training items are drawn is delegated to the project's
# DynamicUnderSampler, built from the training labels with a fixed seed
# (presumably to rebalance classes -- confirm in custom_dataloader).
train_sampler = custom_dataloader.DynamicUnderSampler(Y_train, random_state=42)
train_loader = DataLoader(
    dataset_train,
    batch_size=8,
    sampler=train_sampler,
    collate_fn=collate,
)

# Validation data is read with the DataLoader defaults (no sampler, no shuffle)
# in larger batches.
val_loader = DataLoader(
    dataset_val,
    batch_size=64,
    collate_fn=collate,
)


In [6]:
# Run both loaders through the embedder's token-level precompute step; the
# validation loader is flagged with val=True.
precompute = embedder.precompute_embeddings_token_level
emb_train_loader = precompute(train_loader)
emb_val_loader = precompute(val_loader, val=True)

                                                                    

In [None]:
# Seed torch's global RNG so that re-running this cell starts from the same
# random state; 42 matches the random_state used for the split and the sampler.
torch.manual_seed(42)

# Peek at one training batch: the last axis of its first element is used as
# the embedding dimension the classifier expects.
input_dim = next(iter(emb_train_loader))[0].shape[-1]  # get embedding dimension

# Bidirectional-RNN classifier.
# NOTE: the TextCNN cell further down rebinds `model`, so later cells operate
# on whichever of the two model cells was executed last.
model = models.BiRNNClassifier(
    input_dim=input_dim,
    hidden_dim=256,
    num_layers=3,
    dropout=0.2,
    lr=0.0005
)

model.fit(emb_train_loader, emb_val_loader, epochs=5)

In [None]:
# Seed torch's global RNG so that re-running this cell starts from the same
# random state; 42 matches the random_state used for the split and the sampler.
torch.manual_seed(42)

# Peek at one training batch: the last axis of its first element is used as
# the embedding dimension the classifier expects.
input_dim = next(iter(emb_train_loader))[0].shape[-1]

# Text-CNN classifier with four kernel widths.
# NOTE: this rebinds `model`; if the BiRNN cell was run before, that model is
# discarded and later cells operate on this one.
model = models.TextCNNClassifier(
    input_dim=input_dim,
    lr=0.001,
    kernel_sizes=[2, 4, 6, 8],
    num_filters=128,
)
model.fit(emb_train_loader, emb_val_loader, epochs=5)

In [None]:
from sklearn.metrics import confusion_matrix, mean_absolute_error

# Evaluate whichever classifier is currently bound to `model` on the
# validation embeddings.
Y_val_pred = model.predict(emb_val_loader)

# L is a linear rescaling of the mean absolute error: L = (2 - MAE) / 2.
# With true labels in {-1, 0, 1} a perfect model scores 1.0 (assuming the
# predictions use the same label set -- confirm against models.predict).
mae_val = mean_absolute_error(y_true=Y_val, y_pred=Y_val_pred)
L_score_val = (2 - mae_val) * 0.5
print(f"Validation Score (L): {L_score_val:.5f}")

# Rows are true labels and columns are predicted labels, both ordered
# negative (-1), neutral (0), positive (1).
conf_matrix = confusion_matrix(
    y_true=Y_val,
    y_pred=Y_val_pred,
    labels=[-1, 0, 1],
)
print(conf_matrix)