In [1]:
%load_ext autoreload
%autoreload 2

import os
import random
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


from torch.utils.data import DataLoader, TensorDataset

from Hyperparameters.Embeddings.BertTokenEmbedder import BertTokenEmbedder
from Hyperparameters.Dataloader.EmbeddingDataset import EmbeddingDataset
from Hyperparameters.Dataloader.collate_fn import collate_fn
from Hyperparameters.Models.BertPreTrainedClassifier import BertPreTrainedClassifier
from Hyperparameters.Training.ActiveLearningLoop import active_learning_loop
from Hyperparameters.Training.ActiveLearningLoop import query_entropy

from Hyperparameters.Utils.Misc import get_device



Registering Model: SimpleModel (enforce_clean=False)
Git Info:
  User      : bbalfou <b_balfou@yahoo.com>
  Commit    : 6c892749524f2f9246e08a85a69b7ee6f8028838
  Branch    : bruce
  File link : https://github.com/bennellis/CIL_Sentiment_analysis/blob/6c892749524f2f9246e08a85a69b7ee6f8028838/Hyperparameters\Models\ModelDummy.py


In [2]:
# --- Experiment configuration ---------------------------------------------
# Pretrained checkpoint identifier and path of the labelled training CSV.
model_name = "FacebookAI/roberta-base"
csv_path = "data/Sentiment/training.csv"

# One seed for every source of randomness used by the notebook. Previously the
# value was only handed to train_test_split, leaving Python's `random`, NumPy
# and PyTorch unseeded, so repeated runs were not reproducible.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Values forwarded to the BertPreTrainedClassifier constructor further down.
lr = 1e-3
class_order = [0, 1, 2]
lr_top = 5e-5  # passed as pt_lr_top
lr_mid = 3e-5  # passed as pt_lr_mid
lr_bot = 2e-5  # passed as pt_lr_bot
dropout = 0.4  # NOTE(review): not referenced by any other cell shown in this notebook
temperature = 0.5
ce_weight = 0.1

In [3]:
# Load the labelled sentences; the first CSV column becomes the row index.
df = pd.read_csv(csv_path, index_col=0)

# Encode the string sentiment labels as integers.
# NOTE(review): these codes are -1/0/1 while the config cell sets
# class_order = [0, 1, 2] — confirm the classifier expects this encoding.
label_map = {'negative': -1, 'neutral': 0, 'positive': 1}
df['label_encoded'] = df['label'].map(label_map)

# Series.map yields NaN for every value that is not a key of label_map, which
# would otherwise flow silently into the split and the label array. Fail early
# and report the offending raw labels instead.
unmapped_labels = df.loc[df['label_encoded'].isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected label values in {csv_path}: {unmapped_labels.tolist()}"
    )

In [4]:

# Hold out 10% of the rows, stratified on the encoded label and seeded with
# the notebook-wide `seed`.
sentence_col = df['sentence']
label_col = df['label_encoded']
train_texts, val_texts, train_labels, val_labels = train_test_split(
    sentence_col,
    label_col,
    test_size=0.1,
    random_state=seed,
    stratify=label_col,
)

In [5]:
# Encoded labels for every row of the frame, as a NumPy array.
labels = df['label_encoded'].to_numpy()

# Build the embedder for the configured checkpoint and run fit_transform over
# all sentences of the frame (the train/validation split is not used here).
embedder = BertTokenEmbedder(model_name)
all_sentences = df['sentence'].to_list()
features = embedder.fit_transform(all_sentences)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing batches:   0%|          | 0/3191 [00:00<?, ?it/s]

In [6]:


# Build (or reload from disk) the embedded dataset for the configured model.
# Only embedders reporting `is_variable_length` are handled by this cell.
if not embedder.is_variable_length:
    raise NotImplementedError(
        f"Embedder for '{model_name}' is not variable-length; this notebook "
        "only implements the cached variable-length embedding path."
    )

feature_dataset = EmbeddingDataset(features, labels)

# The file name is the sanitised model name joined directly to
# "emb_dataset.pt" (no separator). It is kept exactly as before so that cache
# files written by earlier runs are still found.
cache_name = model_name.replace("/", "_")
cache_path = "cache/" + cache_name
emb_dataset_path = cache_path + "emb_dataset.pt"

if os.path.exists(emb_dataset_path):
    # weights_only=False unpickles arbitrary Python objects: only load cache
    # files that were produced by this notebook, never files from elsewhere.
    embedded_feature_dataset = torch.load(emb_dataset_path, weights_only=False)
else:
    feature_dataloader = DataLoader(feature_dataset, batch_size=8, collate_fn=collate_fn)
    embedded_feature_dataset = embedder.embed_dataset(feature_dataloader)
    # Create the directory that actually contains the cache file rather than
    # a separately hard-coded name.
    os.makedirs(os.path.dirname(emb_dataset_path), exist_ok=True)
    torch.save(embedded_feature_dataset, emb_dataset_path)

In [7]:
# Keyword arguments for the classifier, collected in one place so they can be
# inspected before the model is built.
# NOTE(review): `dropout` from the config cell is not forwarded here — confirm
# whether that is intentional.
classifier_kwargs = {
    "model_name": model_name,
    "lr": lr,
    "pt_lr_bot": lr_bot,
    "pt_lr_mid": lr_mid,
    "pt_lr_top": lr_top,
    "class_order": class_order,
    "ce_weight": ce_weight,
    "temperature": temperature,
    "frozen": True,
    "custom_ll": True,
}
model = BertPreTrainedClassifier(**classifier_kwargs)

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:

# Run the active-learning loop on the embedded dataset.
# `active_learning_loop` and `query_entropy` are already imported in the
# notebook's first cell, so the duplicate in-cell imports were dropped.
# `query_entropy` is passed as the fourth positional argument (presumably the
# query strategy — confirm against active_learning_loop's signature).
# NOTE(review): the recorded run of this cell stopped with `KeyError: 0` while
# training in round 1; the cause is not visible from this notebook.
active_learning_loop(
    model,
    get_device(),
    embedded_feature_dataset,
    query_entropy,
    max_rounds=1000,
    query_batch_size=1000,
    train_epochs_per_round=3,
    initial_label_count=1000,
    val_split=0.2,
    batch_size=16,
)

[INFO] Using device: NVIDIA GeForce RTX 3080

🔁 Round 1/1000 — Labeled: 1000
Training BertPreTrainedClassifier on cuda


Training:   0%|          | 0/32 [00:00<?, ?batch/s]

KeyError: 0