In [26]:
import json
from datasets import Dataset
import numpy as np
from transformers import BertTokenizer
from datasets import DatasetDict
import torch
from torch import nn
from transformers import BertModel
from transformers import TrainingArguments, Trainer
from typing_extensions import final


In [27]:
def train_test_split(inputs,test_size,seed = 0):
    """
    Splits the data into training and test sets.
    Return 4 numpy arrays. X_train, X_test, Y_train, Y_test
    where training data is test_size proportion of data provided.

    Args:
        inputs [np.array] : numpy array of input data
        outputs [np.array]: numpy array of output labels
        test_size [float]: proportion of data to be used as test data. e.g. 0.2 means 20% of data is used for training.
        seed [int]: A seed to create random number generator. (For reproducability)
    """
    rng = np.random.default_rng(seed)
    assert(test_size <= 1.0)
    assert(test_size >= 0.0)
    num_samples = len(inputs)
    num_train = int(num_samples * (1.0 - test_size))
    # Create index using np.arange
    # Shuffle the dataset using rng.shuffle
    # Use help function to understand rng.shuffle
    # Split the dataset into train, test
    # BEGIN SOLUTION

    index = np.arange(0,num_samples,1)
    rng.shuffle(index)
    num_cols = int(num_samples*test_size) #gives u how many cols to be used in the training.
    index_training = index[:num_cols]
    index_testing = index[num_cols:]

    training = [inputs[i] for i in index_training]
    testing = [inputs[i] for i in index_testing]


    # END SOLUTION
    return training, testing

In [28]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["Text"], padding="max_length", truncation=True)

In [29]:
class BertForVA(nn.Module):
    def __init__(self, base="bert-base-uncased"):
        super().__init__()
        self.bert = BertModel.from_pretrained(base)
        self.regressor = nn.Linear(self.bert.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask, labels=None):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = out.pooler_output
        preds = self.regressor(pooled)
        loss = None
        if labels is not None:
            loss = nn.MSELoss()(preds, labels)
        return {"loss": loss, "logits": preds}   # <- important key name


In [30]:
def collate_fn(batch):
    input_ids = torch.stack([item["input_ids"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])
    labels = torch.tensor([[item["V"], item["A"]] for item in batch], dtype=torch.float)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [31]:
data = []
with open("//content//sample_data//eng_laptop_train_alltasks.jsonl", "r") as f:
    for line in f:
        obj = json.loads(line)
        data.append(obj)

final_data = []

for i in data:
  temp ={}
  temp["Text"] = i['Text']
  temp_va = i["Quadruplet"][0]["VA"].split("#")
  temp["V"] = float(temp_va[0])
  temp["A"] = float(temp_va[1])
  final_data.append(temp)

# training_data, testing_data = train_test_split(final_data,0.8,seed = 0)

dataset = Dataset.from_list(final_data)
print(dataset)



def add_labels(batch):
    # batch["V"] and batch["A"] are lists/arrays in batched mode
    labels = np.stack([batch["V"], batch["A"]], axis=1)  # shape (bs, 2)
    batch["labels"] = labels.tolist()
    return batch

dataset = dataset.map(add_labels, batched=True)

# keep only the columns the model/Trainer expects
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


dataset = dataset.train_test_split(test_size=0.2, seed=42)  # 80% train, 20% test
dataset = DatasetDict({
    "train": dataset["train"],
    "validation": dataset["test"]
})

print(dataset["train"].features)

# 2) TrainingArguments: keep as-is, no special flags needed
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=5e-5,
    eval_strategy="epoch",   # you’re on 4.56, so eval_strategy is correct
    save_strategy="epoch",
    report_to="none",        # avoid W&B prompt
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # remove_unused_columns can stay default True
)


model = BertForVA()

# 3) Trainer: remove data_collator (the default works now)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    # no data_collator needed
)



trainer.train()

results = trainer.evaluate()
print(results)

preds = trainer.predict(dataset["validation"])
# print(preds.predictions[:5])  # first 5 predictions

for i in range(5):  # first 5 examples
    print(f"Text: {dataset['validation'][i]['Text']}")
    print(f"Predicted: {preds.predictions[i]}")
    print(f"Actual:    {preds.label_ids[i]}")
    print("-" * 50)







Dataset({
    features: ['Text', 'V', 'A'],
    num_rows: 4076
})


Map:   0%|          | 0/4076 [00:00<?, ? examples/s]

ValueError: Columns ['attention_mask', 'input_ids'] not in the dataset. Current columns in the dataset: ['Text', 'V', 'A', 'labels']