'''
Author:
        
        PARK, JunHo, junho@ccnets.org

        
        KIM, JoengYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.
'''

In [1]:
import sys
path_append = "../"
sys.path.append(path_append)  # Go up one directory from where you are.

from nn.utils.init import set_random_seed
set_random_seed(0)

In [2]:
# https://www.kaggle.com/datasets/tarkkaanko/amazon/

import pandas as pd
df = pd.read_csv(path_append + '../data/Amazon reviews/amazon_reviews.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
0,0,,4.0,No issues.,2014-07-23,138,0,0,0,0,0.0,0.0
1,1,0mie,5.0,"Purchased this for my device, it worked as adv...",2013-10-25,409,0,0,0,0,0.0,0.0
2,2,1K3,4.0,it works as expected. I should have sprung for...,2012-12-23,715,0,0,0,0,0.0,0.0
3,3,1m2,5.0,This think has worked out great.Had a diff. br...,2013-11-21,382,0,0,0,0,0.0,0.0
4,4,2&amp;1/2Men,5.0,"Bought it with Retail Packaging, arrived legit...",2013-07-13,513,0,0,0,0,0.0,0.0


In [3]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel

class PretrainedModelDataset(Dataset):
    def __init__(self, df, tokenizer, model, num_classes, device, max_length=512, precompute_batches=64, **kwargs):
        self.df = df.copy()
        self.df['overall'] -= 1  # Shift labels to 0-4 range
        self.model = model
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.num_classes = num_classes
        self.device = device
        self.precompute_batches = precompute_batches
        self.X_cache = None
        self.y_cache = None
        self.total_iters = 0
        self.dataset_length = len(self.df)
        # Efficiently select a random subset of indices
        self.batch_indices = torch.randperm(len(self.df))
        self._precompute_batches(0)
        
    def _precompute_batches(self, start_idx):
        end_idx = min(start_idx + self.precompute_batches, self.dataset_length)
        batch_indices = self.batch_indices[start_idx:end_idx].tolist()

        # Gather batch data
        X_batch = [self.df.iloc[i]["reviewText"] for i in batch_indices]
        y_batch = [self.df.iloc[i]["overall"] for i in batch_indices]

        # Ensure all elements in X_batch are strings
        X_batch = [str(x) for x in X_batch]

        # Ensure y_batch contains valid numeric values
        y_batch = [float(y) for y in y_batch]
        
        # Tokenize the batch
        X = self.tokenizer(X_batch, truncation=True, max_length=self.max_length, padding='max_length', return_tensors='pt')
        # Move inputs to the correct device
        input_ids = X['input_ids'].to(self.device)
        attention_mask = X['attention_mask'].to(self.device)

        # Get the last hidden state from the RoBERTa model
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            last_hidden_state = outputs.last_hidden_state
        expanded_attention_mask = attention_mask.unsqueeze(-1)
        
        # Reshape the labels for the entire batch
        y_tensor = torch.tensor(y_batch, dtype=torch.float).unsqueeze(1)  # Shape: (batch_size, 1)
        y_tensor = y_tensor.unsqueeze(1).repeat(1, last_hidden_state.size(1), 1)  # Shape: (batch_size, sequence_length, num_classes)
        
        last_hidden_state[expanded_attention_mask.expand_as(last_hidden_state) == 0] = 0
        y_tensor[expanded_attention_mask == 0] = -1
        
        # Store precomputed batch in cache
        self.X_cache = last_hidden_state
        self.y_cache = y_tensor

    def __len__(self):
        return self.dataset_length
    
    def __getitem__(self, idx):
        batch_idx = idx // self.precompute_batches
        batch_start_idx = batch_idx * self.precompute_batches
        cur_idx = idx % self.precompute_batches
        if self.total_iters % self.precompute_batches == 0:
            self._precompute_batches(batch_start_idx)
        self.total_iters += 1

        if cur_idx >= len(self.X_cache):
            cur_idx = idx % len(self.X_cache)

        X = self.X_cache[cur_idx]
        y = self.y_cache[cur_idx]
        return X, y

In [4]:
from sklearn.model_selection import train_test_split

# Initialize tokenizer and model
TARGET_MODEL = "cardiffnlp/twitter-roberta-base-irony"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
pretrained_model = AutoModel.from_pretrained(TARGET_MODEL).to(device)
pretrained_model.eval()

# Assuming df is your DataFrame
train_df, test_df = train_test_split(df, stratify=df["overall"], test_size=0.2)
num_classes = 5
# Create datasets
trainset = PretrainedModelDataset(train_df, tokenizer, pretrained_model, num_classes, device, max_length=128)
testset = PretrainedModelDataset(test_df, tokenizer, pretrained_model, num_classes, device, max_length=128)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-irony were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-irony and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and i

In [5]:
from tools.setting.data_config import DataConfig
from tools.setting.ml_params import MLParameters
from trainer_hub import TrainerHub
import torch

data_config = DataConfig(dataset_name = 'amazon_reviews', task_type='ordinal_regression', obs_shape=[pretrained_model.config.hidden_size], label_size=num_classes)

#  Set training configuration from the AlgorithmConfig class, returning them as a Namespace object.
ml_params = MLParameters(core_model = 'gpt', encoder_model = 'none')

# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

# Initialize the TrainerHub class with the training configuration, data configuration, device, and use_print and use_wandb flags
trainer_hub = TrainerHub(ml_params, data_config, device, use_print=True, use_wandb=False) 

In [6]:
trainer_hub.train(trainset, testset)

Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[0/100][50/61][Time 25.07]
Unified LR across all optimizers: 0.0001995308238189185
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0169	Gen: 0.4286	Rec: 0.4285	E: 0.0238	R: 0.0233	P: 0.8258
--------------------Test Metrics------------------------
accuracy: 0.0766
precision: 0.0889
recall: 0.1731
f1_score: 0.0401



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[1/100][39/61][Time 24.44]
Unified LR across all optimizers: 0.00019907191565870155
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0044	Gen: 0.3694	Rec: 0.3691	E: 0.0070	R: 0.0062	P: 0.7192
--------------------Test Metrics------------------------
accuracy: 0.6922
precision: 0.2060
recall: 0.2457
f1_score: 0.2157



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[2/100][28/61][Time 24.30]
Unified LR across all optimizers: 0.00019861406295796434
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0031	Gen: 0.3461	Rec: 0.3459	E: 0.0046	R: 0.0043	P: 0.6745
--------------------Test Metrics------------------------
accuracy: 0.7531
precision: 0.2204
recall: 0.2468
f1_score: 0.2315



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[3/100][17/61][Time 24.46]
Unified LR across all optimizers: 0.00019815726328921765
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0021	Gen: 0.3270	Rec: 0.3269	E: 0.0032	R: 0.0031	P: 0.6382
--------------------Test Metrics------------------------
accuracy: 0.7281
precision: 0.2149
recall: 0.2183
f1_score: 0.2165



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[4/100][6/61][Time 24.42]
Unified LR across all optimizers: 0.00019770151423055492
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0016	Gen: 0.3154	Rec: 0.3154	E: 0.0024	R: 0.0023	P: 0.6170
--------------------Test Metrics------------------------
accuracy: 0.7797
precision: 0.2223
recall: 0.2477
f1_score: 0.2339

[4/100][56/61][Time 24.39]
Unified LR across all optimizers: 0.00019724681336564005
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0013	Gen: 0.3050	Rec: 0.3049	E: 0.0020	R: 0.0020	P: 0.5963
--------------------Test Metrics------------------------
accuracy: 0.7312
precision: 0.2133
recall: 0.2497
f1_score: 0.2232



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]