'''
Author:
        
        PARK, JunHo, junho@ccnets.org

        
        KIM, JoengYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.
'''

In [1]:
import sys
path_append = "../"
sys.path.append(path_append)  # Go up one directory from where you are.

from nn.utils.init import set_random_seed
set_random_seed(0)

In [2]:
# https://www.kaggle.com/datasets/tarkkaanko/amazon/

import pandas as pd
df = pd.read_csv(path_append + '../data/Amazon reviews/amazon_reviews.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
0,0,,4.0,No issues.,2014-07-23,138,0,0,0,0,0.0,0.0
1,1,0mie,5.0,"Purchased this for my device, it worked as adv...",2013-10-25,409,0,0,0,0,0.0,0.0
2,2,1K3,4.0,it works as expected. I should have sprung for...,2012-12-23,715,0,0,0,0,0.0,0.0
3,3,1m2,5.0,This think has worked out great.Had a diff. br...,2013-11-21,382,0,0,0,0,0.0,0.0
4,4,2&amp;1/2Men,5.0,"Bought it with Retail Packaging, arrived legit...",2013-07-13,513,0,0,0,0,0.0,0.0


In [3]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel

class PretrainedModelDataset(Dataset):
    def __init__(self, df, tokenizer, model, num_classes, device, max_length=512, precompute_batches=64, **kwargs):
        self.df = df.copy()
        self.df['overall'] -= 1  # Shift labels to 0-4 range
        self.model = model
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.num_classes = num_classes
        self.device = device
        self.precompute_batches = precompute_batches
        self.X_cache = None
        self.y_cache = None
        self.total_iters = 0
        self.dataset_length = len(self.df)
        # Efficiently select a random subset of indices
        self._shuffle_indices()
        self._precompute_batches(0)
        
    def _shuffle_indices(self):
        self.batch_indices = torch.randperm(len(self.df))
        
    def _precompute_batches(self, start_idx):
        end_idx = min(start_idx + self.precompute_batches, self.dataset_length)
        batch_indices = self.batch_indices[start_idx:end_idx].tolist()

        # Gather batch data
        X_batch = [self.df.iloc[i]["reviewText"] for i in batch_indices]
        y_batch = [self.df.iloc[i]["overall"] for i in batch_indices]

        # Ensure all elements in X_batch are strings
        X_batch = [str(x) for x in X_batch]

        # Ensure y_batch contains valid numeric values
        y_batch = [float(y) for y in y_batch]
        
        # Tokenize the batch
        X = self.tokenizer(X_batch, truncation=True, max_length=self.max_length, padding='max_length', return_tensors='pt')
        # Move inputs to the correct device
        input_ids = X['input_ids'].to(self.device)
        attention_mask = X['attention_mask'].to(self.device)

        # Get the last hidden state from the RoBERTa model
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            last_hidden_state = outputs.last_hidden_state
            
        masked_hidden_states = last_hidden_state * attention_mask.unsqueeze(-1)
        # Reshape the labels for the entire batch
        y_tensor = torch.tensor(y_batch, dtype=torch.float).unsqueeze(1)  # Shape: (batch_size, 1)
        y_tensor = y_tensor.unsqueeze(1).repeat(1, last_hidden_state.size(1), 1)  # Shape: (batch_size, sequence_length, num_classes)
        y_tensor[attention_mask.unsqueeze(-1) > 0] = -1
        
        # Store precomputed batch in cache
        self.X_cache = masked_hidden_states
        self.y_cache = y_tensor

    def __len__(self):
        return self.dataset_length
    
    def __getitem__(self, idx):
        batch_idx = idx // self.precompute_batches
        batch_start_idx = batch_idx * self.precompute_batches
        cur_idx = idx % self.precompute_batches

        if self.total_iters % self.dataset_length == 0:
            self._shuffle_indices()

        if self.total_iters % self.precompute_batches == 0:
            self._precompute_batches(batch_start_idx)

        self.total_iters += 1

        if cur_idx >= len(self.X_cache):
            cur_idx = idx % len(self.X_cache)

        X = self.X_cache[cur_idx]
        y = self.y_cache[cur_idx]
        return X, y

In [4]:
from sklearn.model_selection import train_test_split

# Initialize tokenizer and model
TARGET_MODEL = "cardiffnlp/twitter-roberta-base-irony"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
pretrained_model = AutoModel.from_pretrained(TARGET_MODEL).to(device)
pretrained_model.eval()

# Assuming df is your DataFrame
train_df, test_df = train_test_split(df, stratify=df["overall"], test_size=0.2)
num_classes = 5
# Create datasets
trainset = PretrainedModelDataset(train_df, tokenizer, pretrained_model, num_classes, device, max_length=128)
testset = PretrainedModelDataset(test_df, tokenizer, pretrained_model, num_classes, device, max_length=128)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-irony were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-irony and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and i

In [5]:
from tools.setting.data_config import DataConfig
from tools.setting.ml_params import MLParameters
from trainer_hub import TrainerHub
import torch

data_config = DataConfig(dataset_name = 'amazon_reviews', task_type='ordinal_regression', obs_shape=[pretrained_model.config.hidden_size], label_size=num_classes)

#  Set training configuration from the AlgorithmConfig class, returning them as a Namespace object.
ml_params = MLParameters(core_model = 'gpt', encoder_model = 'none')

# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

# Initialize the TrainerHub class with the training configuration, data configuration, device, and use_print and use_wandb flags
trainer_hub = TrainerHub(ml_params, data_config, device, use_print=True, use_wandb=False) 

In [6]:
trainer_hub.train(trainset, testset)

Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[0/100][50/61][Time 25.23]
Unified LR across all optimizers: 0.0001995308238189185
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0091	Gen: 0.2341	Rec: 0.2341	E: 0.0091	R: 0.0091	P: 0.4590
--------------------Test Metrics------------------------
accuracy: 0.7500
precision: 0.1658
recall: 0.2000
f1_score: 0.1813



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[1/100][39/61][Time 24.63]
Unified LR across all optimizers: 0.00019907191565870155
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0003	Gen: 0.1669	Rec: 0.1668	E: 0.0004	R: 0.0002	P: 0.3334
--------------------Test Metrics------------------------
accuracy: 0.7328
precision: 0.1640
recall: 0.1992
f1_score: 0.1799



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[2/100][28/61][Time 24.44]
Unified LR across all optimizers: 0.00019861406295796434
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0003	Gen: 0.1651	Rec: 0.1650	E: 0.0004	R: 0.0002	P: 0.3298
--------------------Test Metrics------------------------
accuracy: 0.7422
precision: 0.1658
recall: 0.2000
f1_score: 0.1813



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[3/100][17/61][Time 24.46]
Unified LR across all optimizers: 0.00019815726328921765
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0002	Gen: 0.1575	Rec: 0.1574	E: 0.0003	R: 0.0002	P: 0.3147
--------------------Test Metrics------------------------
accuracy: 0.7391
precision: 0.1665
recall: 0.2000
f1_score: 0.1817



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[4/100][6/61][Time 24.52]
Unified LR across all optimizers: 0.00019770151423055492
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.1533	Rec: 0.1533	E: 0.0001	R: 0.0001	P: 0.3064
--------------------Test Metrics------------------------
accuracy: 0.7391
precision: 0.1663
recall: 0.2000
f1_score: 0.1816

[4/100][56/61][Time 24.38]
Unified LR across all optimizers: 0.00019724681336564005
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.1488	Rec: 0.1488	E: 0.0001	R: 0.0001	P: 0.2975
--------------------Test Metrics------------------------
accuracy: 0.7422
precision: 0.1678
recall: 0.2000
f1_score: 0.1825



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[5/100][45/61][Time 24.51]
Unified LR across all optimizers: 0.00019679315828369438
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.1451	Rec: 0.1451	E: 0.0001	R: 0.0001	P: 0.2901
--------------------Test Metrics------------------------
accuracy: 0.7172
precision: 0.1602
recall: 0.2000
f1_score: 0.1779



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[6/100][34/61][Time 24.63]
Unified LR across all optimizers: 0.00019634054657948372
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.1409	Rec: 0.1409	E: 0.0002	R: 0.0001	P: 0.2817
--------------------Test Metrics------------------------
accuracy: 0.7406
precision: 0.1672
recall: 0.2000
f1_score: 0.1821



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[7/100][23/61][Time 24.45]
Unified LR across all optimizers: 0.00019588897585330582
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.1351	Rec: 0.1351	E: 0.0001	R: 0.0001	P: 0.2700
--------------------Test Metrics------------------------
accuracy: 0.7469
precision: 0.1689
recall: 0.2000
f1_score: 0.1831



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[8/100][12/61][Time 24.56]
Unified LR across all optimizers: 0.00019543844371097777
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.1363	Rec: 0.1363	E: 0.0001	R: 0.0001	P: 0.2725
--------------------Test Metrics------------------------
accuracy: 0.7563
precision: 0.1695
recall: 0.2000
f1_score: 0.1835

