'''
Author:
        
        PARK, JunHo, junho@ccnets.org

        
        KIM, JoengYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.
'''

In [1]:
import sys
path_append = "../../"
sys.path.append(path_append)  # Go up one directory from where you are.

from nn.utils.init import set_random_seed
set_random_seed(0)

In [2]:
# https://www.kaggle.com/datasets/tarkkaanko/amazon/

import pandas as pd
df = pd.read_csv(path_append + '../data/Amazon reviews/amazon_reviews.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
0,0,,4.0,No issues.,2014-07-23,138,0,0,0,0,0.0,0.0
1,1,0mie,5.0,"Purchased this for my device, it worked as adv...",2013-10-25,409,0,0,0,0,0.0,0.0
2,2,1K3,4.0,it works as expected. I should have sprung for...,2012-12-23,715,0,0,0,0,0.0,0.0
3,3,1m2,5.0,This think has worked out great.Had a diff. br...,2013-11-21,382,0,0,0,0,0.0,0.0
4,4,2&amp;1/2Men,5.0,"Bought it with Retail Packaging, arrived legit...",2013-07-13,513,0,0,0,0,0.0,0.0


In [3]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel

class PretrainedModelDataset(Dataset):
    def __init__(self, df, tokenizer, model, num_classes, device, max_length=512, precompute_batches=64, **kwargs):
        self.df = df.copy()
        self.df['overall'] -= 1  # Shift labels to 0-4 range
        self.model = model
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.num_classes = num_classes
        self.device = device
        self.precompute_batches = precompute_batches
        self.X_cache = None
        self.y_cache = None
        self.total_iters = 0
        self.dataset_length = len(self.df)
        # Efficiently select a random subset of indices
        self._shuffle_indices()
        self._precompute_batches(0)
        
    def _shuffle_indices(self):
        self.batch_indices = torch.randperm(len(self.df))
        
    def _precompute_batches(self, start_idx):
        end_idx = min(start_idx + self.precompute_batches, self.dataset_length)
        batch_indices = self.batch_indices[start_idx:end_idx].tolist()

        # Gather batch data
        X_batch = [self.df.iloc[i]["reviewText"] for i in batch_indices]
        y_batch = [self.df.iloc[i]["overall"] for i in batch_indices]

        # Ensure all elements in X_batch are strings
        X_batch = [str(x) for x in X_batch]

        # Ensure y_batch contains valid numeric values
        y_batch = [float(y) for y in y_batch]
        
        # Tokenize the batch
        X = self.tokenizer(X_batch, truncation=True, max_length=self.max_length, padding='max_length', return_tensors='pt')
        # Move inputs to the correct device
        input_ids = X['input_ids'].to(self.device)
        attention_mask = X['attention_mask'].to(self.device)

        # Get the last hidden state from the RoBERTa model
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            last_hidden_state = outputs.hidden_states[0]
            
        masked_hidden_states = last_hidden_state * attention_mask.unsqueeze(-1)
        # Reshape the labels for the entire batch
        y_tensor = torch.tensor(y_batch, dtype=torch.float).unsqueeze(1)  # Shape: (batch_size, 1)
        y_tensor = y_tensor.unsqueeze(1).repeat(1, last_hidden_state.size(1), 1)  # Shape: (batch_size, sequence_length, num_classes)
        y_tensor[attention_mask.unsqueeze(-1) > 0] = -1
        
        # Store precomputed batch in cache
        self.X_cache = masked_hidden_states
        self.y_cache = y_tensor

    def __len__(self):
        return self.dataset_length
    
    def __getitem__(self, idx):
        batch_idx = idx // self.precompute_batches
        batch_start_idx = batch_idx * self.precompute_batches
        cur_idx = idx % self.precompute_batches

        if self.total_iters % self.dataset_length == 0:
            self._shuffle_indices()

        if self.total_iters % self.precompute_batches == 0:
            self._precompute_batches(batch_start_idx)

        self.total_iters += 1

        if cur_idx >= len(self.X_cache):
            cur_idx = idx % len(self.X_cache)

        X = self.X_cache[cur_idx]
        y = self.y_cache[cur_idx]
        return X, y

In [4]:
from sklearn.model_selection import train_test_split
from transformers import AutoModel, AutoConfig
# Initialize tokenizer and model
TARGET_MODEL = "cardiffnlp/twitter-roberta-base-irony"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
# Assuming the model and tokenizer are already loaded correctly
config = AutoConfig.from_pretrained(TARGET_MODEL, output_hidden_states=True)
pretrained_model = AutoModel.from_pretrained(TARGET_MODEL, config=config)
pretrained_model.to(device)
pretrained_model.eval()

# Assuming df is your DataFrame
train_df, test_df = train_test_split(df, stratify=df["overall"], test_size=0.2)
num_classes = 1
# Create datasets
trainset = PretrainedModelDataset(train_df, tokenizer, pretrained_model, num_classes, device, max_length=128)
testset = PretrainedModelDataset(test_df, tokenizer, pretrained_model, num_classes, device, max_length=128)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-irony were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-irony and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and i

In [5]:
from tools.setting.data_config import DataConfig
from tools.setting.ml_params import MLParameters
from trainer_hub import TrainerHub
import torch

data_config = DataConfig(dataset_name = 'amazon_reviews', task_type='regression', obs_shape=[pretrained_model.config.hidden_size], label_size=num_classes)

#  Set training configuration from the AlgorithmConfig class, returning them as a Namespace object.
ml_params = MLParameters(ccnet_network = 'gpt', encoder_network = 'none')

# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

# Initialize the TrainerHub class with the training configuration, data configuration, device, and use_print and use_wandb flags
trainer_hub = TrainerHub(ml_params, data_config, device, use_print=True, use_wandb=False) 

In [6]:
trainer_hub.train(trainset, testset)

Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[0/100][50/61][Time 25.14]
Unified LR across all optimizers: 0.0001995308238189185
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0226	Gen: 0.1570	Rec: 0.1557	E: 0.0239	R: 0.0212	P: 0.2901
--------------------Test Metrics------------------------
mse: 4.2345
mae: 1.0015
r2: -0.6551



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[1/100][39/61][Time 24.58]
Unified LR across all optimizers: 0.00019907191565870155
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0002	Gen: 0.0967	Rec: 0.0966	E: 0.0003	R: 0.0002	P: 0.1931
--------------------Test Metrics------------------------
mse: 4.7987
mae: 1.0904
r2: -0.7910



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[2/100][28/61][Time 24.45]
Unified LR across all optimizers: 0.00019861406295796434
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0002	Gen: 0.0959	Rec: 0.0959	E: 0.0002	R: 0.0001	P: 0.1916
--------------------Test Metrics------------------------
mse: 3.9484
mae: 0.9371
r2: -0.5050



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[3/100][17/61][Time 24.50]
Unified LR across all optimizers: 0.00019815726328921765
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0002	Gen: 0.0917	Rec: 0.0917	E: 0.0002	R: 0.0001	P: 0.1833
--------------------Test Metrics------------------------
mse: 5.8245
mae: 1.3953
r2: -1.0727



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[4/100][6/61][Time 24.54]
Unified LR across all optimizers: 0.00019770151423055492
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.0902	Rec: 0.0901	E: 0.0002	R: 0.0001	P: 0.1802
--------------------Test Metrics------------------------
mse: 3.6596
mae: 0.8929
r2: -0.2932

[4/100][56/61][Time 24.47]
Unified LR across all optimizers: 0.00019724681336564005
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.0879	Rec: 0.0879	E: 0.0001	R: 0.0001	P: 0.1757
--------------------Test Metrics------------------------
mse: 1.0243
mae: 0.5850
r2: 0.6427



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[5/100][45/61][Time 24.59]
Unified LR across all optimizers: 0.00019679315828369438
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.0848	Rec: 0.0848	E: 0.0001	R: 0.0001	P: 0.1694
--------------------Test Metrics------------------------
mse: 0.7503
mae: 0.3393
r2: 0.7183



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[6/100][34/61][Time 24.54]
Unified LR across all optimizers: 0.00019634054657948372
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.0812	Rec: 0.0811	E: 0.0001	R: 0.0001	P: 0.1622
--------------------Test Metrics------------------------
mse: 0.6480
mae: 0.2872
r2: 0.7694



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[7/100][23/61][Time 24.48]
Unified LR across all optimizers: 0.00019588897585330582
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.0769	Rec: 0.0769	E: 0.0001	R: 0.0000	P: 0.1538
--------------------Test Metrics------------------------
mse: 0.7733
mae: 0.3063
r2: 0.7359



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[8/100][12/61][Time 24.50]
Unified LR across all optimizers: 0.00019543844371097777
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.0768	Rec: 0.0768	E: 0.0001	R: 0.0000	P: 0.1535
--------------------Test Metrics------------------------
mse: 0.5853
mae: 0.2631
r2: 0.7805



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[9/100][1/61][Time 24.43]
Unified LR across all optimizers: 0.00019498894776382288
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.0729	Rec: 0.0729	E: 0.0001	R: 0.0000	P: 0.1458
--------------------Test Metrics------------------------
mse: 0.8038
mae: 0.2990
r2: 0.7195

[9/100][51/61][Time 24.50]
Unified LR across all optimizers: 0.00019454048562865856
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.0712	Rec: 0.0712	E: 0.0001	R: 0.0000	P: 0.1423
--------------------Test Metrics------------------------
mse: 0.7403
mae: 0.3048
r2: 0.7481



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[10/100][40/61][Time 24.57]
Unified LR across all optimizers: 0.00019409305492778308
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.0713	Rec: 0.0713	E: 0.0001	R: 0.0000	P: 0.1425
--------------------Test Metrics------------------------
mse: 0.4320
mae: 0.2175
r2: 0.8446



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[11/100][29/61][Time 24.48]
Unified LR across all optimizers: 0.00019364665328896346
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.0673	Rec: 0.0673	E: 0.0001	R: 0.0001	P: 0.1345
--------------------Test Metrics------------------------
mse: 0.5826
mae: 0.2483
r2: 0.7994



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[12/100][18/61][Time 24.68]
Unified LR across all optimizers: 0.00019320127834542263
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.0674	Rec: 0.0673	E: 0.0001	R: 0.0000	P: 0.1346
--------------------Test Metrics------------------------
mse: 0.7422
mae: 0.2998
r2: 0.7362



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[13/100][7/61][Time 24.39]
Unified LR across all optimizers: 0.00019275692773582703
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.0666	Rec: 0.0666	E: 0.0001	R: 0.0000	P: 0.1332
--------------------Test Metrics------------------------
mse: 0.7531
mae: 0.3023
r2: 0.7227

[13/100][57/61][Time 23.45]
Unified LR across all optimizers: 0.0001923135991042739
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0000	Gen: 0.0660	Rec: 0.0660	E: 0.0000	R: 0.0000	P: 0.1320
--------------------Test Metrics------------------------
mse: 0.5588
mae: 0.2574
r2: 0.7901



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[14/100][46/61][Time 24.74]
Unified LR across all optimizers: 0.0001918712901002789
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0000	Gen: 0.0616	Rec: 0.0616	E: 0.0001	R: 0.0000	P: 0.1232
--------------------Test Metrics------------------------
mse: 0.7000
mae: 0.2943
r2: 0.7595



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[15/100][35/61][Time 24.97]
Unified LR across all optimizers: 0.00019142999837876384
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.0603	Rec: 0.0603	E: 0.0001	R: 0.0000	P: 0.1206
--------------------Test Metrics------------------------
mse: 0.7528
mae: 0.3138
r2: 0.7372



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[16/100][24/61][Time 24.59]
Unified LR across all optimizers: 0.00019098972160004388
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.0599	Rec: 0.0599	E: 0.0001	R: 0.0001	P: 0.1198
--------------------Test Metrics------------------------
mse: 0.7073
mae: 0.2927
r2: 0.7434



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[17/100][13/61][Time 24.56]
Unified LR across all optimizers: 0.00019055045742981543
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.0591	Rec: 0.0591	E: 0.0001	R: 0.0000	P: 0.1181
--------------------Test Metrics------------------------
mse: 0.6742
mae: 0.2728
r2: 0.7727



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[18/100][2/61][Time 24.60]
Unified LR across all optimizers: 0.00019011220353914353
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0000	Gen: 0.0600	Rec: 0.0600	E: 0.0000	R: 0.0000	P: 0.1200
--------------------Test Metrics------------------------
mse: 0.6094
mae: 0.2585
r2: 0.7615

[18/100][52/61][Time 24.46]
Unified LR across all optimizers: 0.00018967495760444968
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0000	Gen: 0.0575	Rec: 0.0575	E: 0.0001	R: 0.0000	P: 0.1150
--------------------Test Metrics------------------------
mse: 0.7293
mae: 0.2945
r2: 0.7202



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]