'''
Author:
        
        PARK, JunHo, junho@ccnets.org

        
        KIM, JoengYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.
'''

In [1]:
import sys
path_append = "../"
sys.path.append(path_append)  # Go up one directory from where you are.

from nn.utils.init import set_random_seed
set_random_seed(0)

In [2]:
# https://www.kaggle.com/datasets/tarkkaanko/amazon/

import pandas as pd
df = pd.read_csv(path_append + '../data/Amazon reviews/amazon_reviews.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
0,0,,4.0,No issues.,2014-07-23,138,0,0,0,0,0.0,0.0
1,1,0mie,5.0,"Purchased this for my device, it worked as adv...",2013-10-25,409,0,0,0,0,0.0,0.0
2,2,1K3,4.0,it works as expected. I should have sprung for...,2012-12-23,715,0,0,0,0,0.0,0.0
3,3,1m2,5.0,This think has worked out great.Had a diff. br...,2013-11-21,382,0,0,0,0,0.0,0.0
4,4,2&amp;1/2Men,5.0,"Bought it with Retail Packaging, arrived legit...",2013-07-13,513,0,0,0,0,0.0,0.0


In [3]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel

class PretrainedModelDataset(Dataset):
    def __init__(self, df, tokenizer, model, num_classes, device, max_length=512, precompute_batches=64, **kwargs):
        self.df = df.copy()
        self.model = model
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.num_classes = num_classes
        self.device = device
        self.precompute_batches = precompute_batches
        self.X_cache = None
        self.y_cache = None
        self.total_iters = 0
        self.dataset_length = len(self.df)
        # Efficiently select a random subset of indices
        self.batch_indices = torch.randperm(len(self.df))
        self._precompute_batches(0)
        
    def _precompute_batches(self, start_idx):
        end_idx = min(start_idx + self.precompute_batches, self.dataset_length)
        batch_indices = self.batch_indices[start_idx:end_idx].tolist()

        # Gather batch data
        X_batch = [self.df.iloc[i]["reviewText"] for i in batch_indices]
        y_batch = [self.df.iloc[i]["overall"] for i in batch_indices]

        # Ensure all elements in X_batch are strings
        X_batch = [str(x) for x in X_batch]

        # Ensure y_batch contains valid numeric values
        y_batch = [float(y) for y in y_batch]
        
        # Tokenize the batch
        X = self.tokenizer(X_batch, truncation=True, max_length=self.max_length, padding='max_length', return_tensors='pt')
        
        # Move inputs to the correct device
        input_ids = X['input_ids'].to(self.device)
        attention_mask = X['attention_mask'].to(self.device)

        # Get the last hidden state from the RoBERTa model
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            last_hidden_state = outputs.last_hidden_state

        # Reshape the labels for the entire batch
        y_tensor = torch.tensor(y_batch, dtype=torch.float).unsqueeze(1)  # Shape: (batch_size, 1)
        y_tensor = y_tensor.unsqueeze(1).repeat(1, last_hidden_state.size(1), 1)  # Shape: (batch_size, sequence_length, num_classes)
        # Store precomputed batch in cache
        self.X_cache = last_hidden_state
        self.y_cache = y_tensor

    def __len__(self):
        return self.dataset_length
    
    def __getitem__(self, idx):
        batch_idx = idx // self.precompute_batches
        batch_start_idx = batch_idx * self.precompute_batches
        cur_idx = idx % self.precompute_batches
        if self.total_iters % self.precompute_batches == 0:
            self._precompute_batches(batch_start_idx)
        self.total_iters += 1

        if cur_idx >= len(self.X_cache):
            cur_idx = idx % len(self.X_cache)

        X = self.X_cache[cur_idx]
        y = self.y_cache[cur_idx]
        return X, y

In [4]:
from sklearn.model_selection import train_test_split

# Initialize tokenizer and model
TARGET_MODEL = "cardiffnlp/twitter-roberta-base-irony"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(TARGET_MODEL, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
pretrained_model = AutoModel.from_pretrained(TARGET_MODEL).to(device)
pretrained_model.eval()

# Assuming df is your DataFrame
train_df, test_df = train_test_split(df, stratify=df["overall"], test_size=0.2)
num_classes = 1
# Create datasets
trainset = PretrainedModelDataset(train_df, tokenizer, pretrained_model, num_classes, device, max_length=128)
testset = PretrainedModelDataset(test_df, tokenizer, pretrained_model, num_classes, device, max_length=128)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-irony were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-irony and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and i

In [5]:
from tools.setting.data_config import DataConfig
from tools.setting.ml_params import MLParameters
from trainer_hub import TrainerHub
import torch

data_config = DataConfig(dataset_name = 'amazon_reviews', task_type='regression', obs_shape=[pretrained_model.config.hidden_size], label_size=num_classes)

#  Set training configuration from the AlgorithmConfig class, returning them as a Namespace object.
ml_params = MLParameters(core_model = 'gpt', encoder_model = 'none')

# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

# Initialize the TrainerHub class with the training configuration, data configuration, device, and use_print and use_wandb flags
trainer_hub = TrainerHub(ml_params, data_config, device, use_print=True, use_wandb=False) 

In [6]:
trainer_hub.train(trainset, testset)

Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[0/100][50/61][Time 24.83]
Unified LR across all optimizers: 0.0001995308238189185
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0044	Gen: 0.3810	Rec: 0.3811	E: 0.0044	R: 0.0045	P: 0.7576
--------------------Test Metrics------------------------
mse: 1.1372
mae: 0.5272
r2: -0.1733



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[1/100][39/61][Time 24.35]
Unified LR across all optimizers: 0.00019907191565870155
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0000	Gen: 0.2999	Rec: 0.2999	E: 0.0000	R: 0.0000	P: 0.5998
--------------------Test Metrics------------------------
mse: 1.1066
mae: 0.5122
r2: -0.1416



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[2/100][28/61][Time 24.27]
Unified LR across all optimizers: 0.00019861406295796434
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0000	Gen: 0.2760	Rec: 0.2760	E: 0.0000	R: 0.0000	P: 0.5520
--------------------Test Metrics------------------------
mse: 0.9779
mae: 0.4589
r2: -0.0089



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[3/100][17/61][Time 24.23]
Unified LR across all optimizers: 0.00019815726328921765
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0000	Gen: 0.2628	Rec: 0.2628	E: 0.0000	R: 0.0000	P: 0.5255
--------------------Test Metrics------------------------
mse: 1.0061
mae: 0.4354
r2: -0.0380



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[4/100][6/61][Time 23.49]
Unified LR across all optimizers: 0.00019770151423055492
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0000	Gen: 0.2541	Rec: 0.2541	E: 0.0000	R: 0.0000	P: 0.5081
--------------------Test Metrics------------------------
mse: 1.0294
mae: 0.4245
r2: -0.0621

[4/100][56/61][Time 23.17]
Unified LR across all optimizers: 0.00019724681336564005
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0000	Gen: 0.2463	Rec: 0.2463	E: 0.0000	R: 0.0000	P: 0.4925
--------------------Test Metrics------------------------
mse: 1.0728
mae: 0.4295
r2: -0.1068



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[5/100][45/61][Time 24.10]
Unified LR across all optimizers: 0.00019679315828369438
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0000	Gen: 0.2409	Rec: 0.2409	E: 0.0000	R: 0.0000	P: 0.4817
--------------------Test Metrics------------------------
mse: 1.1139
mae: 0.4584
r2: -0.1492



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[6/100][34/61][Time 24.76]
Unified LR across all optimizers: 0.00019634054657948372
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0000	Gen: 0.2347	Rec: 0.2347	E: 0.0000	R: 0.0000	P: 0.4693
--------------------Test Metrics------------------------
mse: 1.1052
mae: 0.4421
r2: -0.1402



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[7/100][23/61][Time 24.35]
Unified LR across all optimizers: 0.00019588897585330582
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0000	Gen: 0.2299	Rec: 0.2299	E: 0.0000	R: 0.0000	P: 0.4598
--------------------Test Metrics------------------------
mse: 1.0825
mae: 0.4257
r2: -0.1168



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[8/100][12/61][Time 24.41]
Unified LR across all optimizers: 0.00019543844371097777
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0000	Gen: 0.2269	Rec: 0.2269	E: 0.0000	R: 0.0000	P: 0.4537
--------------------Test Metrics------------------------
mse: 1.0276
mae: 0.4077
r2: -0.0602



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[9/100][1/61][Time 24.10]
Unified LR across all optimizers: 0.00019498894776382288
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.2215	Rec: 0.2215	E: 0.0001	R: 0.0001	P: 0.4430
--------------------Test Metrics------------------------
mse: 1.0087
mae: 0.4233
r2: -0.0406

[9/100][51/61][Time 24.19]
Unified LR across all optimizers: 0.00019454048562865856
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.2188	Rec: 0.2188	E: 0.0001	R: 0.0001	P: 0.4375
--------------------Test Metrics------------------------
mse: 1.0234
mae: 0.4057
r2: -0.0558



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[10/100][40/61][Time 24.30]
Unified LR across all optimizers: 0.00019409305492778308
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.2134	Rec: 0.2134	E: 0.0001	R: 0.0001	P: 0.4268
--------------------Test Metrics------------------------
mse: 0.9843
mae: 0.4083
r2: -0.0155



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[11/100][29/61][Time 24.40]
Unified LR across all optimizers: 0.00019364665328896346
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.2124	Rec: 0.2124	E: 0.0001	R: 0.0001	P: 0.4248
--------------------Test Metrics------------------------
mse: 0.9459
mae: 0.4230
r2: 0.0241



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[12/100][18/61][Time 23.27]
Unified LR across all optimizers: 0.00019320127834542263
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.2099	Rec: 0.2099	E: 0.0001	R: 0.0001	P: 0.4197
--------------------Test Metrics------------------------
mse: 0.9796
mae: 0.4250
r2: -0.0106



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[13/100][7/61][Time 24.01]
Unified LR across all optimizers: 0.00019275692773582703
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.2061	Rec: 0.2061	E: 0.0001	R: 0.0001	P: 0.4121
--------------------Test Metrics------------------------
mse: 0.9776
mae: 0.4381
r2: -0.0085

[13/100][57/61][Time 24.30]
Unified LR across all optimizers: 0.0001923135991042739
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0002	Gen: 0.2053	Rec: 0.2054	E: 0.0001	R: 0.0002	P: 0.4106
--------------------Test Metrics------------------------
mse: 1.1368
mae: 0.4359
r2: -0.1728



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[14/100][46/61][Time 23.35]
Unified LR across all optimizers: 0.0001918712901002789
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.2024	Rec: 0.2024	E: 0.0001	R: 0.0001	P: 0.4048
--------------------Test Metrics------------------------
mse: 1.0990
mae: 0.4072
r2: -0.1338



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[15/100][35/61][Time 23.63]
Unified LR across all optimizers: 0.00019142999837876384
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.1989	Rec: 0.1989	E: 0.0001	R: 0.0001	P: 0.3977
--------------------Test Metrics------------------------
mse: 1.0783
mae: 0.3985
r2: -0.1125



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[16/100][24/61][Time 24.70]
Unified LR across all optimizers: 0.00019098972160004388
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.1991	Rec: 0.1991	E: 0.0001	R: 0.0001	P: 0.3980
--------------------Test Metrics------------------------
mse: 1.0931
mae: 0.4032
r2: -0.1277



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[17/100][13/61][Time 24.48]
Unified LR across all optimizers: 0.00019055045742981543
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.1965	Rec: 0.1965	E: 0.0001	R: 0.0001	P: 0.3929
--------------------Test Metrics------------------------
mse: 1.0711
mae: 0.3986
r2: -0.1050



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[18/100][2/61][Time 24.39]
Unified LR across all optimizers: 0.00019011220353914353
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.1942	Rec: 0.1942	E: 0.0001	R: 0.0001	P: 0.3883
--------------------Test Metrics------------------------
mse: 1.1054
mae: 0.4110
r2: -0.1404

[18/100][52/61][Time 24.32]
Unified LR across all optimizers: 0.00018967495760444968
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.1944	Rec: 0.1944	E: 0.0001	R: 0.0001	P: 0.3886
--------------------Test Metrics------------------------
mse: 1.0502
mae: 0.4175
r2: -0.0835



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[19/100][41/61][Time 24.43]
Unified LR across all optimizers: 0.00018923871730749947
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.1928	Rec: 0.1928	E: 0.0001	R: 0.0001	P: 0.3855
--------------------Test Metrics------------------------
mse: 1.0946
mae: 0.4015
r2: -0.1293



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[20/100][30/61][Time 24.38]
Unified LR across all optimizers: 0.00018880348033539028
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.1899	Rec: 0.1899	E: 0.0001	R: 0.0001	P: 0.3796
--------------------Test Metrics------------------------
mse: 1.0959
mae: 0.4019
r2: -0.1306



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[21/100][19/61][Time 24.23]
Unified LR across all optimizers: 0.00018836924438053897
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.1884	Rec: 0.1884	E: 0.0001	R: 0.0001	P: 0.3767
--------------------Test Metrics------------------------
mse: 1.1316
mae: 0.4421
r2: -0.1675



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[22/100][8/61][Time 24.27]
Unified LR across all optimizers: 0.0001879360071406698
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.1875	Rec: 0.1875	E: 0.0001	R: 0.0001	P: 0.3749
--------------------Test Metrics------------------------
mse: 1.0849
mae: 0.3963
r2: -0.1193

[22/100][58/61][Time 23.45]
Unified LR across all optimizers: 0.000187503766318802
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0001	Gen: 0.1859	Rec: 0.1859	E: 0.0001	R: 0.0001	P: 0.3717
--------------------Test Metrics------------------------
mse: 1.1028
mae: 0.4100
r2: -0.1377



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[23/100][47/61][Time 23.26]
Unified LR across all optimizers: 0.00018707251962323787
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0002	Gen: 0.1844	Rec: 0.1844	E: 0.0002	R: 0.0002	P: 0.3687
--------------------Test Metrics------------------------
mse: 1.0866
mae: 0.3990
r2: -0.1210



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[24/100][36/61][Time 23.22]
Unified LR across all optimizers: 0.0001866422647675502
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0002	Gen: 0.1842	Rec: 0.1842	E: 0.0002	R: 0.0002	P: 0.3683
--------------------Test Metrics------------------------
mse: 1.0403
mae: 0.4153
r2: -0.0733



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[25/100][25/61][Time 23.23]
Unified LR across all optimizers: 0.00018621299947057073
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0002	Gen: 0.1849	Rec: 0.1849	E: 0.0002	R: 0.0002	P: 0.3696
--------------------Test Metrics------------------------
mse: 1.0755
mae: 0.3984
r2: -0.1096



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[26/100][14/61][Time 23.22]
Unified LR across all optimizers: 0.00018578472145637737
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0002	Gen: 0.1821	Rec: 0.1821	E: 0.0002	R: 0.0002	P: 0.3641
--------------------Test Metrics------------------------
mse: 1.0153
mae: 0.4341
r2: -0.0474



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[27/100][3/61][Time 24.87]
Unified LR across all optimizers: 0.00018535742845428288
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0002	Gen: 0.1813	Rec: 0.1813	E: 0.0002	R: 0.0002	P: 0.3625
--------------------Test Metrics------------------------
mse: 1.0745
mae: 0.4027
r2: -0.1085

[27/100][53/61][Time 24.27]
Unified LR across all optimizers: 0.00018493111819882223
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0002	Gen: 0.1803	Rec: 0.1803	E: 0.0002	R: 0.0002	P: 0.3604
--------------------Test Metrics------------------------
mse: 1.0939
mae: 0.4224
r2: -0.1286



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[28/100][42/61][Time 24.36]
Unified LR across all optimizers: 0.00018450578842974107
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0002	Gen: 0.1791	Rec: 0.1791	E: 0.0002	R: 0.0002	P: 0.3580
--------------------Test Metrics------------------------
mse: 1.0679
mae: 0.3958
r2: -0.1017



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[29/100][31/61][Time 24.44]
Unified LR across all optimizers: 0.00018408143689198318
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0002	Gen: 0.1792	Rec: 0.1792	E: 0.0002	R: 0.0002	P: 0.3582
--------------------Test Metrics------------------------
mse: 0.9499
mae: 0.3990
r2: 0.0200



Iterations:   0%|          | 0/61 [00:00<?, ?it/s]

[30/100][20/61][Time 24.43]
Unified LR across all optimizers: 0.0001836580613356789
--------------------Training Metrics--------------------
Trainer:  gpt
Inf: 0.0002	Gen: 0.1783	Rec: 0.1783	E: 0.0002	R: 0.0002	P: 0.3563
--------------------Test Metrics------------------------
mse: 0.5849
mae: 0.3493
r2: 0.3966

