In [2]:
import torch
if torch.cuda.is_available():
    print('GPU is available for model fine tuning.')
else:
    print('GPU is not available for model fine tuning. Look into this matter before continuing.')

GPU is available for model fine tuning.


# Generating csv dataset for fine tuning

In [None]:
import sys
from pathlib import Path

# Use the current working directory as a reference for the parent directory
current_directory = Path.cwd()
parent_directory = current_directory.parent

# Add parent directory to sys.path
sys.path.append(str(parent_directory))

import constants
from db.sql_db import DB
from llm import LLM

db = DB(database_location='../db/sqlite_storage/main.db')
llm = LLM(db, load_model_data_on_start=False)
llm.generate_dataset_for_llm_fine_tuning('./data.csv')

  from .autonotebook import tqdm as notebook_tqdm


Collection 'product' already exists.
2024-11-24 22:09:50,338 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-11-24 22:09:50,351 INFO sqlalchemy.engine.Engine SELECT recommendationfeedback.recommendation_feedback_id, recommendationfeedback.recommendation_id, recommendationfeedback.user_id, recommendationfeedback.rating, recommendationfeedback.created_at 
FROM recommendationfeedback 
WHERE recommendationfeedback.rating >= ?
2024-11-24 22:09:50,351 INFO sqlalchemy.engine.Engine [generated in 0.00049s] (3,)
2024-11-24 22:09:50,354 INFO sqlalchemy.engine.Engine ROLLBACK
2024-11-24 22:09:50,354 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-11-24 22:09:50,354 INFO sqlalchemy.engine.Engine SELECT recommendation.recommendation_id, recommendation.user_id, recommendation.product_id, recommendation.score, recommendation.created_at 
FROM recommendation 
WHERE recommendation.recommendation_id = ?
2024-11-24 22:09:50,359 INFO sqlalchemy.engine.Engine [generated in 0.00058s] (1,)
2024-11-24 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LEGION\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


2024-11-24 22:09:50,572 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-11-24 22:09:50,573 INFO sqlalchemy.engine.Engine SELECT product.product_id, product.category_id, product.name, product.description, product.price, product.stock, product.created_at, product.updated_at 
FROM product 
WHERE product.product_id IN (SELECT 1 FROM (SELECT 1) WHERE 1!=1)
2024-11-24 22:09:50,573 INFO sqlalchemy.engine.Engine [generated in 0.00064s] ()
2024-11-24 22:09:50,574 INFO sqlalchemy.engine.Engine ROLLBACK
2024-11-24 22:09:50,575 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-11-24 22:09:50,576 INFO sqlalchemy.engine.Engine SELECT recommendation.recommendation_id, recommendation.user_id, recommendation.product_id, recommendation.score, recommendation.created_at 
FROM recommendation 
WHERE recommendation.recommendation_id = ?
2024-11-24 22:09:50,576 INFO sqlalchemy.engine.Engine [cached since 0.2184s ago] (2,)
2024-11-24 22:09:50,577 INFO sqlalchemy.engine.Engine ROLLBACK
2024-11-24 22:09:50

# Data Preparation

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [7]:
df = pd.read_csv('./data.csv')
df.output = df.output.astype('str')
df

Unnamed: 0,input,output
0,Based on the customer profile and search histo...,"11,Novel Book"
1,Based on the customer profile and search histo...,"13,Cookbook"
2,Based on the customer profile and search histo...,"27,Xbox Series X"
3,Based on the customer profile and search histo...,"29,VR Headset"
4,Based on the customer profile and search histo...,"28,Gaming Chair"
...,...,...
579,Based on the customer profile and search histo...,"18,Drums"
580,Based on the customer profile and search histo...,"39,Camera"
581,Based on the customer profile and search histo...,"15,Children's Book"
582,Based on the customer profile and search histo...,"5,Tablet"


In [8]:
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)
(train_df.shape, valid_df.shape)

((467, 2), (117, 2))

In [9]:
# Convert dataframes to Dataset objects (for use in Hugging Face model)
import datasets
from datasets import Dataset

dataset_train = Dataset.from_pandas(train_df)
dataset_eval = Dataset.from_pandas(valid_df)
    
data_dict_dataset = datasets.DatasetDict({"train": dataset_train, "eval": dataset_eval})
data_dict_dataset
     


DatasetDict({
    train: Dataset({
        features: ['input', 'output', '__index_level_0__'],
        num_rows: 467
    })
    eval: Dataset({
        features: ['input', 'output', '__index_level_0__'],
        num_rows: 117
    })
})

In [12]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# tokenizer = AutoTokenizer.from_pretrained("t5-small")
# model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

MODEL_NAME = constants.LLM_NAME

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [13]:
max_input_length = tokenizer.model_max_length
max_target_length = 30  # Adjust as needed

def preprocess_function(examples):
    # Tokenize inputs (e.g., customer profile and search history)
    inputs = [doc for doc in examples["input"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding=True)

    # Tokenize targets (e.g., recommended product)
    targets = [doc for doc in examples["output"]]
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True, padding=True)

    # Assign the tokenized labels to model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Test preprocessing on first 2 rows
preprocess_function(data_dict_dataset["train"][:2])


{'input_ids': [[6719, 30, 8, 884, 3278, 11, 960, 892, 6, 1568, 8, 167, 2193, 556, 28, 8, 1910, 10, 843, 834, 23, 26, 6, 15892, 834, 4350, 45, 205, 9853, 26483, 21254, 5652, 4083, 6657, 329, 14920, 8015, 666, 10, 7327, 14226, 10, 27, 333, 7081, 11, 7868, 1277, 5, 4769, 5528, 10, 309, 5937, 7708, 7, 6, 1649, 7, 23, 8389, 4483, 7, 6, 4051, 18613, 6, 517, 265, 53, 23955, 6, 3174, 17, 2455, 21957, 6, 308, 5937, 7708, 7, 6, 25838, 5880, 6, 3174, 17, 2455, 21957, 6, 4051, 18613, 6, 1649, 7, 23, 8389, 4483, 7, 6, 1649, 7, 23, 8389, 4483, 7, 6, 3174, 17, 2455, 21957, 6, 4051, 18613, 6, 308, 5937, 7708, 7, 6, 23617, 1976, 205, 9853, 26483, 21254, 5652, 4083, 6657, 329, 14920, 8015, 10, 3, 2, 536, 10, 3, 31, 24656, 6399, 31, 6, 204, 10, 3, 31, 3612, 102, 2916, 31, 6, 220, 10, 3, 31, 24656, 9237, 31, 6, 314, 10, 3, 31, 13601, 15, 924, 262, 291, 11073, 7, 31, 6, 305, 10, 3, 31, 20354, 17, 31, 6, 431, 10, 3, 31, 308, 5937, 7708, 7, 31, 6, 489, 10, 3, 31, 382, 5236, 12415, 31, 6, 505, 10, 3, 31, 476,

In [14]:
# Tokenize train and eval datasets
tokenized_datasets = data_dict_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/467 [00:00<?, ? examples/s]

Map: 100%|██████████| 467/467 [00:00<00:00, 1240.67 examples/s]
Map: 100%|██████████| 117/117 [00:00<00:00, 1171.20 examples/s]


In [15]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input', 'output', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 467
    })
    eval: Dataset({
        features: ['input', 'output', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 117
    })
})

In [16]:
# Instantiate Data Loader for train and eval sets
# Adjust batch sizes as necessary

from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=10, collate_fn=data_collator
)

eval_dataloader = DataLoader(
    tokenized_datasets["eval"], batch_size=10, collate_fn=data_collator
)
     

len(train_dataloader)
     


47

# Fine tuning

In [17]:
### Select Optimizer (for regularization)

from transformers import get_scheduler
from torch.optim import AdamW

learning_rate = 1e-4
optimizer = AdamW(model.parameters(), lr=learning_rate)

num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

235


In [18]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Instantiate training arguments object
batch_size = 4
args = Seq2SeqTrainingArguments(
    "./t5_recommendation",
    # push_to_hub=True, # Comment out if you don't want to push to Hugging Face Hub
    eval_strategy = "epoch",
    learning_rate = 1e-4,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    weight_decay = 0.01,
    save_total_limit = 3,
    num_train_epochs = 10, # Try 5-10 epochs; results may vary
    predict_with_generate = True,
    gradient_accumulation_steps = 4,
    eval_accumulation_steps = 4,
)
   

In [19]:
import evaluate  # Import the 'evaluate' library instead of 'datasets'

# Load your evaluation metric (e.g., ROUGE, BLEU, etc.)
metric = evaluate.load("rouge")  # Example: for ROUGE
metric

EvaluationModule(name: "rouge", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id=None)}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLsum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/

In [20]:
# Functions for further preprocessing and metrics computation
import numpy as np

def postprocess_text(preds, labels):
  preds = [pred.strip() for pred in preds]
  labels = [[label.strip()] for label in labels]

  return preds, labels

def compute_metrics(eval_preds):
  preds, labels = eval_preds
  if isinstance(preds, tuple):
    preds = preds[0]
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

  # Replace -100 in the labes as we can't decode them.
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Some simple post processing
  decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

  result = metric.compute(predictions = decoded_preds, references = decoded_labels)
  # result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
  result = {key: value * 100 for key, value in result.items()}

  prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
  result["gen_len"] = np.mean(prediction_lens)

  return result

In [21]:
# Instantiate Trainer object (for fine-tuning)
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["eval"],
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
    optimizers = (optimizer, lr_scheduler)
)

  trainer = Seq2SeqTrainer(


In [22]:
# Train time should take a few minutes or less if on GPU
# Can take up to several hours if on CPU
trainer.train()

  0%|          | 0/290 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated.

{'eval_loss': 3.037708044052124, 'eval_rouge1': 2.9914529914529915, 'eval_rouge2': 0.0, 'eval_rougeL': 2.9914529914529915, 'eval_rougeLsum': 2.9914529914529915, 'eval_gen_len': 4.051282051282051, 'eval_runtime': 3.4986, 'eval_samples_per_second': 33.442, 'eval_steps_per_second': 8.575, 'epoch': 0.99}


 20%|██        | 58/290 [00:29<01:39,  2.32it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Traine

{'eval_loss': 2.1880745887756348, 'eval_rouge1': 17.94871794871795, 'eval_rouge2': 17.94871794871795, 'eval_rougeL': 17.94871794871795, 'eval_rougeLsum': 17.94871794871795, 'eval_gen_len': 8.0, 'eval_runtime': 5.2773, 'eval_samples_per_second': 22.17, 'eval_steps_per_second': 5.685, 'epoch': 1.98}


 30%|███       | 87/290 [00:46<01:25,  2.38it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Traine

{'eval_loss': 1.5361202955245972, 'eval_rouge1': 18.37606837606838, 'eval_rouge2': 17.94871794871795, 'eval_rougeL': 18.37606837606838, 'eval_rougeLsum': 18.37606837606838, 'eval_gen_len': 7.948717948717949, 'eval_runtime': 4.7534, 'eval_samples_per_second': 24.614, 'eval_steps_per_second': 6.311, 'epoch': 2.97}


 40%|████      | 117/290 [01:04<01:13,  2.35it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Train

{'eval_loss': 1.198074460029602, 'eval_rouge1': 16.794871794871796, 'eval_rouge2': 14.529914529914532, 'eval_rougeL': 16.58119658119658, 'eval_rougeLsum': 16.752136752136753, 'eval_gen_len': 7.6581196581196584, 'eval_runtime': 5.1277, 'eval_samples_per_second': 22.817, 'eval_steps_per_second': 5.851, 'epoch': 4.0}


 50%|█████     | 146/290 [01:22<01:02,  2.31it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Train

{'eval_loss': 0.9742584228515625, 'eval_rouge1': 18.37606837606838, 'eval_rouge2': 17.94871794871795, 'eval_rougeL': 18.37606837606838, 'eval_rougeLsum': 18.37606837606838, 'eval_gen_len': 7.948717948717949, 'eval_runtime': 5.0262, 'eval_samples_per_second': 23.278, 'eval_steps_per_second': 5.969, 'epoch': 4.99}


 60%|██████    | 175/290 [01:39<00:48,  2.36it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Train

{'eval_loss': 0.8380650281906128, 'eval_rouge1': 18.37606837606838, 'eval_rouge2': 17.94871794871795, 'eval_rougeL': 18.37606837606838, 'eval_rougeLsum': 18.37606837606838, 'eval_gen_len': 7.965811965811966, 'eval_runtime': 4.7541, 'eval_samples_per_second': 24.61, 'eval_steps_per_second': 6.31, 'epoch': 5.98}


 70%|███████   | 204/290 [01:57<00:36,  2.37it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Train

{'eval_loss': 0.7561094164848328, 'eval_rouge1': 18.37606837606838, 'eval_rouge2': 17.94871794871795, 'eval_rougeL': 18.37606837606838, 'eval_rougeLsum': 18.37606837606838, 'eval_gen_len': 7.965811965811966, 'eval_runtime': 5.0409, 'eval_samples_per_second': 23.21, 'eval_steps_per_second': 5.951, 'epoch': 6.97}


 81%|████████  | 234/290 [02:14<00:23,  2.40it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Train

{'eval_loss': 0.7298159003257751, 'eval_rouge1': 18.37606837606838, 'eval_rouge2': 17.94871794871795, 'eval_rougeL': 18.37606837606838, 'eval_rougeLsum': 18.37606837606838, 'eval_gen_len': 7.948717948717949, 'eval_runtime': 4.7615, 'eval_samples_per_second': 24.572, 'eval_steps_per_second': 6.301, 'epoch': 8.0}


 91%|█████████ | 263/290 [02:33<00:22,  1.20it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Train

{'eval_loss': 0.729771077632904, 'eval_rouge1': 18.37606837606838, 'eval_rouge2': 17.94871794871795, 'eval_rougeL': 18.37606837606838, 'eval_rougeLsum': 18.37606837606838, 'eval_gen_len': 7.948717948717949, 'eval_runtime': 11.2439, 'eval_samples_per_second': 10.406, 'eval_steps_per_second': 2.668, 'epoch': 8.99}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.729771077632904, 'eval_rouge1': 18.37606837606838, 'eval_rouge2': 17.94871794871795, 'eval_rougeL': 18.37606837606838, 'eval_rougeLsum': 18.37606837606838, 'eval_gen_len': 7.948717948717949, 'eval_runtime': 4.9934, 'eval_samples_per_second': 23.431, 'eval_steps_per_second': 6.008, 'epoch': 9.91}
{'train_runtime': 183.8173, 'train_samples_per_second': 25.406, 'train_steps_per_second': 1.578, 'train_loss': 2.2622106091729526, 'epoch': 9.91}





TrainOutput(global_step=290, training_loss=2.2622106091729526, metrics={'train_runtime': 183.8173, 'train_samples_per_second': 25.406, 'train_steps_per_second': 1.578, 'total_flos': 860858927284224.0, 'train_loss': 2.2622106091729526, 'epoch': 9.914529914529915})

In [23]:
# Save the trained model
model.save_pretrained(constants.LLM_FINE_TUNED_SAVE_PATH.split('/')[-1])

In [24]:
# Save the tokenizer (vocab, special tokens, etc.)
tokenizer.save_pretrained(constants.LLM_FINE_TUNED_TOKENIZER_PATH.split('/')[-1])

('fine-tuned-tokenizer\\tokenizer_config.json',
 'fine-tuned-tokenizer\\special_tokens_map.json',
 'fine-tuned-tokenizer\\spiece.model',
 'fine-tuned-tokenizer\\added_tokens.json')

# Collect evaluation data predictions

In [25]:
%%time
# Try predictions on validation set for confirmation
predictions = trainer.predict(tokenized_datasets["eval"])

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  0%|          | 0/30 [00:00<?, ?it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenize

CPU times: total: 1.22 s
Wall time: 5.41 s





In [26]:
predictions

PredictionOutput(predictions=array([[   0, 8537,    6, ...,    0,    0,    0],
       [   0, 8537,    6, ...,    0,    0,    0],
       [   0, 8537,    6, ...,    0,    0,    0],
       ...,
       [   0, 8537,    6, ...,    0,    0,    0],
       [   0, 8537,    6, ...,    0,    0,    0],
       [   0, 8537,    6, ...,    0,    0,    0]]), label_ids=array([[11940,  9105,    23, ...,     0,     0,     0],
       [12370,   553,    23, ...,     0,     0,     0],
       [11558, 23055, 23105, ...,     0,     0,     0],
       ...,
       [ 3479,     6,   134, ...,     0,     0,     0],
       [ 8580,   308,  5937, ...,     0,     0,     0],
       [14141,     4,  2689, ...,     0,     0,     0]]), metrics={'test_loss': 0.729771077632904, 'test_rouge1': 18.37606837606838, 'test_rouge2': 17.94871794871795, 'test_rougeL': 18.37606837606838, 'test_rougeLsum': 18.37606837606838, 'test_gen_len': 7.948717948717949, 'test_runtime': 5.4037, 'test_samples_per_second': 21.652, 'test_steps_per_second'

In [28]:
# Convert tokens from data to text
def translate(tokens):
  my_list = tokenizer.convert_ids_to_tokens(tokens)
  new_list = [token for token in my_list if ('<' not in token)] # token != '' and token != ''
  new_string = ''.join(new_list)
  new_string = new_string.replace("▁", " ")
  # new_string = new_string.replace("_", " ")
  new_string = new_string.strip()
  return new_string
     

# Print sample predicted output
index = 31
print(tokenized_datasets["eval"]["input"][index])
print("Target product: ", tokenized_datasets["eval"]["output"][index])
print("Recommended product: ", translate(predictions.predictions[index]))
     


Based on the customer profile and search history, recommend the most relevant product with the format: key_id,product_name from CANDIDATES FOR RECOMMENDATION below:
Customer Profile: 
Search History: Suitcase,Dumbbells,Running Shoes,Sunglasses,Headphones,Phone Case,Protein Powder,Camera,Cooking Tools,Cooking Tools,Travel Adapter,Wireless Earbuds,Microwave
CANDIDATES FOR RECOMMENDATION: {1: 'Smartphone', 2: 'Laptop', 3: 'Smartwatch', 4: 'Wireless Earbuds', 5: 'Tablet', 6: 'Dumbbells', 7: 'Treadmill', 8: 'Yoga Mat', 9: 'Resistance Bands', 10: 'Protein Powder', 11: 'Novel Book', 12: 'Textbook', 13: 'Cookbook', 14: 'Biography', 15: "Children's Book", 16: 'Guitar', 17: 'Piano', 18: 'Drums', 19: 'Violin', 20: 'Headphones', 21: 'Jacket', 22: 'Sneakers', 23: 'Watch', 24: 'Handbag', 25: 'Sunglasses', 26: 'PlayStation 5', 27: 'Xbox Series X', 28: 'Gaming Chair', 29: 'VR Headset', 30: 'Game Controller', 31: 'Sofa', 32: 'Dining Table', 33: 'Lamp', 34: 'Coffee Table', 35: 'Wall Art', 36: 'Backpack'

In [29]:
# Collect generated outputs and join with prompts and targets
model_generated = []
prompt_list = []
target_list = []

for i in range(len(predictions.predictions)):
  model_generated.append(translate(predictions.predictions[i]))

  prompt_list.append(dataset_eval['input'][i])
  target_list.append(dataset_eval['output'][i])
     

df_target_and_generated = pd.DataFrame()

df_target_and_generated['input'] = prompt_list
df_target_and_generated['target'] = target_list
df_target_and_generated['model_generated'] = model_generated

df_target_and_generated.to_csv('fine-tune-result.csv', index=False)

df_target_and_generated


Unnamed: 0,input,target,model_generated
0,Based on the customer profile and search histo...,"16,Guitar","44,Baseball Glove"
1,Based on the customer profile and search histo...,"19,Violin","44,Baseball Glove"
2,Based on the customer profile and search histo...,"30,Game Controller","44,Baseball Glove"
3,Based on the customer profile and search histo...,"7,Treadmill","44,Baseball Glove"
4,Based on the customer profile and search histo...,"44,Baseball Glove","44,Baseball Glove"
...,...,...,...
112,Based on the customer profile and search histo...,"29,VR Headset","44,Baseball Glove"
113,Based on the customer profile and search histo...,"15,Children's Book","44,Baseball Glove"
114,Based on the customer profile and search histo...,"45,Soccer Cleats","44,Baseball Glove"
115,Based on the customer profile and search histo...,"6,Dumbbells","44,Baseball Glove"


# Inferencing

In [30]:
model_inference = T5ForConditionalGeneration.from_pretrained(constants.LLM_FINE_TUNED_SAVE_PATH.split('/')[-1])

In [32]:
import torch

def generate_text(input_query, model, tokenizer, max_length=30):
    """
    Function to generate text using a fine-tuned T5 model.
    
    Parameters:
    - input_query (str): The input text for the model to process.
    - model: The pre-trained T5 model.
    - tokenizer: The tokenizer used for encoding the input and decoding the output.
    - max_length (int): Maximum length of the generated text.

    Returns:
    - str: The generated output text.
    """
    
    # Check if GPU is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Move model to the same device as the input
    model.to(device)
    
    # Tokenize the input query
    inputs = tokenizer(input_query, return_tensors="pt").to(device)
    
    # Set model to evaluation mode
    model.eval()
    
    # Disable gradient calculation during inference
    with torch.no_grad():
        # Generate text using the model
        outputs = model.generate(
            inputs['input_ids'], 
            max_length=max_length, 
            num_beams=5,  # You can adjust this for more diverse text
            early_stopping=True
        )
        
    # Decode the output tokens into text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return generated_text

# Example usage:
input_query = df_target_and_generated.iloc[0, 0]
generated_text = generate_text(input_query, model_inference, tokenizer)
generated_text

Token indices sequence length is longer than the specified maximum sequence length for this model (569 > 512). Running this sequence through the model will result in indexing errors


'18,Drums'