In [1]:
import torch
print(torch.cuda.is_available())

True


In [109]:
import sys
from pathlib import Path

# Use the current working directory as a reference for the parent directory
current_directory = Path.cwd()
parent_directory = current_directory.parent

# Add parent directory to sys.path
sys.path.append(str(parent_directory))

# Import the constants module
import constants

In [110]:
import pandas as pd
from sklearn.model_selection import train_test_split


In [111]:
df = pd.read_csv('./data.csv')
df.output = df.output.astype('str')
df

Unnamed: 0,input,output
0,Based on the customer profile and search histo...,"50,Air Fryer"
1,Based on the customer profile and search histo...,"11,Novel Book"
2,Based on the customer profile and search histo...,"42,Basketball"
3,Based on the customer profile and search histo...,"38,Travel Pillow"
4,Based on the customer profile and search histo...,"11,Novel Book"
...,...,...
581,Based on the customer profile and search histo...,"40,Travel Adapter"
582,Based on the customer profile and search histo...,"17,Piano"
583,Based on the customer profile and search histo...,"3,Smartwatch"
584,Based on the customer profile and search histo...,"1,Smartphone"


In [112]:
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=42)
(train_df.shape, valid_df.shape)

((468, 2), (118, 2))

In [113]:
# Convert dataframes to Dataset objects (for use in Hugging Face model)
import datasets
from datasets import Dataset

dataset_train = Dataset.from_pandas(train_df)
dataset_eval = Dataset.from_pandas(valid_df)
    
data_dict_dataset = datasets.DatasetDict({"train": dataset_train, "eval": dataset_eval})
data_dict_dataset
     


DatasetDict({
    train: Dataset({
        features: ['input', 'output', '__index_level_0__'],
        num_rows: 468
    })
    eval: Dataset({
        features: ['input', 'output', '__index_level_0__'],
        num_rows: 118
    })
})

In [114]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# tokenizer = AutoTokenizer.from_pretrained("t5-small")
# model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

MODEL_NAME = constants.LLM_NAME

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [115]:
max_input_length = tokenizer.model_max_length
max_target_length = 30  # Adjust as needed

def preprocess_function(examples):
    # Tokenize inputs (e.g., customer profile and search history)
    inputs = [doc for doc in examples["input"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding=True)

    # Tokenize targets (e.g., recommended product)
    targets = [doc for doc in examples["output"]]
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True, padding=True)

    # Assign the tokenized labels to model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Test preprocessing on first 2 rows
preprocess_function(data_dict_dataset["train"][:2])


{'input_ids': [[6719, 30, 8, 884, 3278, 11, 960, 892, 6, 1568, 8, 167, 2193, 556, 28, 8, 1910, 10, 843, 834, 23, 26, 6, 15892, 834, 4350, 45, 205, 9853, 26483, 21254, 5652, 4083, 6657, 329, 14920, 8015, 666, 10, 7327, 14226, 10, 27, 333, 3875, 11, 4832, 5, 4769, 5528, 10, 25727, 7, 6, 434, 35, 7, 6, 371, 32, 32, 17, 3184, 6, 19310, 11410, 6, 254, 9, 935, 9, 6, 19310, 11410, 6, 371, 32, 32, 17, 3184, 6, 254, 9, 935, 9, 6, 14561, 782, 6, 3881, 32, 1765, 412, 324, 7, 1558, 6, 27529, 11664, 53, 4300, 6, 254, 9, 935, 9, 6, 3881, 32, 1765, 13581, 6, 134, 6174, 15, 7, 6, 19310, 11410, 6, 3881, 1825, 2567, 6, 134, 6174, 15, 7, 6, 3174, 17, 2455, 21957, 6, 134, 6174, 15, 7, 205, 9853, 26483, 21254, 5652, 4083, 6657, 329, 14920, 8015, 10, 3, 2, 3628, 10, 3, 31, 14885, 15, 3184, 9840, 162, 31, 6, 943, 10, 3, 31, 20162, 6248, 7975, 31, 2, 1], [6719, 30, 8, 884, 3278, 11, 960, 892, 6, 1568, 8, 167, 2193, 556, 28, 8, 1910, 10, 843, 834, 23, 26, 6, 15892, 834, 4350, 45, 205, 9853, 26483, 21254, 5652,

In [116]:
# Tokenize train and eval datasets
tokenized_datasets = data_dict_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 468/468 [00:00<00:00, 2051.21 examples/s]
Map: 100%|██████████| 118/118 [00:00<00:00, 2585.30 examples/s]


In [117]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input', 'output', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 468
    })
    eval: Dataset({
        features: ['input', 'output', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 118
    })
})

In [118]:
# Instantiate Data Loader for train and eval sets
# Adjust batch sizes as necessary

from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=10, collate_fn=data_collator
)

eval_dataloader = DataLoader(
    tokenized_datasets["eval"], batch_size=10, collate_fn=data_collator
)
     

len(train_dataloader)
     


47

# Fine tuning

In [123]:
### Select Optimizer (for regularization)

from transformers import get_scheduler
from torch.optim import AdamW

learning_rate = 1e-4
optimizer = AdamW(model.parameters(), lr=learning_rate)

num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

235


In [124]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

# Instantiate training arguments object
batch_size = 4
args = Seq2SeqTrainingArguments(
    "./t5_recommendation",
    # push_to_hub=True, # Comment out if you don't want to push to Hugging Face Hub
    eval_strategy = "epoch",
    learning_rate = 1e-4,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    weight_decay = 0.01,
    save_total_limit = 3,
    num_train_epochs = 10, # Try 5-10 epochs; results may vary
    predict_with_generate = True,
    gradient_accumulation_steps = 4,
    eval_accumulation_steps = 4,
)
   

In [125]:
import evaluate  # Import the 'evaluate' library instead of 'datasets'

# Load your evaluation metric (e.g., ROUGE, BLEU, etc.)
metric = evaluate.load("rouge")  # Example: for ROUGE
metric

EvaluationModule(name: "rouge", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id=None)}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLsum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/

In [126]:
# Functions for further preprocessing and metrics computation
import numpy as np

def postprocess_text(preds, labels):
  preds = [pred.strip() for pred in preds]
  labels = [[label.strip()] for label in labels]

  return preds, labels

def compute_metrics(eval_preds):
  preds, labels = eval_preds
  if isinstance(preds, tuple):
    preds = preds[0]
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

  # Replace -100 in the labes as we can't decode them.
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Some simple post processing
  decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

  result = metric.compute(predictions = decoded_preds, references = decoded_labels)
  # result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
  result = {key: value * 100 for key, value in result.items()}

  prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
  result["gen_len"] = np.mean(prediction_lens)

  return result

In [127]:
# Instantiate Trainer object (for fine-tuning)
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset = tokenized_datasets["train"],
    eval_dataset = tokenized_datasets["eval"],
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
    optimizers = (optimizer, lr_scheduler)
)

  trainer = Seq2SeqTrainer(


In [128]:
# Train time should take a few minutes or less if on GPU
# Can take up to several hours if on CPU
trainer.train()

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 3.8249013423919678, 'eval_rouge1': 23.72881355932203, 'eval_rouge2': 22.88135593220339, 'eval_rougeL': 23.870056497175142, 'eval_rougeLsum': 23.87005649717514, 'eval_gen_len': 7.593220338983051, 'eval_runtime': 4.6936, 'eval_samples_per_second': 25.141, 'eval_steps_per_second': 6.392, 'epoch': 0.99}


 20%|██        | 58/290 [00:29<01:36,  2.39it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Traine

{'eval_loss': 1.7276169061660767, 'eval_rouge1': 20.197740112994353, 'eval_rouge2': 19.491525423728813, 'eval_rougeL': 20.197740112994353, 'eval_rougeLsum': 20.480225988700564, 'eval_gen_len': 6.52542372881356, 'eval_runtime': 4.8581, 'eval_samples_per_second': 24.289, 'eval_steps_per_second': 6.175, 'epoch': 1.98}


 30%|███       | 87/290 [00:46<01:25,  2.36it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Traine

{'eval_loss': 1.3704434633255005, 'eval_rouge1': 20.62146892655367, 'eval_rouge2': 20.33898305084746, 'eval_rougeL': 20.621468926553675, 'eval_rougeLsum': 20.621468926553675, 'eval_gen_len': 6.5508474576271185, 'eval_runtime': 4.9171, 'eval_samples_per_second': 23.998, 'eval_steps_per_second': 6.101, 'epoch': 2.97}


 40%|████      | 117/290 [01:04<01:15,  2.31it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Train

{'eval_loss': 1.0569559335708618, 'eval_rouge1': 17.231638418079097, 'eval_rouge2': 16.94915254237288, 'eval_rougeL': 17.231638418079097, 'eval_rougeLsum': 17.231638418079097, 'eval_gen_len': 6.559322033898305, 'eval_runtime': 4.9317, 'eval_samples_per_second': 23.927, 'eval_steps_per_second': 6.083, 'epoch': 4.0}


 50%|█████     | 146/290 [01:21<01:00,  2.37it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Train

{'eval_loss': 0.8441233038902283, 'eval_rouge1': 16.38418079096045, 'eval_rouge2': 16.101694915254235, 'eval_rougeL': 16.384180790960453, 'eval_rougeLsum': 16.38418079096045, 'eval_gen_len': 6.52542372881356, 'eval_runtime': 4.9974, 'eval_samples_per_second': 23.612, 'eval_steps_per_second': 6.003, 'epoch': 4.99}


 60%|██████    | 175/290 [01:39<00:49,  2.32it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Train

{'eval_loss': 0.6933839321136475, 'eval_rouge1': 16.38418079096045, 'eval_rouge2': 16.101694915254235, 'eval_rougeL': 16.384180790960453, 'eval_rougeLsum': 16.38418079096045, 'eval_gen_len': 6.508474576271187, 'eval_runtime': 5.0523, 'eval_samples_per_second': 23.356, 'eval_steps_per_second': 5.938, 'epoch': 5.98}


 70%|███████   | 204/290 [01:56<00:37,  2.32it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Train

{'eval_loss': 0.6258186101913452, 'eval_rouge1': 18.079096045197744, 'eval_rouge2': 17.796610169491526, 'eval_rougeL': 18.079096045197744, 'eval_rougeLsum': 18.07909604519774, 'eval_gen_len': 6.559322033898305, 'eval_runtime': 4.9821, 'eval_samples_per_second': 23.685, 'eval_steps_per_second': 6.022, 'epoch': 6.97}


 81%|████████  | 234/290 [02:14<00:23,  2.37it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Train

{'eval_loss': 0.5981793999671936, 'eval_rouge1': 16.38418079096045, 'eval_rouge2': 16.101694915254235, 'eval_rougeL': 16.384180790960453, 'eval_rougeLsum': 16.38418079096045, 'eval_gen_len': 6.5423728813559325, 'eval_runtime': 5.1257, 'eval_samples_per_second': 23.021, 'eval_steps_per_second': 5.853, 'epoch': 8.0}


 91%|█████████ | 263/290 [02:31<00:11,  2.31it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Train

{'eval_loss': 0.598124086856842, 'eval_rouge1': 16.38418079096045, 'eval_rouge2': 16.101694915254235, 'eval_rougeL': 16.384180790960453, 'eval_rougeLsum': 16.38418079096045, 'eval_gen_len': 6.5423728813559325, 'eval_runtime': 5.0807, 'eval_samples_per_second': 23.225, 'eval_steps_per_second': 5.905, 'epoch': 8.99}


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

{'eval_loss': 0.598124086856842, 'eval_rouge1': 16.38418079096045, 'eval_rouge2': 16.101694915254235, 'eval_rougeL': 16.384180790960453, 'eval_rougeLsum': 16.38418079096045, 'eval_gen_len': 6.5423728813559325, 'eval_runtime': 4.9372, 'eval_samples_per_second': 23.9, 'eval_steps_per_second': 6.076, 'epoch': 9.91}
{'train_runtime': 174.0652, 'train_samples_per_second': 26.886, 'train_steps_per_second': 1.666, 'train_loss': 2.250500909213362, 'epoch': 9.91}





TrainOutput(global_step=290, training_loss=2.250500909213362, metrics={'train_runtime': 174.0652, 'train_samples_per_second': 26.886, 'train_steps_per_second': 1.666, 'total_flos': 862531941826560.0, 'train_loss': 2.250500909213362, 'epoch': 9.914529914529915})

In [137]:
# Save the trained model
model.save_pretrained(constants.LLM_FINE_TUNED_SAVE_PATH.split('/')[-1])

In [138]:
# Save the tokenizer (vocab, special tokens, etc.)
tokenizer.save_pretrained(constants.LLM_FINE_TUNED_TOKENIZER_PATH.split('/')[-1])

('fine-tuned-tokenizer\\tokenizer_config.json',
 'fine-tuned-tokenizer\\special_tokens_map.json',
 'fine-tuned-tokenizer\\spiece.model',
 'fine-tuned-tokenizer\\added_tokens.json')

# Collect evaluation data predictions

In [129]:
%%time
# Try predictions on validation set for confirmation
predictions = trainer.predict(tokenized_datasets["eval"])

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  0%|          | 0/30 [00:00<?, ?it/s]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenize

CPU times: total: 1.69 s
Wall time: 5.36 s





In [130]:
predictions

PredictionOutput(predictions=array([[    0, 13597, 15800, ...,     0,     0,     0],
       [    0,  3479,     6, ...,     0,     0,     0],
       [    0,  8838,     6, ...,     0,     0,     0],
       ...,
       [    0,  8838,     6, ...,     0,     0,     0],
       [    0,  6180, 24656, ...,     0,     0,     0],
       [    0, 13597, 15800, ...,     0,     0,     0]]), label_ids=array([[14141,     4,  2689, ...,     0,     0,     0],
       [ 6464, 13601,    15, ...,     0,     0,     0],
       [ 6180, 24656,  9237, ...,     0,     0,     0],
       ...,
       [ 8537,     6, 14885, ...,     0,     0,     0],
       [ 8580,   308,  5937, ...,     0,     0,     0],
       [14141,     4,  2689, ...,     0,     0,     0]]), metrics={'test_loss': 0.598124086856842, 'test_rouge1': 16.38418079096045, 'test_rouge2': 16.101694915254235, 'test_rougeL': 16.384180790960453, 'test_rougeLsum': 16.38418079096045, 'test_gen_len': 6.5423728813559325, 'test_runtime': 5.3556, 'test_samples_per_s

In [133]:
# Convert tokens from data to text
def translate(tokens):
  my_list = tokenizer.convert_ids_to_tokens(tokens)
  new_list = [token for token in my_list if ('<' not in token)] # token != '' and token != ''
  new_string = ''.join(new_list)
  new_string = new_string.replace("▁", " ")
  # new_string = new_string.replace("_", " ")
  new_string = new_string.strip()
  return new_string
     

# Print sample predicted output
index = 30
print(tokenized_datasets["eval"]["input"][index])
print("Target product: ", tokenized_datasets["eval"]["output"][index])
print("Recommended product: ", translate(predictions.predictions[index]))
     


Based on the customer profile and search history, recommend the most relevant product with the format: key_id,product_name from CANDIDATES FOR RECOMMENDATION below:
Customer Profile: I love photography and gaming.
Search History: PC Gaming,Photo Editing Software,Wireless Earbuds,Gaming Mouse,Tripod,Photo Editing Software,Gaming Mouse,Gaming Console,Lens,Video Games,Gaming Headset
CANDIDATES FOR RECOMMENDATION: {2: 'Laptop', 26: 'PlayStation 5', 27: 'Xbox Series X', 28: 'Gaming Chair', 29: 'VR Headset', 30: 'Game Controller', 44: 'Baseball Glove'}

Target product:  28,Gaming Chair
Recommended product:  28,Gaming Chair


In [134]:
# Collect generated outputs and join with prompts and targets
model_generated = []
prompt_list = []
target_list = []

for i in range(len(predictions.predictions)):
  model_generated.append(translate(predictions.predictions[i]))

  prompt_list.append(dataset_eval['input'][i])
  target_list.append(dataset_eval['output'][i])
     

df_target_and_generated = pd.DataFrame()

df_target_and_generated['input'] = prompt_list
df_target_and_generated['target'] = target_list
df_target_and_generated['model_generated'] = model_generated

df_target_and_generated.to_csv('fine-tune-result.csv', index=False)

df_target_and_generated


Unnamed: 0,input,target,model_generated
0,Based on the customer profile and search histo...,"27,Xbox Series X","26,PlayStation 5"
1,Based on the customer profile and search histo...,"4,Wireless Earbuds","45,Baseball Glove"
2,Based on the customer profile and search histo...,"3,Smartwatch","43,Tennis Racket"
3,Based on the customer profile and search histo...,"23,Watch","43,Basketball"
4,Based on the customer profile and search histo...,"22,Sneakers","45,Soccer Cleats"
...,...,...,...
113,Based on the customer profile and search histo...,"44,Baseball Glove","44,Baseball Glove"
114,Based on the customer profile and search histo...,"26,PlayStation 5","28,Gaming Chair"
115,Based on the customer profile and search histo...,"44,Baseball Glove","43,Tennis Racket"
116,Based on the customer profile and search histo...,"6,Dumbbells","3,Smartwatch"


# Inferencing

In [139]:
model_inference = T5ForConditionalGeneration.from_pretrained(constants.LLM_FINE_TUNED_SAVE_PATH.split('/')[-1])

In [140]:
def generate_text(input_query, model, tokenizer, max_length=30):
    """
    Function to generate text using a fine-tuned T5 model.
    
    Parameters:
    - input_query (str): The input text for the model to process.
    - model: The pre-trained T5 model.
    - tokenizer: The tokenizer used for encoding the input and decoding the output.
    - max_length (int): Maximum length of the generated text.

    Returns:
    - str: The generated output text.
    """
    
    # Check if GPU is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Move model to the same device as the input
    model.to(device)
    
    # Tokenize the input query
    inputs = tokenizer(input_query, return_tensors="pt").to(device)
    
    # Set model to evaluation mode
    model.eval()
    
    # Disable gradient calculation during inference
    with torch.no_grad():
        # Generate text using the model
        outputs = model.generate(
            inputs['input_ids'], 
            max_length=max_length, 
            num_beams=5,  # You can adjust this for more diverse text
            early_stopping=True
        )
        
    # Decode the output tokens into text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return generated_text

# Example usage:
input_query = df_target_and_generated.iloc[0, 0]
generated_text = generate_text(input_query, model_inference, tokenizer)
generated_text

'26,PlayStation 5'