In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install transformers datasets sentencepiece

In [21]:
!pip install datasets rouge_score sacrebleu evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [3]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import torchvision
import torchvision.transforms as transforms
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer
from transformers import T5ForConditionalGeneration

In [None]:
dataset_path = '/kaggle/input/recipenlg/RecipeNLG_dataset.csv'
df = pd.read_csv(dataset_path)
df.head()

# Data Sampling

In [None]:
# Randomly sample 10,000 rows from the train dataset
sampled_data = df.sample(n=50000, random_state=42)

# Save the sampled dataset
sampled_data.to_csv("sampled_train.csv", index=False)


In [None]:
#compare stringified lists to python lists
import ast
sampled_data['ingredients'] = sampled_data['ingredients'].apply(ast.literal_eval)
sampled_data['directions'] = sampled_data['directions'].apply(ast.literal_eval)

In [None]:
#create input output pairs
sampled_data['input'] = sampled_data['ingredients'].apply(lambda x: "Ingredients: " + ", ".join(x))
sampled_data['output'] = sampled_data['directions'].apply(lambda x: " ".join(x))

In [None]:
train_df, test_df = train_test_split(sampled_data,test_size=0.2,random_state=42)

In [None]:
train_df.columns

In [None]:
from datasets import Dataset

# Convert train and test DataFrames to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenization

In [9]:
#tokenizing the dataset
tokenizer = T5Tokenizer.from_pretrained("t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
def tokenize_batch(batch):
  input_text = batch['input']
  output_text = batch['output']
  tokenized_input = tokenizer(input_text, padding='max_length',truncation=True, max_length=512)
  tokenized_output = tokenizer(output_text, padding='max_length',truncation=True, max_length=512)
  tokenized_input['labels'] = tokenized_output['input_ids']
  return tokenized_input

In [None]:
train_tokenized = train_dataset.map(tokenize_batch, batched=True)
test_tokenized = test_dataset.map(tokenize_batch, batched=True)

train_tokenized.save_to_disk("tokenized_train_dataset")
test_tokenized.save_to_disk("tokenized_test_dataset")

In [None]:
print(train_dataset.column_names)
print(test_dataset.column_names)

In [2]:
from datasets import load_from_disk

# Load tokenized dataset
train_tokenized = load_from_disk("tokenized_train_dataset")
test_tokenized = load_from_disk("tokenized_test_dataset")

# Dataloader to load the data

In [4]:
batch_size = 8
def collate_fn(batch):
    # Ensure every element in batch is converted to tensor, but ignore strings (such as tokenized words)
    #print(f"First batch item: {batch[0]}")
    #return {key: torch.tensor([d[key] for d in batch if isinstance(d[key], (list, int))]) for key in batch[0]}
    return {
        'input_ids': torch.tensor([d['input_ids'] for d in batch]),
        'attention_mask': torch.tensor([d['attention_mask'] for d in batch]),
        'labels': torch.tensor([d['labels'] for d in batch]),
    }

# Create DataLoaders
train_dataloader = DataLoader(
    train_tokenized, batch_size=4, shuffle=True, collate_fn=collate_fn
)

test_dataloader = DataLoader(
    test_tokenized, batch_size=4, shuffle=False, collate_fn=collate_fn
)

In [5]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")
optimizer = torch.optim.AdamW(model.parameters(),lr=1e-4)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Fine tuning the t5 model 

In [7]:
train_loss = []
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch in train_dataloader:
        optimizer.zero_grad()
        batch = {key: val.to(device) for key, val in batch.items()}
        outputs = model(
            input_ids = batch['input_ids'],
            attention_mask = batch['attention_mask'],
            labels = batch['labels']
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    avg_train_loss=running_loss/len(train_dataloader)
    train_loss.append(avg_train_loss)
    print(f'Epoch {epoch+1}/{num_epochs} => Train Loss: {avg_train_loss:.4f}')

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1/5 => Train Loss: 0.7226
Epoch 2/5 => Train Loss: 0.6146
Epoch 3/5 => Train Loss: 0.5829
Epoch 4/5 => Train Loss: 0.5625
Epoch 5/5 => Train Loss: 0.5471


In [10]:
model.save_pretrained("/kaggle/working/fine_tuned_t5")
tokenizer.save_pretrained("/kaggle/working/fine_tuned_t5")

('/kaggle/working/fine_tuned_t5/tokenizer_config.json',
 '/kaggle/working/fine_tuned_t5/special_tokens_map.json',
 '/kaggle/working/fine_tuned_t5/spiece.model',
 '/kaggle/working/fine_tuned_t5/added_tokens.json')

# Generating Recipes

In [15]:
def generate_recipe(ingredients):
    input_text = f"Ingredients: { ', '.join(ingredients)}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(input_ids, max_length=512, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [16]:
ingredients = ['1 dozen eggs', '1 cup flour', '1 gallon milk', '1 cup sugar', '1 oz baking powder']
print(generate_recipe(ingredients))

Combine eggs, milk, sugar and baking powder in a bowl. Mix well. Pour into a greased 13 x 9-inch pan. Bake at 350 degrees for 30 minutes.


# Evaluating the model

In [27]:
#from tdqm import tdqm
def model_evaluate(test_dataloader,model,device,tokenizer):
    model.eval()
    predictions=[]
    references=[]
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        reference_texts = tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)
        with torch.no_grad():
            outputs = model.generate(input_ids,attention_mask=attention_mask, max_length=512, num_beams=4, early_stopping=True)
        generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        predictions.extend(generated_texts)
        references.extend(reference_texts)
    return predictions,references

In [23]:
import evaluate
bleu_metric = evaluate.load('sacrebleu')
rouge_metric = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [24]:
def compute_metrics(predictions,references):
    bleu_score = bleu_metric.compute(predictions=predictions, references=[[ref] for ref in references])
    rouge_result = rouge_metric.compute(predictions=predictions, references=references)
    
    return {"bleu": bleu_score['score'], "rouge": rouge_result}

In [28]:
predictions,references = model_evaluate(test_dataloader,model,device,tokenizer)
metrics = compute_metrics(predictions, references)
print(f"BLEU Score: {metrics['bleu']}")
print(f"ROUGE Scores: {metrics['rouge']}")

BLEU Score: 7.390968610206515
ROUGE Scores: {'rouge1': 0.34327741972707637, 'rouge2': 0.12411536781680829, 'rougeL': 0.251304755136973, 'rougeLsum': 0.25135184546962785}


# Downloading the model

In [32]:
import shutil
from IPython.display import FileLink
shutil.make_archive('/kaggle/working/fine_tuned_t5','zip','/kaggle/working/fine_tuned_t5')
from IPython.display import FileLink

# Create a download link for the zipped model
FileLink("/kaggle/working/fine_tuned_t5.zip")
