In [1]:
import kagglehub
import os

# Download latest version
path = kagglehub.dataset_download("saldenisov/recipenlg")

print("Path to dataset files:", path)


download_path = kagglehub.dataset_download("saldenisov/recipenlg")

# List all files and directories
for root, dirs, files in os.walk(download_path):
    for file in files:
        print(os.path.join(root, file))


Downloading from https://www.kaggle.com/api/v1/datasets/download/saldenisov/recipenlg?dataset_version_number=1...


100%|██████████| 638M/638M [00:16<00:00, 40.4MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/saldenisov/recipenlg/versions/1
/root/.cache/kagglehub/datasets/saldenisov/recipenlg/versions/1/dataset/full_dataset.csv


In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (
    GPT2Tokenizer, GPT2LMHeadModel,
    TFGPT2LMHeadModel,
    BartTokenizer, BartForConditionalGeneration,
    Trainer, TrainingArguments,
    TextDataset, DataCollatorForLanguageModeling,
    pipeline
)
import torch
import tensorflow as tf
import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Set dataset path
DATASET_PATH = "/kaggle/input/recipenlg/dataset/"


# Load the dataset
def load_data(path):
    # Read the CSV file
    df = pd.read_csv(os.path.join(path, "full_dataset.csv"))

    # Select and rename the required columns
    df = df[['title', 'NER', 'directions']].dropna()
    df = df.rename(columns={
        'NER': 'ingredients',
        'directions': 'instructions'
    })

    # Construct the prompt for model input
    df['prompt'] = "Title: " + df['title'] + "; Ingredients: " + df['ingredients']

    # Split into training and validation sets
    return train_test_split(df, test_size=0.1, random_state=42)


train_df, val_df = load_data(DATASET_PATH)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/recipenlg/dataset/full_dataset.csv'

In [None]:
# ========== GPT-2 (PyTorch) ==========

def prepare_gpt2_data(train_texts, val_texts, tokenizer, max_length=512):
    def save_to_file(data, filename):
        with open(filename, "w") as f:
            for line in data:
                f.write(line + "\n")

    train_file = "gpt2_train.txt"
    val_file = "gpt2_val.txt"
    save_to_file(train_texts, train_file)
    save_to_file(val_texts, val_file)

    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_file,
        block_size=max_length
    )
    val_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=val_file,
        block_size=max_length
    )
    return train_dataset, val_dataset

def train_gpt2_pytorch():
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    train_texts = (train_df["prompt"] + "\n" + train_df["instructions"]).tolist()
    val_texts = (val_df["prompt"] + "\n" + val_df["instructions"]).tolist()

    train_dataset, val_dataset = prepare_gpt2_data(train_texts, val_texts, tokenizer)

    training_args = TrainingArguments(
        output_dir="./gpt2-pytorch-checkpoints",
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        save_steps=1000,
        save_total_limit=2,
        evaluation_strategy="epoch"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    )

    trainer.train()
    return model, tokenizer

In [None]:
# ========== GPT-2 (TensorFlow) ==========

def train_gpt2_tensorflow():
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token
    model = TFGPT2LMHeadModel.from_pretrained("gpt2")

    inputs = tokenizer(
        (train_df["prompt"] + "\n" + train_df["instructions"]).tolist(),
        return_tensors="tf",
        padding=True,
        truncation=True,
        max_length=512
    )

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5))
    model.fit(inputs["input_ids"], inputs["input_ids"], epochs=3, batch_size=4)

    return model, tokenizer

In [None]:
# ========== BART (PyTorch) ==========

def train_bart():
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

    inputs = tokenizer(
        train_df["prompt"].tolist(),
        max_length=256,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    targets = tokenizer(
        train_df["instructions"].tolist(),
        max_length=256,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    class RecipeDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, targets):
            self.encodings = encodings
            self.targets = targets

        def __len__(self):
            return len(self.encodings.input_ids)

        def __getitem__(self, idx):
            return {
                "input_ids": self.encodings.input_ids[idx],
                "attention_mask": self.encodings.attention_mask[idx],
                "labels": self.targets.input_ids[idx]
            }

    dataset = RecipeDataset(inputs, targets)

    training_args = TrainingArguments(
        output_dir="./bart-checkpoints",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        logging_steps=10,
        save_steps=1000,
        evaluation_strategy="no"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset
    )

    trainer.train()
    return model, tokenizer

In [None]:
# IN TESTING

def evaluate_bart_bleu(model, tokenizer, val_df, num_samples=10):
    smoothing = SmoothingFunction().method1
    bleu_scores = []

    for i in range(num_samples):
        input_text = val_df["prompt"].iloc[i]
        reference = val_df["instructions"].iloc[i]

        # Encode the input and generate prediction
        inputs = tokenizer(input_text, return_tensors="pt", max_length=256, truncation=True)
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=256,
            num_beams=4,
            early_stopping=True
        )

        # Decode the output
        predicted = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # BLEU score
        reference_tokens = [reference.lower().split()]
        candidate_tokens = predicted.lower().split()
        score = sentence_bleu(reference_tokens, candidate_tokens, smoothing_function=smoothing)
        bleu_scores.append(score)

        print(f"\nExample {i+1}")
        print("Prompt:", input_text)
        print("Reference:", reference)
        print("Generated:", predicted)
        print(f"BLEU Score: {round(score, 4)}")

    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    print(f"\nAverage BLEU Score over {num_samples} samples: {round(avg_bleu, 4)}")

model, tokenizer = train_bart()
evaluate_bart_bleu(model, tokenizer, val_df, num_samples=10)

In [None]:
# IN TESTING

def evaluate_gpt2_bleu(model, tokenizer, val_df, num_samples=10):
    model.eval()
    smoothing = SmoothingFunction().method1
    bleu_scores = []

    for i in range(num_samples):
        prompt = val_df["prompt"].iloc[i]
        reference = val_df["instructions"].iloc[i]

        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        output = model.generate(
            input_ids=input_ids,
            max_length=150,
            num_beams=4,
            early_stopping=True
        )

        predicted = tokenizer.decode(output[0], skip_special_tokens=True)

        reference_tokens = [reference.lower().split()]
        candidate_tokens = predicted.lower().split()
        score = sentence_bleu(reference_tokens, candidate_tokens, smoothing_function=smoothing)
        bleu_scores.append(score)

        print(f"\nExample {i+1}")
        print("Prompt:", prompt)
        print("Reference:", reference)
        print("Generated:", predicted)
        print(f"BLEU Score: {round(score, 4)}")

    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    print(f"\nAverage BLEU Score over {num_samples} samples: {round(avg_bleu, 4)}")

model, tokenizer = train_gpt2_pytorch()
evaluate_gpt2_bleu(model, tokenizer, val_df, num_samples=10)

In [None]:
def evaluate_gpt2_tensorflow_bleu(model, tokenizer, val_df, num_samples=10):
    smoothing = SmoothingFunction().method1
    bleu_scores = []

    for i in range(num_samples):
        prompt = val_df["prompt"].iloc[i]
        reference = val_df["instructions"].iloc[i]

        # Encode the prompt
        input_ids = tokenizer.encode(prompt, return_tensors="tf", truncation=True, max_length=512)

        # Generate text from model
        output = model.generate(
            input_ids=input_ids,
            max_length=150,
            num_beams=4,
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id
        )

        # Decode the prediction
        predicted = tokenizer.decode(output[0], skip_special_tokens=True)

        # Tokenize and compute BLEU
        reference_tokens = [reference.lower().split()]
        candidate_tokens = predicted.lower().split()
        score = sentence_bleu(reference_tokens, candidate_tokens, smoothing_function=smoothing)
        bleu_scores.append(score)

        print(f"\nExample {i+1}")
        print("Prompt:", prompt)
        print("Reference:", reference)
        print("Generated:", predicted)
        print(f"BLEU Score: {round(score, 4)}")

    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    print(f"\nAverage BLEU Score over {num_samples} samples: {round(avg_bleu, 4)}")

model, tokenizer = train_gpt2_tensorflow()
evaluate_gpt2_tensorflow_bleu(model, tokenizer, val_df, num_samples=10)

In [None]:
# Choose which model to run
if __name__ == "__main__":
    model_type = "gpt2_pt"  # Change to "gpt2_tf" or "bart"

    if model_type == "gpt2_pt":
        model, tokenizer = train_gpt2_pytorch()
        evaluate_gpt2_bleu(model, tokenizer, val_df)
    elif model_type == "gpt2_tf":
        model, tokenizer = train_gpt2_tensorflow()
        evaluate_gpt2_tensorflow_bleu(model, tokenizer, val_df)
    elif model_type == "bart":
        model, tokenizer = train_bart()
        evaluate_bart_bleu(model, tokenizer, val_df)