In [None]:
import kagglehub
import os

# Download the latest version of the dataset once. The returned directory is
# kept under both names this cell exposes (`path` and `download_path`) instead
# of calling kagglehub a second time for the same dataset.
path = kagglehub.dataset_download("saldenisov/recipenlg")
download_path = path

print("Path to dataset files:", path)

# List every file found under the download directory
for root, dirs, files in os.walk(download_path):
    for file in files:
        print(os.path.join(root, file))


Path to dataset files: /kaggle/input/recipenlg
/kaggle/input/recipenlg/dataset/full_dataset.csv


In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (
    GPT2Tokenizer, GPT2LMHeadModel,
    TFGPT2LMHeadModel,
    BartTokenizer, BartForConditionalGeneration,
    Trainer, TrainingArguments,
    TextDataset, DataCollatorForLanguageModeling,
    pipeline
)
import torch
import tensorflow as tf

# Directory that contains full_dataset.csv.
# Defaults to the Kaggle input mount. To run somewhere else (for example
# against the kagglehub cache under
# /root/.cache/kagglehub/datasets/saldenisov/recipenlg/versions/1/dataset/),
# set the RECIPENLG_DATASET_PATH environment variable instead of editing
# this cell.
DATASET_PATH = os.environ.get(
    "RECIPENLG_DATASET_PATH",
    "/kaggle/input/recipenlg/dataset/",
)

# Load the dataset
def load_data(path, test_size=0.1, random_state=42, nrows=None):
    """Load the recipe CSV, build model prompts and split into train/validation.

    Args:
        path: Directory that contains ``full_dataset.csv``.
        test_size: Fraction (or absolute number) of rows held out for
            validation; forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible.
        nrows: Optional cap on the number of CSV rows to read, handy for quick
            experiments. ``None`` reads the whole file.

    Returns:
        The ``[train_df, val_df]`` pair produced by ``train_test_split``. Both
        frames carry the columns ``title``, ``ingredients``, ``instructions``
        and ``prompt``.
    """
    wanted_columns = ['title', 'NER', 'directions']

    # Only these three columns are ever used, so skip parsing anything else
    # the CSV may contain; this keeps memory use down on a large file.
    df = pd.read_csv(
        os.path.join(path, "full_dataset.csv"),
        usecols=wanted_columns,
        nrows=nrows,
    )

    # Re-select to fix the column order (usecols keeps the file's order),
    # drop incomplete rows, then rename to the names used downstream.
    df = df[wanted_columns].dropna()
    df = df.rename(columns={
        'NER': 'ingredients',
        'directions': 'instructions'
    })

    # Construct the prompt for model input
    df['prompt'] = "Title: " + df['title'] + "; Ingredients: " + df['ingredients']

    # Split into training and validation sets
    return train_test_split(df, test_size=test_size, random_state=random_state)


# Build the module-level train/validation frames once; the training functions
# defined in the following cells read `train_df` / `val_df` as globals.
train_df, val_df = load_data(DATASET_PATH)

In [None]:
# ========== GPT-2 (PyTorch) ==========

def prepare_gpt2_data(train_texts, val_texts, tokenizer, max_length=512):
    """Write the texts to disk and wrap them in ``TextDataset`` objects.

    Args:
        train_texts: Iterable of training strings; each is written followed by
            a newline to ``gpt2_train.txt``.
        val_texts: Iterable of validation strings, written the same way to
            ``gpt2_val.txt``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        max_length: Passed to ``TextDataset`` as ``block_size``.

    Returns:
        ``(train_dataset, val_dataset)``.
    """
    def save_to_file(data, filename):
        # Write as UTF-8 explicitly (the encoding TextDataset is expected to
        # read) so the result does not depend on the platform's default
        # encoding.
        with open(filename, "w", encoding="utf-8") as f:
            for line in data:
                f.write(line + "\n")

    train_file = "gpt2_train.txt"
    val_file = "gpt2_val.txt"
    save_to_file(train_texts, train_file)
    save_to_file(val_texts, val_file)

    # The text files are regenerated on every call, so any tokenized-feature
    # cache TextDataset left next to them from an earlier run may be stale.
    # overwrite_cache=True forces it to re-tokenize the files just written.
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_file,
        block_size=max_length,
        overwrite_cache=True
    )
    val_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=val_file,
        block_size=max_length,
        overwrite_cache=True
    )
    return train_dataset, val_dataset

def train_gpt2_pytorch():
    """Fine-tune GPT-2 (PyTorch) on the prompt + instructions text.

    Reads the module-level ``train_df`` and ``val_df`` frames, joins each
    row's ``prompt`` and ``instructions`` with a newline, and trains with the
    Hugging Face ``Trainer``, evaluating once per epoch.

    Returns:
        ``(model, tokenizer)`` after training.
    """
    import inspect  # local import: only needed for the keyword check below

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # GPT-2 ships without a pad token; reuse EOS so batches can be padded.
    tokenizer.pad_token = tokenizer.eos_token
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    train_texts = (train_df["prompt"] + "\n" + train_df["instructions"]).tolist()
    val_texts = (val_df["prompt"] + "\n" + val_df["instructions"]).tolist()

    train_dataset, val_dataset = prepare_gpt2_data(train_texts, val_texts, tokenizer)

    # `evaluation_strategy` was renamed to `eval_strategy` in newer
    # transformers releases. Use whichever keyword the installed version
    # accepts so this cell runs on both.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    strategy_kwarg = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"

    training_args = TrainingArguments(
        output_dir="./gpt2-pytorch-checkpoints",
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        save_steps=1000,
        save_total_limit=2,
        **{strategy_kwarg: "epoch"}
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        # mlm=False selects causal-LM collation rather than masked-LM.
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    )

    trainer.train()
    return model, tokenizer

In [None]:
# ========== GPT-2 (TensorFlow) ==========

def train_gpt2_tensorflow():
    """Fine-tune GPT-2 (TensorFlow/Keras) on the prompt + instructions text.

    Reads the module-level ``train_df`` frame, tokenizes every
    ``prompt + "\\n" + instructions`` string (padded to the longest sequence,
    truncated at 512 tokens) and trains with ``model.fit``.

    Returns:
        ``(model, tokenizer)`` after training.
    """
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # GPT-2 ships without a pad token; reuse EOS so sequences can be padded.
    tokenizer.pad_token = tokenizer.eos_token
    model = TFGPT2LMHeadModel.from_pretrained("gpt2")

    inputs = tokenizer(
        (train_df["prompt"] + "\n" + train_df["instructions"]).tolist(),
        return_tensors="tf",
        padding=True,
        truncation=True,
        max_length=512
    )

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Padding positions must not contribute to the loss. Because pad == EOS
    # here, the attention mask (not the token id) is used to find them, and
    # their labels are set to -100, the value the Hugging Face loss ignores.
    labels = tf.where(
        tf.equal(attention_mask, 1),
        input_ids,
        tf.cast(-100, input_ids.dtype),
    )

    # No loss is passed: the model falls back to its built-in LM loss.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5))
    model.fit(
        # Pass the attention mask too, so padded tokens are not attended to.
        {"input_ids": input_ids, "attention_mask": attention_mask},
        labels,
        epochs=3,
        batch_size=4,
    )

    return model, tokenizer

In [None]:
# ========== BART (PyTorch) ==========

def train_bart():
    """Fine-tune ``facebook/bart-base`` to map a recipe prompt to instructions.

    Reads the module-level ``train_df`` frame. The ``prompt`` column is the
    encoder input and the ``instructions`` column is the target; both are
    padded/truncated to 256 tokens. No evaluation is run during training.

    Returns:
        ``(model, tokenizer)`` after training.
    """
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

    inputs = tokenizer(
        train_df["prompt"].tolist(),
        max_length=256,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    targets = tokenizer(
        train_df["instructions"].tolist(),
        max_length=256,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    # Targets are padded to max_length. Replace the padded positions with
    # -100 so the loss ignores them instead of training the model to emit
    # pad tokens.
    labels = targets.input_ids.masked_fill(targets.attention_mask == 0, -100)

    class RecipeDataset(torch.utils.data.Dataset):
        """Pairs pre-tokenized encoder inputs with their masked label ids."""

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __len__(self):
            return len(self.encodings.input_ids)

        def __getitem__(self, idx):
            return {
                "input_ids": self.encodings.input_ids[idx],
                "attention_mask": self.encodings.attention_mask[idx],
                "labels": self.labels[idx]
            }

    dataset = RecipeDataset(inputs, labels)

    # Evaluation is left at the library default ("no"). The explicit
    # `evaluation_strategy="no"` keyword is omitted because newer
    # transformers releases renamed that argument.
    training_args = TrainingArguments(
        output_dir="./bart-checkpoints",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        logging_steps=10,
        save_steps=1000
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset
    )

    trainer.train()
    return model, tokenizer

In [None]:
# Choose which model to run
if __name__ == "__main__":
    model_type = "bart"  # Change to "gpt2_tf" or "gpt2_pt"

    # Map each supported model_type to the function that trains it.
    trainers = {
        "gpt2_pt": train_gpt2_pytorch,
        "gpt2_tf": train_gpt2_tensorflow,
        "bart": train_bart,
    }

    # Fail loudly on a typo instead of silently training nothing and leaving
    # `model` / `tokenizer` undefined for later cells.
    if model_type not in trainers:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {sorted(trainers)}"
        )

    model, tokenizer = trainers[model_type]()