In [None]:
# Process-wide environment configuration. It lives in the first cell so the values are
# in place before later cells import torch / transformers.
import os
# Option string for PyTorch's CUDA caching allocator (keys are described in the PyTorch
# CUDA memory-management notes).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "garbage_collection_threshold:0.6,max_split_size_mb:128,expandable_segments:True"
# NOTE(review): presumably turns off Hugging Face tokenizers' internal thread pool (and the
# fork-related warning it prints) — confirm against the tokenizers docs.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Install/upgrade the training stack into the kernel's environment.
# NOTE(review): nothing is version-pinned and -U always pulls the newest release, so a
# rerun may resolve different packages than the run that produced the saved outputs.
%pip install --no-cache-dir -U transformers peft accelerate datasets evaluate bitsandbytes rouge_score

Collecting datasets
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.3.0-py3-none-any.whl (506 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m287.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import ast
import warnings

# Suppress every warning for the remainder of the kernel session.
warnings.filterwarnings('ignore')

# Read the raw recipe table from the Kaggle input mount.
file_path = "/kaggle/input/3a2mext/3A2M_EXTENDED.csv"
recipes = pd.read_csv(file_path)
print(f"Loaded {len(recipes)} recipes.")

# Discard rows that have no title (done in place on the freshly loaded frame).
recipes.dropna(subset=['title'], inplace=True)

# Titles: trim surrounding whitespace, then drop every non-ASCII character through an
# encode/decode round trip.
recipes['title'] = recipes['title'].str.strip().str.encode('ascii', 'ignore').str.decode('ascii')

# Genres: lower-case, then trim surrounding whitespace.
recipes['genre'] = recipes['genre'].str.lower().str.strip()

def parse_string_list(text):
    """Turn a cell holding a Python-list literal (as text) into a real list.

    Args:
        text: The raw cell value. Normally a string such as ``"['a', 'b']"``;
            may also be an already-parsed list/tuple or a missing value (NaN/None).

    Returns:
        list: The parsed list. An existing list is returned unchanged, so
        re-running the conversion on an already-converted column is a no-op
        instead of blanking it. A tuple (given or parsed) is converted to a
        list. Anything that is not a string, cannot be parsed, or parses to a
        non-sequence literal (number, dict, plain string, ...) yields ``[]``.
    """
    if isinstance(text, list):
        return text
    if isinstance(text, tuple):
        return list(text)
    if not isinstance(text, str):
        # NaN / None / numbers: nothing to parse.
        return []
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # The full set of errors literal_eval documents for malformed input.
        return []
    # Only sequences are acceptable; callers iterate over the result.
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return []

# Columns whose cells are passed through parse_string_list below.
list_columns = ['NER', 'Extended_NER', 'directions']

print("Converting string-based lists")
for col in list_columns:
    print(f"Processing '{col}'...")
    # Element-wise conversion; the column is overwritten in the same frame.
    recipes[col] = recipes[col].apply(parse_string_list)

print("Conversion complete!")

# Summary of the cleaned frame: column dtypes / non-null counts, then the first rows.
print("\n Preprocessing Done. Here's the new data info: ")
recipes.info()

print("\n And here's the cleaned data: ")
print(recipes.head())

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/3a2mext/3A2M_EXTENDED.csv'

In [None]:
import random, math, ast
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

# One seed drives both Python's `random` module and the NumPy Generator used for sampling below.
random_seed = 42
random.seed(random_seed)
rng = np.random.default_rng(random_seed)

# Work on a copy so the `recipes` frame from the loading cell is left untouched.
recipe_book = recipes.copy()

def _ensure_list(x):
    if isinstance(x, list):
        return x
    try:
        return ast.literal_eval(x) if isinstance(x, str) and x.strip() else []
    except Exception:
        return []

# Make sure both columns exist and hold lists: convert existing cells with _ensure_list,
# or create the column filled with empty lists when it is absent.
for col in ("NER", "directions"):
    if col in recipe_book.columns:
        recipe_book[col] = recipe_book[col].apply(_ensure_list)
    else:
        recipe_book[col] = [[] for _ in range(len(recipe_book))]

# Keep only rows where both NER and directions are truthy (non-empty), then renumber the index.
mask_has_content = recipe_book['NER'].map(bool) & recipe_book['directions'].map(bool)
recipe_book = recipe_book[mask_has_content].reset_index(drop=True)

def build_recipe_string_for_length(row):
    """Render one recipe row as the flat training text.

    ``row`` must offer ``.get`` with keys ``'title'``, ``'NER'`` and
    ``'directions'``; falsy values fall back to ``""`` / ``[]``. Items are
    stringified and stripped, blank ones are dropped, ingredients are joined
    with ``"; "`` and direction steps with a single space. The text ends with
    the literal ``<|endoftext|>`` marker.
    """
    def _non_blank(items):
        # Stringify + strip every item, keeping only those with content left.
        stripped = (str(item).strip() for item in items)
        return [piece for piece in stripped if piece]

    title = (row.get('title') or "").strip()
    ing_text = "; ".join(_non_blank(row.get('NER') or []))
    dir_text = " ".join(_non_blank(row.get('directions') or []))
    return f"TITLE: {title}\nINGREDIENTS: {ing_text}\nDIRECTIONS: {dir_text}\n<|endoftext|>"

# Format every remaining recipe and measure its length in characters.
texts_for_len = recipe_book.apply(build_recipe_string_for_length, axis=1).tolist()
char_lengths = np.array(list(map(len, texts_for_len)), dtype=int)

# Five-number summary plus mean of the character lengths; all zeros when there are no recipes.
if len(char_lengths):
    length_stats_full = {
        "count": int(len(char_lengths)),
        "min": int(char_lengths.min()),
        "25%": int(np.percentile(char_lengths, 25)),
        "median": int(np.median(char_lengths)),
        "75%": int(np.percentile(char_lengths, 75)),
        "max": int(char_lengths.max()),
        "mean": float(char_lengths.mean()),
    }
else:
    length_stats_full = {
        "count": 0,
        "min": 0,
        "25%": 0,
        "median": 0,
        "75%": 0,
        "max": 0,
        "mean": 0.0,
    }

# Draw up to `desired_sample_size` recipes so that the sample follows the corpus'
# character-length distribution: recipes are grouped into quantile bins by length and
# each bin contributes in proportion to its population.
desired_sample_size = 30_000
N = len(texts_for_len)
if desired_sample_size >= N:
    # Corpus is not larger than the target: take everything (in original order, unshuffled).
    sample_indices = list(range(N))
else:
    n_bins = 20
    # Bin edges at evenly spaced percentiles of the character lengths.
    quantiles = np.linspace(0,100,n_bins+1)
    bin_edges = np.percentile(char_lengths, quantiles)
    # Only interior edges are passed, so bin ids fall in 0..n_bins-1.
    bin_ids = np.digitize(char_lengths, bins=bin_edges[1:-1], right=True)
    indices_by_bin = [np.where(bin_ids==i)[0].tolist() for i in range(n_bins)]
    counts = np.array([len(idx) for idx in indices_by_bin], dtype=int)
    # Proportional allocation, floored and capped at what each bin actually holds.
    proportions = counts / counts.sum()
    raw_alloc = proportions * desired_sample_size
    alloc = np.floor(raw_alloc).astype(int)
    alloc = np.minimum(alloc, counts)
    current = alloc.sum()
    remainder = desired_sample_size - current
    if remainder > 0:
        # Hand out the slots lost to flooring, largest fractional part first,
        # skipping bins that are already exhausted.
        remainders = raw_alloc - np.floor(raw_alloc)
        order = np.argsort(remainders)[::-1]
        for i in order:
            if remainder <= 0:
                break
            if alloc[i] < counts[i]:
                alloc[i] += 1
                remainder -= 1
    elif remainder < 0:
        # NOTE(review): appears unreachable — alloc is floored and then capped, so its sum
        # should not exceed desired_sample_size. Kept as a defensive trim.
        deficit = -remainder
        order = np.argsort(raw_alloc - alloc)
        for i in order:
            if deficit <= 0:
                break
            remove = min(deficit, alloc[i])
            alloc[i] -= remove
            deficit -= remove
    # Defensive re-cap; the top-up loop above never raises alloc past counts.
    for i in range(n_bins):
        if alloc[i] > counts[i]:
            alloc[i] = counts[i]
    # Draw without replacement inside each bin.
    sample_indices = []
    for i in range(n_bins):
        if alloc[i] <= 0:
            continue
        chosen = rng.choice(indices_by_bin[i], size=alloc[i], replace=False).tolist()
        sample_indices.extend(chosen)
    sampled_set = set(sample_indices)
    if len(sample_indices) < desired_sample_size:
        # Still short (the single top-up pass adds at most one per bin): fill from the
        # not-yet-chosen rows regardless of bin.
        remaining_needed = desired_sample_size - len(sample_indices)
        all_remaining = [i for i in range(N) if i not in sampled_set]
        if len(all_remaining) <= remaining_needed:
            sample_indices.extend(all_remaining)
        else:
            extra = rng.choice(all_remaining, size=remaining_needed, replace=False).tolist()
            sample_indices.extend(extra)
    if len(sample_indices) > desired_sample_size:
        # Safety net against overshoot; trims uniformly at random.
        sample_indices = rng.choice(sample_indices, size=desired_sample_size, replace=False).tolist()
    # Shuffle in place so the bin-by-bin draw order does not leak into the dataset order.
    rng.shuffle(sample_indices)

# Materialise the sampled texts as a single-column ("text") Hugging Face Dataset.
sampled_texts = [texts_for_len[i] for i in sample_indices]
hf_df = pd.DataFrame({"text": sampled_texts})
hf_ds = Dataset.from_pandas(hf_df)

# Load the GPT-2 tokenizer, trying local files first (AutoTokenizer, then GPT2TokenizerFast)
# and only then a call without local_files_only. Any exception moves on to the next attempt.
try:
    word_packer = AutoTokenizer.from_pretrained("gpt2", use_fast=True, local_files_only=True)
except Exception:
    try:
        from transformers import GPT2TokenizerFast
        word_packer = GPT2TokenizerFast.from_pretrained("gpt2", local_files_only=True)
    except Exception:
        word_packer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)

# When the tokenizer has no pad token, reuse EOS as the padding token.
if word_packer.pad_token is None:
    word_packer.pad_token = word_packer.eos_token

def tokenize_batch(batch):
    """Tokenize ``batch["text"]`` with ``word_packer``.

    Sequences are truncated to at most 192 tokens and are not padded; the
    tokenizer's output is returned as-is.
    """
    tokenizer_options = {"truncation": True, "max_length": 192, "padding": False}
    return word_packer(batch["text"], **tokenizer_options)

# Tokenize the sampled dataset in batches; the raw "text" column is dropped afterwards.
tokenized_recipes = hf_ds.map(tokenize_batch, batched=True, remove_columns=["text"])
# Token counts per example. These are measured AFTER tokenize_batch's truncation
# (max_length=192), so every statistic below is capped at 192.
lengths_tokens = np.array([len(ids) for ids in tokenized_recipes["input_ids"]], dtype=int)
# Five-number summary plus mean of the (truncated) token lengths; zeros when empty.
token_length_stats_sampled = {
    "count": int(len(lengths_tokens)),
    "min": int(lengths_tokens.min()) if len(lengths_tokens) else 0,
    "25%": int(np.percentile(lengths_tokens,25)) if len(lengths_tokens) else 0,
    "median": int(np.median(lengths_tokens)) if len(lengths_tokens) else 0,
    "75%": int(np.percentile(lengths_tokens,75)) if len(lengths_tokens) else 0,
    "max": int(lengths_tokens.max()) if len(lengths_tokens) else 0,
    "mean": float(lengths_tokens.mean()) if len(lengths_tokens) else 0.0
}

import matplotlib.pyplot as plt
# Histogram 1: character lengths over the full filtered corpus.
plt.figure(figsize=(8,5))
plt.hist(char_lengths, bins=40, edgecolor='black')
plt.title("Character Length Distribution (full)")
plt.xlabel("Characters per Recipe")
plt.ylabel("Frequency")
plt.grid(alpha=0.3)
plt.show()

# Histogram 2: token lengths over the sampled, tokenized subset (values already truncated
# by the tokenizer's max_length).
plt.figure(figsize=(8,5))
plt.hist(lengths_tokens, bins=40, edgecolor='black')
plt.title("Token Length Distribution (sampled)")
plt.xlabel("Tokens per Recipe")
plt.ylabel("Frequency")
plt.grid(alpha=0.3)
plt.show()

# Persist the tokenized dataset; the training cell reloads it from this path.
save_path = "/kaggle/working/tokenized_recipes"
tokenized_recipes.save_to_disk(save_path)

# Collator for causal language modelling (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=word_packer, mlm=False)

# Run summary.
print(f"total_recipes: {len(recipe_book)}")
print(f"sampled_dataset_size: {len(sampled_texts)}")
print(f"tokenized_sampled_size: {len(tokenized_recipes)}")
print(f"char_length_stats_full: {length_stats_full}")
print(f"token_length_stats_sampled: {token_length_stats_sampled}")
# NOTE(review): this percentile comes from token lengths that were already truncated to
# 192, so the "recommended" value can never exceed 192.
print(f"RECOMMENDED MAX_LENGTH (75th percentile): {token_length_stats_sampled['75%']}")
print(f"tokenizer_vocab_size: {len(word_packer)}")
print(f"example_formatted_recipe_first_1000_chars:\n{sampled_texts[0][:1000]}\n")
print(f"example_first_input_ids_sampled (first 40 tokens):\n{tokenized_recipes['input_ids'][0][:40]}")
print(f"saved_tokenized_dataset_to: {save_path}")

In [None]:
import os, random, math, torch, gc
from datasets import load_from_disk
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from transformers import GPT2TokenizerFast

# Seed Python's `random` and torch with the same value.
random_seed = 42
random.seed(random_seed)
torch.manual_seed(random_seed)

# Set to an integer to run on a small subsample (smoke test); None/0 uses the full splits.
TEST_RUN_SAMPLE_SIZE = None

# Locate the tokenized dataset written by the tokenization cell: first existing candidate wins.
tokenized_candidates = ["/kaggle/working/tokenized_recipes", "tokenized_recipes"]
tokenized_path = next((p for p in tokenized_candidates if os.path.exists(p)), None)
if tokenized_path is None:
    raise FileNotFoundError(f"Tokenized dataset not found in {tokenized_candidates}. Run tokenization cell first.")

dataset = load_from_disk(tokenized_path)

# First split: 20% held out as the test set.
train_val_test = dataset.train_test_split(test_size=0.2, seed=random_seed)
train_val = train_val_test["train"]
test_dataset = train_val_test["test"]

# Second split: 25% of the remaining 80% becomes validation, i.e. 60/20/20 overall.
train_val_split = train_val.train_test_split(test_size=0.25, seed=random_seed)
train_dataset = train_val_split["train"]
eval_dataset = train_val_split["test"]

# Drop the intermediate dataset objects.
del dataset, train_val_test, train_val, train_val_split
gc.collect()

if TEST_RUN_SAMPLE_SIZE:
    # Smoke-test mode: train on at most TEST_RUN_SAMPLE_SIZE examples, and validate/test
    # on at most a quarter of that.
    print(f"CONDUCTING TEST RUN WITH SAMPLE SIZE: {TEST_RUN_SAMPLE_SIZE} ")
    train_dataset = train_dataset.shuffle(seed=random_seed).select(range(min(TEST_RUN_SAMPLE_SIZE, len(train_dataset))))
    eval_dataset = eval_dataset.shuffle(seed=random_seed).select(range(min(TEST_RUN_SAMPLE_SIZE // 4, len(eval_dataset))))
    test_dataset = test_dataset.shuffle(seed=random_seed).select(range(min(TEST_RUN_SAMPLE_SIZE // 4, len(test_dataset))))

print(f"Dataset splits: Train={len(train_dataset)}, Val={len(eval_dataset)}, Test={len(test_dataset)}")

# Load the GPT-2 tokenizer: local files first (AutoTokenizer, then GPT2TokenizerFast),
# then a call without local_files_only. Same fallback chain as the tokenization cell.
try:
    word_packer = AutoTokenizer.from_pretrained("gpt2", use_fast=True, local_files_only=True)
except Exception:
    try:
        word_packer = GPT2TokenizerFast.from_pretrained("gpt2", local_files_only=True)
    except Exception:
        word_packer = AutoTokenizer.from_pretrained("gpt2", use_fast=True)

# When the tokenizer has no pad token, reuse EOS as the padding token.
if word_packer.pad_token is None:
    word_packer.pad_token = word_packer.eos_token

# Collator for causal language modelling (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=word_packer, mlm=False)

# Hyperparameters.
NUM_EPOCHS = 2            # number of train()/evaluate() rounds run by the loop further down
LORA_R = 8                # LoRA rank; lora_alpha below is derived as 2 * rank
PER_DEVICE_BATCH = 1
GRAD_ACCUM = 16           # with batch size 1 this gives 16 examples per optimizer step
EVAL_STEPS = 500          # despite the name, only used as logging_steps in TrainingArguments
DATALOADER_WORKERS = 0

# Compute dtype for the 4-bit layers: fp16 when CUDA is available, bf16 otherwise.
compute_dtype = torch.float16 if torch.cuda.is_available() else torch.bfloat16

# 4-bit NF4 quantization with double quantization.
memory_saver_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype
)

# Load GPT-2 quantized: pinned to the current CUDA device when there is one,
# otherwise left to device_map="auto".
if torch.cuda.is_available():
    device_index = torch.cuda.current_device()
    kitchen_gpt = AutoModelForCausalLM.from_pretrained("gpt2", device_map={"": device_index}, quantization_config=memory_saver_config)
else:
    kitchen_gpt = AutoModelForCausalLM.from_pretrained("gpt2", device_map="auto", quantization_config=memory_saver_config)

# KV cache is switched off on the config (gradient checkpointing is enabled in TrainingArguments).
kitchen_gpt.config.use_cache = False
# Match the embedding matrix to the tokenizer's vocabulary size.
kitchen_gpt.resize_token_embeddings(len(word_packer))
kitchen_gpt = prepare_model_for_kbit_training(kitchen_gpt)

# Attach LoRA adapters to the modules named c_attn, c_proj and c_fc.
adapter_config = LoraConfig(r=LORA_R, lora_alpha=max(1, LORA_R * 2), target_modules=["c_attn", "c_proj", "c_fc"], lora_dropout=0.05, bias="none", task_type="CAUSAL_LM")
kitchen_gpt = get_peft_model(kitchen_gpt, adapter_config)

def count_params(model):
    """Count parameter elements of ``model``.

    Returns:
        tuple: ``(total, trainable)`` where ``total`` sums ``numel()`` over
        ``model.parameters()`` and ``trainable`` sums it only over parameters
        whose ``requires_grad`` is true.
    """
    total = 0
    trainable = 0
    for param in model.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size
    return total, trainable

# Report how much of the model the LoRA adapters leave trainable.
_total, _trainable = count_params(kitchen_gpt)
print(f"model params total: {_total:,} — trainable: {_trainable:,} ({_trainable/_total:.4%})")

if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = True

# Arguments for ONE epoch per train() call; the loop further down calls train()
# NUM_EPOCHS times. No checkpoints are written by the Trainer (save_strategy="no").
training_plan = TrainingArguments(
    output_dir="/kaggle/working/kitchen_gpt_ckpt",
    per_device_train_batch_size=PER_DEVICE_BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=1,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    fp16=True if torch.cuda.is_available() else False,
    gradient_checkpointing=True,
    weight_decay=0.01,
    max_grad_norm=1.0,
    dataloader_num_workers=DATALOADER_WORKERS,
    report_to="none",
    logging_steps=EVAL_STEPS,
    save_strategy="no",
    eval_accumulation_steps=1,
    # NOTE(review): presumably this makes evaluate()/predict() return only the loss, in
    # which case the compute_metrics passed to the Trainer is never invoked — confirm.
    prediction_loss_only=True,
)

import numpy as np
def compute_metrics(eval_pred):
    """Next-token accuracy for a causal language model.

    A causal LM's logits at position ``t`` score the token at position
    ``t + 1``, so predictions are compared against the labels shifted left by
    one (the last prediction and the first label have no counterpart and are
    dropped). Positions whose target label is ``-100`` are ignored.

    Args:
        eval_pred: Either a ``(logits, labels)`` tuple/list or an object with
            ``predictions`` and ``label_ids`` attributes. ``logits`` may itself
            be a tuple, in which case its first element is used. Logits are
            expected as ``(..., seq_len, vocab)`` and labels as ``(..., seq_len)``.

    Returns:
        dict: ``{"accuracy": float}``; ``0.0`` when logits/labels are missing
        or no position is scored.
    """
    if isinstance(eval_pred, (tuple, list)):
        logits, labels = eval_pred
    else:
        logits, labels = eval_pred.predictions, eval_pred.label_ids
    if logits is None or labels is None:
        return {"accuracy": 0.0}
    if isinstance(logits, tuple):
        logits = logits[0]

    labels = np.asarray(labels)
    preds = np.argmax(logits, axis=-1)

    # Align prediction at t with the token it is predicting (label at t + 1).
    shifted_preds = preds[..., :-1]
    targets = labels[..., 1:]

    mask = targets != -100
    scored = int(mask.sum())
    if scored == 0:
        return {"accuracy": 0.0}
    hits = int(((shifted_preds == targets) & mask).sum())
    return {"accuracy": float(hits / scored)}

main_trainer = Trainer(
    model=kitchen_gpt,
    args=training_plan,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Manual epoch loop: every iteration runs one train() call (num_train_epochs=1), saves
# the adapter, and reports validation loss / perplexity.
# NOTE(review): each train() call presumably builds a fresh optimizer and LR schedule, so
# warmup and decay restart every epoch — confirm this is intended.
for epoch in range(NUM_EPOCHS):
    train_result = main_trainer.train()
    del train_result
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Persist trainer state and this epoch's adapter weights to a per-epoch directory.
    main_trainer.save_state()
    kitchen_gpt.save_pretrained(f"/kaggle/working/kitchen_gpt_lora_epoch{epoch+1}")

    eval_res = main_trainer.evaluate()
    eval_loss = eval_res.get("eval_loss") or eval_res.get("loss")
    if eval_loss is not None:
        # Perplexity = exp(loss); falls back to inf if exp() raises (e.g. overflow).
        try:
            ppl = math.exp(eval_loss)
        except Exception:
            ppl = float("inf")
        print(f"Validation loss: {eval_loss:.4f} -> Perplexity: {ppl:.2f}")
    else:
        print("No validation loss returned")

    del eval_res
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Final held-out evaluation: loss and perplexity on the test split.
test_pred = main_trainer.predict(test_dataset)
test_loss = test_pred.metrics.get("test_loss") or test_pred.metrics.get("eval_loss") or test_pred.metrics.get("loss")
if test_loss is not None:
    try:
        test_ppl = math.exp(test_loss)
    except Exception:
        test_ppl = float("inf")
    print(f"Test loss: {test_loss:.4f} -> Perplexity: {test_ppl:.2f}")
else:
    print("No test loss returned")

del test_pred
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Generation-based metrics (ROUGE / BLEU) on a fixed subsample of up to 200 test recipes.
# Each example is decoded back to text, everything up to "DIRECTIONS:" is used as the
# prompt, and the model's continuation is scored against the original directions.
# Best-effort by design: any failure is reported and skipped so the cell keeps running.
try:
    import evaluate
    rouge = evaluate.load("rouge")
    bleu = evaluate.load("bleu")
    small_test_set = test_dataset.shuffle(seed=random_seed).select(range(min(200, len(test_dataset))))
    predictions = []
    references = []
    device = next(kitchen_gpt.parameters()).device
    # Pre-bind the loop temporaries so the cleanup `del` below is valid even if the
    # subsample is empty.
    inputs = out = gen = None

    for ex in small_test_set:
        if "text" in ex:
            raw = ex["text"]
        else:
            raw = word_packer.decode(ex["input_ids"], skip_special_tokens=True)

        if "DIRECTIONS:" in raw:
            prompt = raw.split("DIRECTIONS:")[0] + "DIRECTIONS:"
            ref = raw.split("DIRECTIONS:")[-1].replace("<|endoftext|>", "").strip()
        else:
            # No directions marker (e.g. lost to truncation): prompt with a prefix, empty reference.
            prompt = raw[:400]
            ref = ""

        inputs = word_packer(prompt, return_tensors="pt").to(device)
        out = kitchen_gpt.generate(**inputs, do_sample=True, temperature=0.7, top_k=50, top_p=0.95, max_new_tokens=200, pad_token_id=word_packer.eos_token_id)
        # Keep only the newly generated tokens. Slicing by prompt length is exact, whereas
        # removing the prompt with str.replace breaks whenever decode(encode(prompt)) does
        # not reproduce the prompt character-for-character.
        prompt_len = inputs["input_ids"].shape[1]
        gen = word_packer.decode(out[0][prompt_len:], skip_special_tokens=True)
        pred = gen.strip()
        predictions.append(pred)
        references.append(ref)

    rouge_res = rouge.compute(predictions=predictions, references=references)
    # The `evaluate` BLEU metric tokenizes internally: it takes raw prediction strings and,
    # per prediction, a list of reference strings — not pre-split token lists.
    bleu_res = bleu.compute(predictions=predictions, references=[[r] for r in references])
    print("ROUGE (test set):", {k: v for k, v in rouge_res.items()})
    print("BLEU (test set):", bleu_res)

    del rouge, bleu, small_test_set, predictions, references, inputs, out, gen, rouge_res, bleu_res

except Exception as e:
    print(f"Skipping ROUGE/BLEU/generation metrics due to error: {e}")

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Qualitative check: sample a completion for two hand-written prompts.
kitchen_gpt.eval()
# Prompts follow the training text layout ("TITLE: ...\nINGREDIENTS: a; b; c\nDIRECTIONS: ...").
# Ingredients are written "; "-separated exactly as in the training data; a Python list
# repr (['a', 'b']) never appears there and would put the model off-distribution.
prompts = [
    "TITLE: Cheesy Garlic Bread\nINGREDIENTS:",
    "TITLE: Quick Dinner\nINGREDIENTS: ground beef; onion; canned tomatoes; kidney beans\nDIRECTIONS:"
]
device = next(kitchen_gpt.parameters()).device
for p in prompts:
    inputs = word_packer(p, return_tensors="pt").to(device)
    out = kitchen_gpt.generate(**inputs, do_sample=True, temperature=0.7, top_k=50, top_p=0.95, max_new_tokens=350, pad_token_id=word_packer.eos_token_id)
    text = word_packer.decode(out[0], skip_special_tokens=True)
    # Print prompt + continuation, capped at 1500 characters.
    print("\n GENERATED (truncated) \n", text[:1500])

# Release the model and data objects, then free cached GPU memory.
del kitchen_gpt, main_trainer, word_packer, data_collator, train_dataset, eval_dataset, test_dataset
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

Dataset splits: Train=18000, Val=6000, Test=6000


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

model params total: 83,152,128 — trainable: 1,179,648 (1.4187%)


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,2.6117
1000,2.4792


Validation loss: 2.3295 -> Perplexity: 10.27


Step,Training Loss
