# Creating a Recipe Generator

In [2]:
%pip install torch torchvision torchaudio

Collecting torch
  Downloading torch-2.10.0-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (31 kB)
Collecting torchvision
  Downloading torchvision-0.25.0-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (5.4 kB)
Collecting torchaudio
  Downloading torchaudio-2.10.0-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (6.9 kB)
Collecting filelock (from torch)
  Downloading filelock-3.20.3-py3-none-any.whl.metadata (2.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx>=2.5.1 (from torch)
  Downloading networkx-3.6.1-py3-none-any.whl.metadata (6.8 kB)
Collecting cuda-bindings==12.9.4 (from torch)
  Downloading cuda_bindings-12.9.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (2.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90

In [8]:
%pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.12.0-cp313-cp313-manylinux_2_28_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m6.4 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.12.0
Note: you may need to restart the kernel to use updated packages.


In [9]:
import numpy as np 
import pandas as pd 

import torch

import os
import tiktoken


In [3]:
# --- Architecture Hyperparameters ---
block_size = 512       # Context window (long enough for a full recipe)
n_embd = 384           # Embedding dimension
n_head = 6             # Number of attention heads
n_layer = 6            # Number of transformer blocks
dropout = 0.2          # Higher dropout to prevent memorization of specific recipes

# --- Training Hyperparameters ---
batch_size = 32        # How many recipes to process at once
learning_rate = 3e-4   # The "sweet spot" for small transformers
max_iters = 5000       # Total training steps
eval_interval = 500    # How often to check validation loss
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- Special Control Tokens ---
# These are the "switches" for your model
VEGAN_TOKEN = "[VEGAN]"
NORMAL_TOKEN = "[NORMAL]"
TITLE_TOKEN = "[TITLE]"
INGRED_TOKEN = "[INGRED]"
STEPS_TOKEN = "[STEPS]"
EOS_TOKEN = "<|endoftext|>"

## 1). Data Pre-Processing 

In [4]:
# Define your tokens here
VEGAN_TOKEN, NORMAL_TOKEN = "[VEGAN]", "[NORMAL]"
TITLE_TOKEN, INGRED_TOKEN, STEPS_TOKEN = "[TITLE]", "[INGRED]", "[STEPS]"
EOS_TOKEN = "<|endoftext|>"

def create_training_data(csv_path, output_filename="training_data.txt"):
    # 1. Use 'chunksize' to load only 10,000 recipes at a time
    # This prevents your RAM from filling up
    reader = pd.read_csv(csv_path, chunksize=10000)
    animal_prods = {'milk', 'eggs', 'butter', 'meat', 'beef', 'chicken', 'fish', 'cheese', 'lard'}
    
    count = 0
    # Open the file once and append to it in the loop
    with open(output_filename, "w", encoding="utf-8") as f:
        for chunk in reader:
            for _, row in chunk.iterrows():
                try:
                    # Parse lists safely
                    ner_list = eval(row['NER'])
                    directions = eval(row['directions'])
                    
                    # Logic
                    is_vegan = all(item not in animal_prods for item in ner_list)
                    diet_tag = VEGAN_TOKEN if is_vegan else NORMAL_TOKEN
                    
                    # Build string
                    recipe_str = (
                        f"{diet_tag} {TITLE_TOKEN} {row['title']} "
                        f"{INGRED_TOKEN} {', '.join(ner_list)} "
                        f"{STEPS_TOKEN} {' '.join(directions)} {EOS_TOKEN}\n"
                    )
                    
                    # Write directly to disk
                    f.write(recipe_str)
                    count += 1
                except:
                    continue # Skip rows with broken data
            
            print(f"Processed {count} recipes...")

    print(f"\n--- DONE ---")
    print(f"Successfully saved {count} recipes to {output_filename}")
    # NO RETURN STATEMENT HERE. The data is safe on your disk.

# Execute
create_training_data('/home/onyxia/work/Recipe-Generator/data/full_dataset.csv')

Processed 10000 recipes...
Processed 20000 recipes...
Processed 30000 recipes...
Processed 40000 recipes...
Processed 50000 recipes...
Processed 60000 recipes...
Processed 70000 recipes...
Processed 80000 recipes...
Processed 90000 recipes...
Processed 100000 recipes...
Processed 110000 recipes...
Processed 120000 recipes...
Processed 130000 recipes...
Processed 140000 recipes...
Processed 150000 recipes...
Processed 160000 recipes...
Processed 170000 recipes...
Processed 180000 recipes...
Processed 190000 recipes...
Processed 200000 recipes...
Processed 210000 recipes...
Processed 220000 recipes...
Processed 230000 recipes...
Processed 240000 recipes...
Processed 250000 recipes...
Processed 260000 recipes...
Processed 270000 recipes...
Processed 280000 recipes...
Processed 290000 recipes...
Processed 300000 recipes...
Processed 310000 recipes...
Processed 320000 recipes...
Processed 330000 recipes...
Processed 340000 recipes...
Processed 350000 recipes...
Processed 360000 recipes...
P

In [6]:
def count_recipes_efficiently(filepath):
    print(f"Counting recipes in {filepath}...")
    count = 0
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            # We only count lines that aren't empty
            if line.strip():
                count += 1
    
    print(f"Total recipes found: {count:,}")
    return count

# Usage:
count_recipes_efficiently('training_data.txt')

Counting recipes in training_data.txt...
Total recipes found: 2,255,898


2255898

## 2). Tokenizing text

In [11]:
def tokenize_and_save(input_file, output_bin, chunk_size_mb=100):
    """
    Tokenizes text in chunks and streams directly to a binary file.
    This ensures we don't crash RAM and the process is 100% reproducible.
    """
    # 1. Initialize the tokenizer (GPT-2 encoding matches tiktoken's default)
    enc = tiktoken.get_encoding("gpt2")
    
    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found. Please run your data prep script first.")
        return

    print(f"Tokenizing {input_file} -> {output_bin}...")
    
    # 2. Open binary file for writing
    with open(output_bin, 'wb') as bin_f:
        with open(input_file, 'r', encoding='utf-8') as f:
            chunk_count = 0
            while True:
                # Read chunks (100MB is a good balance for speed/RAM)
                text_chunk = f.read(1024 * 1024 * chunk_size_mb)
                if not text_chunk:
                    break
                
                # Encode chunk to token IDs
                ids = enc.encode_ordinary(text_chunk)
                
                # Convert to uint16 (efficiently handles vocab up to 65,535)
                # GPT-2 vocab is 50,257, so uint16 saves 50% space vs int32
                ids_array = np.array(ids, dtype=np.uint16)
                
                # Save chunk to disk immediately
                bin_f.write(ids_array.tobytes())
                
                chunk_count += 1
                print(f"Processed chunk {chunk_count} ({chunk_size_mb * chunk_count}MB total)...")

    print("\n--- Tokenization Complete ---")
    print(f"Binary file saved as: {output_bin}")
    print(f"File size: {os.path.getsize(output_bin) / (1024*1024):.2f} MB")

# Run the process
tokenize_and_save('training_data.txt', 'train.bin')

Tokenizing training_data.txt -> train.bin...
Processed chunk 1 (100MB total)...
Processed chunk 2 (200MB total)...
Processed chunk 3 (300MB total)...
Processed chunk 4 (400MB total)...
Processed chunk 5 (500MB total)...
Processed chunk 6 (600MB total)...
Processed chunk 7 (700MB total)...
Processed chunk 8 (800MB total)...
Processed chunk 9 (900MB total)...
Processed chunk 10 (1000MB total)...
Processed chunk 11 (1100MB total)...
Processed chunk 12 (1200MB total)...
Processed chunk 13 (1300MB total)...
Processed chunk 14 (1400MB total)...

--- Tokenization Complete ---
Binary file saved as: train.bin
File size: 707.31 MB


## 3). Create DataLoaders

In [12]:
class DataLoader:
    def __init__(self, bin_file, batch_size, block_size):
        self.batch_size = batch_size
        self.block_size = block_size
        
        # Memory-map the binary file (The Pro Move)
        # This keeps RAM usage at nearly 0MB
        self.data = np.memmap(bin_file, dtype=np.uint16, mode='r')
        print(f"DataLoader initialized. Dataset has {len(self.data):,} tokens.")

    def get_batch(self):
        # Pick random starting points for the whole batch
        # We leave room for block_size + 1 so we can get the target y
        ix = torch.randint(len(self.data) - self.block_size, (self.batch_size,))
        
        # Pull the sequences from the memory map
        x = torch.stack([torch.from_numpy((self.data[i:i+self.block_size]).astype(np.int64)) for i in ix])
        y = torch.stack([torch.from_numpy((self.data[i+1:i+1+self.block_size]).astype(np.int64)) for i in ix])
        
        # Move to GPU if available
        if torch.cuda.is_available():
            x, y = x.pin_memory().cuda(non_blocking=True), y.pin_memory().cuda(non_blocking=True)
            
        return x, y

train_loader = DataLoader('train.bin', batch_size, block_size)
x, y = train_loader.get_batch()
print(f"X shape: {x.shape}, Y shape: {y.shape}")

DataLoader initialized. Dataset has 370,832,540 tokens.
X shape: torch.Size([32, 512]), Y shape: torch.Size([32, 512])
