In [1]:
from google.colab import drive
# Attach Google Drive; the dataset and the save directories used below live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers



In [3]:
import os
import json
import torch
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from torch.utils.data import random_split, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AdamW, get_linear_schedule_with_warmup
import random
import numpy as np
from tqdm.auto import tqdm
import time
import datetime
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Constants

# -- Data location --
EXTRACTION_DIRECTORY = "/content/drive/My Drive/ChefAI/dataset"
CSV_FILE_NAME = "full_dataset.csv"

# -- Tokenization --
MAX_LENGTH = 512  # every recipe is truncated/padded to this many tokens

# -- Optimisation --
EPOCHS = 1
BATCH_SIZE = 8
GRADIENT_ACCUMULATION_STEPS = 8
LEARNING_RATE = 5e-4
EPSILON = 1e-8  # handed to the optimizer as `eps`

# -- Hardware: use the GPU when one is visible, otherwise the CPU --
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)

In [5]:
# Loading the data
csv_file_path = os.path.join(EXTRACTION_DIRECTORY, CSV_FILE_NAME)
recipes_data = pd.read_csv(csv_file_path)

# Showing the number of recipes and a sample recipe.
# Row 100 is the preferred sample; the position is clamped to the last row so
# that a frame with fewer than 101 rows does not raise IndexError.
num_recipes = len(recipes_data)
SAMPLE_ROW = 100
if num_recipes:
    sample_recipe = recipes_data.iloc[min(SAMPLE_ROW, num_recipes - 1)]
else:
    sample_recipe = "No recipes found"

num_recipes, sample_recipe

(2231142,
 Unnamed: 0                                                   100
 title                                                   Pancakes
 ingredients    ["1 c. flour", "1 tsp. soda", "1 tsp. salt", "...
 directions     ["Mix dry ingredients.", "Add egg, margarine a...
 link              www.cookbooks.com/Recipe-Details.aspx?id=57556
 source                                                  Gathered
 NER            ["flour", "soda", "salt", "sugar", "egg", "mar...
 Name: 100, dtype: object)

In [7]:
import re

def _parse_string_list(raw):
    """Parse the textual form of a list (e.g. '["a", "b"]') without executing code."""
    import ast  # local import: only the preprocessing helpers need it

    # SECURITY: the cell text comes from an external CSV. `ast.literal_eval`
    # accepts Python literals only, whereas `eval` would run arbitrary
    # expressions embedded in the data.
    parsed = ast.literal_eval(raw)
    if not isinstance(parsed, (list, tuple)):
        raise ValueError(f"expected a list literal, got {type(parsed).__name__}")
    return parsed


def load_preprocess_raw_data(recipes_data):
    """Convert recipe rows into tagged training strings.

    Every row must provide ``title`` (a string) plus ``ingredients`` and
    ``directions``, each holding the textual form of a list of strings.
    A row becomes::

        <|startofrecipe|>{title}<|startofingre|>{ingredients}<|startofinstruc|>{directions}<|endofrecipe|>

    where all text is lower-cased, the ingredients are joined with ", " and
    the directions are joined, re-split on ". " and emitted as a
    newline-separated numbered list.

    Args:
        recipes_data: pandas DataFrame with the columns described above.

    Returns:
        list[str]: one formatted string per row that could be processed.
        A row that raises (missing value, unparsable list, ...) is reported
        with ``print`` and skipped.
    """
    raw_list = []
    for _, recipe in recipes_data.iterrows():
        try:
            title = recipe['title'].strip().lower()
            ingredients = ", ".join(_parse_string_list(recipe['ingredients'])).strip().lower()
            # Convert the string representation of directions to a list and then join into a single string.
            directions = " ".join(_parse_string_list(recipe['directions'])).strip().lower()
            # Split the directions into sentences
            directions_sentences = re.split(r'\. +', directions)
            # Number the sentences, one per line; empty fragments are dropped
            # but still consume an index, as enumerate runs before the filter.
            numbered_directions = "\n".join(f"{i+1}. {sentence.strip()}"
                                            for i, sentence in enumerate(directions_sentences)
                                            if sentence)
            recipe_instance = '<|startofrecipe|>' + title + '<|startofingre|>' + ingredients + '<|startofinstruc|>' + numbered_directions + '<|endofrecipe|>'
            raw_list.append(recipe_instance)
        except Exception as e:
            # Best effort: one bad row must not abort the whole pass.
            print(f"An exception occurred for a row: {e}")
            continue
    return raw_list

# Build the training corpus: one tagged string per row that could be processed.
preprocessed_recipes = load_preprocess_raw_data(recipes_data)
recipe_count = len(preprocessed_recipes)
print(f"Number of preprocessed recipes: {recipe_count}")



Number of preprocessed recipes: 2231142


In [9]:
# Show five randomly drawn recipes (drawn with replacement), each followed by a rule
separator = '-' * 100
for _draw in range(5):
    picked_recipe = random.choice(preprocessed_recipes)
    print(picked_recipe)
    print(separator)

<|startofrecipe|>hot potato salad<|startofingre|>8 medium potatoes, cubed, cooked and cooled, 1/2 c. diced onion, 1 c. mayo, 1 c. shredded cheddar cheese, 1 c. shredded velveeta cheese, 1/2 lb. bacon, crisp and broken up<|startofinstruc|>1. mix together the cooled potatoes and all other ingredients
2. spread in greased pan
3. cover
4. bake at 350° for 30 minutes
5. uncover and bake for an additional 15 minutes.<|endofrecipe|>
----------------------------------------------------------------------------------------------------
<|startofrecipe|>chocolate pudding dipping pool<|startofingre|>1 , favorite flavor, 12 pieces bear-shaped cinnamon graham snacks, 1/4 cup sliced fresh strawberries, 4 fl oz (1/2 cup) fat-free milk<|startofinstruc|>1. spoon pudding into center of small bowl
2. arrange graham snacks and strawberries around sides of bowl
3. dip graham snacks and strawberries into pudding to eat
4. enjoy a small glass of milk with your treat.<|endofrecipe|>
----------------------------

In [10]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Register the recipe markup tags and a padding token with the tokenizer
special_tokens_dict = dict(
    bos_token='<|startofrecipe|>',
    eos_token='<|endofrecipe|>',
    pad_token='<|pad|>',
    additional_special_tokens=['<|startofingre|>', '<|startofinstruc|>'],
)
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

print('Number of added special tokens:', num_added_toks)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Number of added special tokens: 5


In [11]:
class RecipeDataset(Dataset):
    """Dataset that tokenizes one recipe string per lookup.

    Args:
        recipes: indexable collection of recipe strings.
        tokenizer: object exposing ``encode_plus``; it is called with
            ``max_length``, ``truncation=True``, ``padding="max_length"`` and
            ``return_tensors="pt"``.
        max_length: length every encoded sequence is truncated/padded to.
    """

    def __init__(self, recipes, tokenizer, max_length):
        self.recipes, self.tokenizer, self.max_length = recipes, tokenizer, max_length

    def __len__(self):
        """Return the number of recipes."""
        return len(self.recipes)

    def __getitem__(self, idx):
        """Encode recipe ``idx`` and return its ``input_ids`` and ``attention_mask``.

        Both tensors are squeezed, which removes the leading batch dimension
        added by ``return_tensors="pt"``.
        """
        encoded = self.tokenizer.encode_plus(
            self.recipes[idx],
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        return {field: encoded[field].squeeze() for field in ('input_ids', 'attention_mask')}

In [12]:
dataset = RecipeDataset(preprocessed_recipes, tokenizer, max_length=MAX_LENGTH)

# 90% of the recipes for training, the remainder for validation.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split so the same rows land in the validation set on every run.
# With an unseeded split, a session that resumes from a checkpoint would draw
# a new partition and validate on rows the model was already trained on.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_generator)

# Training batches are drawn in random order; validation is read sequentially.
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=BATCH_SIZE)

In [13]:
# Pretrained GPT-2, with its token embeddings resized to the tokenizer's current size
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))
model = model.to(DEVICE)

# Scheduler horizon: batches per epoch divided by the accumulation factor, times the epoch count
batches_per_epoch = len(train_dataloader)
TOTAL_TRAINING_STEPS = int(batches_per_epoch / GRADIENT_ACCUMULATION_STEPS * EPOCHS)
# The first 10% of those steps are warm-up
WARMUP_STEPS = int(0.1 * TOTAL_TRAINING_STEPS)

# Optimizer plus a linear schedule with warm-up
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=EPSILON)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=TOTAL_TRAINING_STEPS,
)


Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



In [14]:
# Google Drive directories for the resumable checkpoint and the exported best model
base_save_path = "/content/drive/My Drive/ChefAI"
checkpoint_path, best_model_path = (
    os.path.join(base_save_path, leaf)
    for leaf in ("model_checkpoints", "best_model_for_inference")
)

# Function to load the model, optimizer, and scheduler states
def load_checkpoint(model, optimizer, scheduler, tokenizer, save_path):
    """Restore training state from ``<save_path>/checkpoint.pth``.

    Args:
        model: module whose ``load_state_dict`` receives ``model_state_dict``.
        optimizer: optimizer whose ``load_state_dict`` receives ``optimizer_state_dict``.
        scheduler: scheduler whose ``load_state_dict`` receives ``scheduler_state_dict``.
        tokenizer: accepted for backward compatibility and left untouched.
        save_path: directory containing ``checkpoint.pth``.

    Returns:
        tuple: ``(model, optimizer, scheduler, epoch, best_val_loss)`` where the
        last two values are read from the checkpoint as stored.
    """
    checkpoint = torch.load(os.path.join(save_path, 'checkpoint.pth'), map_location=DEVICE)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    # The tokenizer is deliberately not reloaded here. `from_pretrained`
    # returns a new tokenizer rather than updating the one it is called on,
    # and the checkpoint file written by this notebook holds only the model,
    # optimizer and scheduler states plus epoch/loss, so a reload from
    # `save_path` could only fail or be discarded.
    return model, optimizer, scheduler, checkpoint['epoch'], checkpoint['best_val_loss']

# Function to save the model, optimizer, and scheduler states
def save_checkpoint(model, optimizer, scheduler, epoch, best_val_loss, save_path=None):
    """Write a resumable training checkpoint to ``<directory>/checkpoint.pth``.

    Args:
        model: object whose ``state_dict()`` is stored as ``model_state_dict``.
        optimizer: object whose ``state_dict()`` is stored as ``optimizer_state_dict``.
        scheduler: object whose ``state_dict()`` is stored as ``scheduler_state_dict``.
        epoch: zero-based epoch index stored under ``epoch``.
        best_val_loss: value stored under ``best_val_loss``.
        save_path: target directory; defaults to the module-level
            ``checkpoint_path`` when omitted, which is the previous behavior.

    Returns:
        None. The directory is created if needed and a confirmation is printed.
    """
    target_dir = checkpoint_path if save_path is None else save_path
    os.makedirs(target_dir, exist_ok=True)
    checkpoint = {
        'epoch': epoch,
        'best_val_loss': best_val_loss,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
    }
    torch.save(checkpoint, os.path.join(target_dir, 'checkpoint.pth'))
    print(f"Checkpoint saved at epoch {epoch+1} with validation loss {best_val_loss}")

# Function to save the best model for inference
def save_best_model_for_inference(model, tokenizer, save_path=None):
    """Export the model and tokenizer with ``save_pretrained``.

    Args:
        model: object exposing ``save_pretrained(directory)``.
        tokenizer: object exposing ``save_pretrained(directory)``.
        save_path: target directory; defaults to the module-level
            ``best_model_path`` when omitted. Accepting it as an optional
            third argument supports both the two-argument and the
            three-argument call styles.

    Returns:
        None. The directory is created if needed and a confirmation is printed.
    """
    target_dir = best_model_path if save_path is None else save_path
    os.makedirs(target_dir, exist_ok=True)
    model.save_pretrained(target_dir)
    tokenizer.save_pretrained(target_dir)
    print(f"Best model saved for inference at {target_dir}")

start_epoch = 0
best_val_loss = float('inf')

# If a checkpoint exists, load it; otherwise keep the freshly initialised model and optimizer state
if os.path.exists(os.path.join(checkpoint_path, 'checkpoint.pth')):
    print("Loading checkpoint...")
    model, optimizer, scheduler, last_saved_epoch, best_val_loss = load_checkpoint(model, optimizer, scheduler, tokenizer, checkpoint_path)
    # The checkpoint records the zero-based index of the epoch that had just
    # finished (training and validation) when it was written, so training
    # continues with the following epoch instead of repeating that one.
    start_epoch = last_saved_epoch + 1
    print(f"Checkpoint loaded. Resuming training from epoch {start_epoch + 1}")
else:
    print("No checkpoint found. Starting training from scratch.")

def train_and_save_best_model(model, tokenizer, train_dataloader, val_dataloader, optimizer, scheduler, epochs, device, start_epoch=0, best_val_loss=float('inf')):
    """Fine-tune ``model``, validate after every epoch and keep the best weights.

    Relies on the module-level ``GRADIENT_ACCUMULATION_STEPS``,
    ``save_checkpoint`` and ``save_best_model_for_inference``.

    Args:
        model: language model called as ``model(input_ids, attention_mask=..., labels=...)``
            and returning an object with a ``.loss`` attribute.
        tokenizer: tokenizer exported together with the best model.
        train_dataloader: yields dicts with ``input_ids`` and ``attention_mask`` tensors.
        val_dataloader: same structure, used for validation.
        optimizer: optimizer stepped once per accumulation window.
        scheduler: learning-rate scheduler stepped together with the optimizer.
        epochs: total number of epochs (exclusive upper bound of the epoch index).
        device: device the batches are moved to.
        start_epoch: zero-based index of the first epoch to run.
        best_val_loss: best validation loss seen so far; a lower epoch average
            triggers a checkpoint and a best-model export.

    Returns:
        None. Per-epoch losses are printed and plotted.
    """
    training_stats = []
    num_train_batches = len(train_dataloader)

    for epoch_i in range(start_epoch, epochs):
        print(f'\n======== Epoch {epoch_i + 1} / {epochs} ========')
        total_train_loss = 0
        model.train()
        # Start the epoch with clean gradients. Inside the loop they are only
        # cleared after an optimizer step; clearing them on every micro-batch
        # would throw away everything accumulated so far.
        model.zero_grad()

        progress_bar = tqdm(train_dataloader, desc="Training", leave=False)

        for step, batch in enumerate(progress_bar):
            b_input_ids, b_attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)

            # Positions with attention_mask == 0 are padding; label -100 keeps
            # them out of the language-modelling loss.
            b_labels = b_input_ids.clone()
            b_labels[b_attention_mask == 0] = -100

            loss = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels).loss
            # Report the unscaled loss; only the gradient is scaled for accumulation.
            batch_loss = loss.item()
            total_train_loss += batch_loss
            (loss / GRADIENT_ACCUMULATION_STEPS).backward()

            # Update at the end of every accumulation window, and on the last
            # batch so a trailing partial window is not carried over or lost.
            window_complete = (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0
            last_batch = (step + 1) == num_train_batches
            if window_complete or last_batch:
                # Clip the accumulated gradient once, right before it is applied.
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                scheduler.step()
                model.zero_grad()

            progress_bar.set_postfix({'Loss': f"{batch_loss:.4f}"}, refresh=True)

        avg_train_loss = total_train_loss / num_train_batches

        print("\nRunning Validation...")
        model.eval()
        total_eval_loss = 0
        progress_bar = tqdm(val_dataloader, desc="Validating", leave=False)

        for step, batch in enumerate(progress_bar):
            b_input_ids, b_attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
            # Same padding mask as in training so both losses are comparable.
            b_labels = b_input_ids.clone()
            b_labels[b_attention_mask == 0] = -100
            with torch.no_grad():
                outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
                loss = outputs.loss
                total_eval_loss += loss.item()

            progress_bar.set_postfix({'Loss': f"{loss.item():.4f}"}, refresh=True)

        avg_val_loss = total_eval_loss / len(val_dataloader)
        print(f"\n  Validation Loss: {avg_val_loss:.2f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            save_checkpoint(model, optimizer, scheduler, epoch_i, best_val_loss)
            # Two-argument call: the export helper writes to its default directory.
            save_best_model_for_inference(model, tokenizer)
            print("New best model saved!")

        training_stats.append(
            {
                'Epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Validation Loss': avg_val_loss,
            }
        )

    print("\nTraining complete!")

    if not training_stats:
        # start_epoch >= epochs: nothing was run, so there is nothing to tabulate or plot.
        print("No epochs were run; nothing to report.")
        return

    df_stats = pd.DataFrame(data=training_stats)
    df_stats = df_stats.set_index('Epoch')
    print(df_stats)

    sns.set(style='whitegrid', palette='deep', font_scale=1.1, rc={"figure.figsize": [8, 6]})
    plt.plot(df_stats['Training Loss'], 'b-o', label='Training')
    plt.plot(df_stats['Validation Loss'], 'g-o', label='Validation')
    plt.title('Training & Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.xticks(list(range(1, epochs+1)))
    plt.show()

# Launch fine-tuning, continuing from the epoch and best loss determined above
train_and_save_best_model(
    model,
    tokenizer,
    train_dataloader,
    val_dataloader,
    optimizer,
    scheduler,
    epochs=EPOCHS,
    device=DEVICE,
    start_epoch=start_epoch,
    best_val_loss=best_val_loss,
)

No checkpoint found. Starting training from scratch.



Training:   0%|          | 0/251004 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# Directory the best model is exported to during training ("best_model_for_inference"
# under the ChefAI folder); loading from any other folder name would not find it.
model_path = "/content/drive/My Drive/ChefAI/best_model_for_inference"

# Load trained model and tokenizer from the saved model path
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Resize token embeddings to the size of the loaded tokenizer
model.resize_token_embeddings(len(tokenizer))

# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_recipe(ingredients, model, tokenizer, device, max_length=512, num_beams=5, no_repeat_ngram_size=2, num_return_sequences=1):
    """Generate recipe text for a comma-separated ingredient string.

    The prompt is ``<|startofrecipe|> <|startofingre|>{ingredients}<|startofinstruc|>``
    and is left open after the instruction tag so the model writes the
    instructions itself. Appending ``<|endofrecipe|>`` to the prompt would ask
    the model to continue after the end-of-recipe marker.

    Args:
        ingredients: ingredient list as text, e.g. ``"egg, salt, oil"``. It is
            stripped and lower-cased, matching the lower-cased training text.
        model: object exposing ``generate``.
        tokenizer: object exposing ``encode``, ``decode``, ``pad_token_id``
            and ``eos_token_id``.
        device: device the encoded prompt is moved to.
        max_length: forwarded to ``generate`` as ``max_length``.
        num_beams: beam width forwarded to ``generate``.
        no_repeat_ngram_size: forwarded to ``generate``.
        num_return_sequences: number of sequences requested from ``generate``.

    Returns:
        list[str]: the decoded sequences with special tokens removed.
    """
    # Prepare the input text with special tokens (no end tag: see docstring)
    input_text = '<|startofrecipe|> <|startofingre|>' + ingredients.strip().lower() + '<|startofinstruc|>'
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

    # Generate recipe using the model
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_return_sequences=num_return_sequences,
        pad_token_id=tokenizer.pad_token_id,
        # Stop at the tokenizer's end-of-sequence token (the recipe end tag)
        # rather than relying on whatever id the model config carries.
        eos_token_id=tokenizer.eos_token_id,
        early_stopping=True
    )

    # Convert the generated tokens to text
    generated_recipes = [tokenizer.decode(sequence, skip_special_tokens=True) for sequence in output]

    return generated_recipes

# Smoke test: generate from a short ingredient list and show the first result
ingredients = "egg, salt, oil, onion"
generated_recipes = generate_recipe(ingredients, model, tokenizer, device)
first_recipe = generated_recipes[0]
print(first_recipe)