In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers



In [3]:
import os
import json
import torch
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from torch.utils.data import random_split, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AdamW, get_linear_schedule_with_warmup
import random
import numpy as np
from tqdm.auto import tqdm
import time
import datetime
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Constants
EXTRACTION_DIRECTORY = "/content/drive/My Drive/ChefAI/rawdata"
BATCH_SIZE = 6
EPOCHS = 2
MAX_LENGTH = 512
LEARNING_RATE = 5e-4
WARMUP_STEPS = 1e2
EPSILON = 1e-8
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Loading the data
json_files_paths = [os.path.join(EXTRACTION_DIRECTORY, file_name) for file_name in os.listdir(EXTRACTION_DIRECTORY) if file_name.endswith('.json')]
recipes_data = []
for json_file_path in json_files_paths:
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
        recipes_data.extend(list(data.values()))

# Showing the number of recipes and a sample recipe
num_recipes = len(recipes_data)
sample_recipe = recipes_data[40000] if recipes_data else "No recipes found"

num_recipes, sample_recipe

(125164,
 {'title': 'Toffee Bar Coffee Cake',
  'ingredients': ['2 cups all-purpose flour ADVERTISEMENT',
   '3/4 cup white sugar ADVERTISEMENT',
   '3/4 cup brown sugar ADVERTISEMENT',
   '6 tablespoons butter, softened ADVERTISEMENT',
   '1 cup milk ADVERTISEMENT',
   '2 teaspoons baking powder ADVERTISEMENT',
   '1 teaspoon vanilla extract ADVERTISEMENT',
   '5 (1.4 ounce) bars chocolate covered toffee bars, chopped ADVERTISEMENT',
   '1 egg ADVERTISEMENT',
   '1/2 cup chopped, unsalted dry-roasted peanuts ADVERTISEMENT',
   'ADVERTISEMENT'],
  'instructions': 'Preheat oven to 350 degrees F (175 degrees C). Grease and flour a 9x13 inch pan. Crush toffee bars into small bits and set aside.\nIn a large bowl, combine flour, sugar, brown sugar and butter; mix on low speed with an electric mixer until crumbly. Remove 1/2 cup of crumb mixture and set aside to be used for topping. Add milk, baking powder, vanilla, egg, and 1/2 cup of the crushed toffee bars; beat at low speed until well-mi

In [6]:
def load_preprocess_raw_data(recipes_data):
    '''
    Take a list of recipe data and preprocess it,
    return a list of recipe instances with special tokens

    parameter: list of recipe data

    return: recipe instance list
    '''
    raw_list = []
    for recipe in recipes_data:
        # try/except will filter out recipes that don't have title, ingredients or instructions
        try:
            title = recipe['title'].replace("ADVERTISEMENT", "")
            ingredient_list = recipe['ingredients']
            ingredients = ""
            for ingredient in ingredient_list:
                ingredient = ingredient.replace("ADVERTISEMENT", "")
                if ingredient != "":
                    ingredients += ingredient + ", "
            instructions = recipe['instructions'].replace("ADVERTISEMENT", "")
            recipe_instance = '<|startofrecipe|>' + title + '<|startofingre|>' + ingredients + '<|startofinstruc|>' + instructions + '<|endofrecipe|>'
            if len(recipe_instance) <= 2000:
                raw_list.append(recipe_instance)

        except:
            continue
    return raw_list

# Apply preprocessing to the recipe data
preprocessed_recipes = load_preprocess_raw_data(recipes_data)

In [7]:
# Print 5 random recipes
for _ in range(5):
    print(random.choice(preprocessed_recipes))
    print('-' * 100)

<|startofrecipe|>Scones<|startofingre|>About 11 ounces all-purpose flour, About 5 ounces sugar, 2 eggs, 1 teaspoon baking powder, About 3 ounces butter, Milk, <|startofinstruc|>Preheat oven to 350 degrees F.
Mix all ingredients. Pour milk into the mix little by little until you have a dough, (a little stiff, not dry). Lightly flour your hands and roll the dough into little balls. Bake for about 12 minutes. Serve immediately.
It can be served plain, with butter or jam.<|endofrecipe|>
----------------------------------------------------------------------------------------------------
<|startofrecipe|>Chicken Alfredo with Fettuccini Noodles<|startofingre|>1 pound fettuccini pasta , 1 1/2 cups butter, divided , 1 pound skinless, boneless chicken breast halves - cut into cubes , 2 (16 ounce) containers whole milk ricotta cheese , 1 pint heavy cream , 1 teaspoon salt , 1 cup grated Parmesan cheese , <|startofinstruc|>Bring a large pot of lightly salted water to a boil. Add fettuccini and coo

In [8]:
# Load the tokenizer and distillgpt2 model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add special tokens
special_tokens_dict = {
    'bos_token': '<|startofrecipe|>',
    'eos_token': '<|endofrecipe|>',
    'pad_token': '<|pad|>',
    'additional_special_tokens': ['<|startofingre|>', '<|startofinstruc|>']
}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

print('Number of added special tokens:', num_added_toks)

Number of added special tokens: 5


In [9]:
class RecipeDataset(Dataset):
    def __init__(self, recipes, tokenizer, max_length):
        self.recipes = recipes
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.recipes)

    def __getitem__(self, idx):
        recipe = self.recipes[idx]
        inputs = self.tokenizer.encode_plus(
            recipe,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze()
        }

In [10]:
dataset = RecipeDataset(preprocessed_recipes, tokenizer, max_length=MAX_LENGTH)

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=BATCH_SIZE)

In [11]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))
model = model.to(DEVICE)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=EPSILON)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=len(train_dataloader) * EPOCHS)



In [1]:
def save_model(model, tokenizer, save_path):
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

def train_and_save_best_model(model, tokenizer, train_dataloader, val_dataloader, optimizer, scheduler, epochs, device, save_path):
    best_val_loss = float('inf')
    training_stats = []

    for epoch_i in range(epochs):
        print(f'\n======== Epoch {epoch_i + 1} / {epochs} ========')
        total_train_loss = 0
        model.train()

        progress_bar = tqdm(train_dataloader, desc="Training", leave=False)
        for step, batch in enumerate(progress_bar):
            b_input_ids, b_attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)

            model.zero_grad()
            outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_input_ids)
            loss = outputs.loss
            total_train_loss += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()

            progress_bar.set_postfix({'Loss': f"{loss.item():.4f}"}, refresh=True)

        avg_train_loss = total_train_loss / len(train_dataloader)

        print("\nRunning Validation...")
        model.eval()
        total_eval_loss = 0
        progress_bar = tqdm(val_dataloader, desc="Validating", leave=False)

        for step, batch in enumerate(progress_bar):
            b_input_ids, b_attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
            with torch.no_grad():
                outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_input_ids)
                loss = outputs.loss
                total_eval_loss += loss.item()

            progress_bar.set_postfix({'Loss': f"{loss.item():.4f}"}, refresh=True)

        avg_val_loss = total_eval_loss / len(val_dataloader)
        print(f"\n  Validation Loss: {avg_val_loss:.2f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            save_model(model, tokenizer, save_path)
            print("  New best model saved!")

        training_stats.append(
            {
                'Epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Validation Loss': avg_val_loss,
            }
        )

    print("\nTraining complete!")

    df_stats = pd.DataFrame(data=training_stats)
    df_stats = df_stats.set_index('Epoch')
    print(df_stats)

    sns.set(style='whitegrid', palette='deep', font_scale=1.1, rc={"figure.figsize": [8, 6]})
    plt.plot(df_stats['Training Loss'], 'b-o', label='Training')
    plt.plot(df_stats['Validation Loss'], 'g-o', label='Validation')
    plt.title('Training & Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.xticks(list(range(1, epochs+1)))
    plt.show()

save_path = "/content/drive/My Drive/ChefAI/best_model"
os.makedirs(save_path, exist_ok=True)
train_and_save_best_model(model, tokenizer, train_dataloader, val_dataloader, optimizer, scheduler, EPOCHS, DEVICE, save_path)


NameError: ignored

In [None]:
def generate_recipe(model, tokenizer, ingredients, max_length=100):
    prompt = "<|startofrecipe|><|startofingre|>" + ingredients + "<|startofinstruc|>"
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(DEVICE)

    output_ids = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=5,
        no_repeat_ngram_size=2,
        num_return_sequences=1,
    )

    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output

def test_recipe_generation(ingredients):
    ingredients_text = ", ".join(ingredients)
    recipe = generate_recipe(model, tokenizer, ingredients_text)
    print("Generated Recipe:")
    print(recipe)

# Example usage
test_recipe_generation(["chicken", "onions", "garlic"])
