In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
Col

In [3]:
import os
import json
import torch
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import random_split, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AdamW, get_linear_schedule_with_warmup
import random
import numpy as np
from tqdm.auto import tqdm
import time
import datetime
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Constants
EXTRACTION_DIRECTORY = "/content/drive/My Drive/ChefAI/dataset"
CSV_FILE_NAME = "full_dataset.csv"
BATCH_SIZE = 8
EPOCHS = 2
MAX_LENGTH = 512
LEARNING_RATE = 2e-4
GRADIENT_ACCUMULATION_STEPS = 1
EPSILON = 1e-8
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Loading the data
csv_file_path = os.path.join(EXTRACTION_DIRECTORY, CSV_FILE_NAME)
recipes_data = pd.read_csv(csv_file_path)

sampled_recipes_data = recipes_data.sample(n=250000, random_state=83)

sample_recipe = sampled_recipes_data.iloc[1000]
print(sample_recipe)

Unnamed: 0                                                717506
title                                           Crazy Corn Bread
ingredients    ["2 boxes Jiffy cornbread mix", "2 eggs", "1 (...
directions     ["Mix all together. Bake at 350\u00b0 for 45 m...
link             www.cookbooks.com/Recipe-Details.aspx?id=118444
source                                                  Gathered
NER              ["cornbread mix", "eggs", "corn", "sour cream"]
Name: 717506, dtype: object


In [6]:
import re

def load_preprocess_raw_data(recipes_data):
    raw_list = []
    for _, recipe in recipes_data.iterrows():
        try:
            title = recipe['title'].strip().lower()
            ingredients = ", ".join(eval(recipe['ingredients'])).strip().lower()
            # Convert the string representation of directions to a list and then join into a single string.
            directions = " ".join(eval(recipe['directions'])).strip().lower()
            # Split the directions into sentences
            directions_sentences = re.split(r'\. +', directions)
            # Enumerate the sentences to create a numbered list, with the first item on a new line
            numbered_directions = "\n".join(f"{i+1}. {sentence.strip()}"
                                            for i, sentence in enumerate(directions_sentences)
                                            if sentence)  # Ensure no empty strings are added
            recipe_instance = '<|startofrecipe|>' + title + '<|startofingre|>' + ingredients + '<|startofinstruc|>' + numbered_directions + '<|endofrecipe|>'
            raw_list.append(recipe_instance)
        except Exception as e:
            print(f"An exception occurred for a row: {e}")
            continue
    return raw_list

# Preprocess the sampled recipes
preprocessed_recipes = load_preprocess_raw_data(sampled_recipes_data)

# Print the number of preprocessed recipes
print(f"Number of preprocessed recipes: {len(preprocessed_recipes)}")



Number of preprocessed recipes: 250000


In [7]:
# Print 5 random recipes
for _ in range(5):
    print(random.choice(preprocessed_recipes))
    print('-' * 100)

<|startofrecipe|>breakfast goulash<|startofingre|>1 small onion, chopped, 1 lb ground sausage, 12 eggs, 1 cup shredded cheddar cheese<|startofinstruc|>1. cook sausage and onions on medium heat until sausage is no longer pink
2. in a medium size bowl beat eggs (add two tablespoons of milk if desired) then add to sausage
3. cook and scramble eggs until almost completely set
4. add cheese and stir until eggs are done and cheese is melted.<|endofrecipe|>
----------------------------------------------------------------------------------------------------
<|startofrecipe|>escabeche, sweet and sour<|startofingre|>500 grams any salt water fish, regular in size for each person, 1 bellpepper, red, 1 carrot, to taste salt<|startofinstruc|>1. wash the fish, clean and scale
2. sprinkle salt and set aside
3. prepare the spices: clean and cut
4. set aside
5. fry the fish and set aside
6. drain the used cooking oil from frying
7. from the same pan used in frying saute onions, garlic, ginger, carrots a

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the tokenizer and distillgpt2 model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Add special tokens
special_tokens_dict = {
    'bos_token': '<|startofrecipe|>',
    'eos_token': '<|endofrecipe|>',
    'pad_token': '<|pad|>',
    'additional_special_tokens': ['<|startofingre|>', '<|startofinstruc|>']
}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

print('Number of added special tokens:', num_added_toks)

In [8]:
class RecipeDataset(Dataset):
    def __init__(self, recipes, tokenizer, max_length):
        self.recipes = recipes
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.recipes)

    def __getitem__(self, idx):
        recipe = self.recipes[idx]
        inputs = self.tokenizer.encode_plus(
            recipe,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze()
        }

In [9]:
#When retraining

base_save_path = "/content/drive/My Drive/ChefAI"
checkpoint_path = os.path.join(base_save_path, "model_checkpoints")
best_model_path = os.path.join(base_save_path, "best_model_for_inference")

tokenizer = GPT2Tokenizer.from_pretrained(best_model_path)
model = GPT2LMHeadModel.from_pretrained(best_model_path)
model = model.to(DEVICE)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
dataset = RecipeDataset(preprocessed_recipes, tokenizer, max_length=MAX_LENGTH)

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=BATCH_SIZE)

In [11]:
TOTAL_TRAINING_STEPS = int(len(train_dataloader) / GRADIENT_ACCUMULATION_STEPS * EPOCHS)
WARMUP_STEPS = int(0.1 * TOTAL_TRAINING_STEPS)

# Initialize optimizer and scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, eps=EPSILON)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=TOTAL_TRAINING_STEPS)

In [14]:
# Initialize GPT2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))
model = model.to(DEVICE)

TOTAL_TRAINING_STEPS = int(len(train_dataloader) / GRADIENT_ACCUMULATION_STEPS * EPOCHS)

# Initialize optimizer and scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, eps=EPSILON)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=TOTAL_TRAINING_STEPS)


In [None]:
# Function to load the model, optimizer, and scheduler states
def load_checkpoint(model, optimizer, scheduler, tokenizer, checkpoint_path, best_model_path ):
    checkpoint = torch.load(os.path.join(checkpoint_path, 'checkpoint.pth'), map_location=DEVICE)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    tokenizer.from_pretrained(best_model_path)  # Assuming tokenizer config is saved here
    return model, optimizer, scheduler, checkpoint['epoch'], checkpoint['best_val_loss']

# Function to save the model, optimizer, and scheduler states
def save_checkpoint(model, optimizer, scheduler, epoch, best_val_loss):
    os.makedirs(checkpoint_path, exist_ok=True)
    checkpoint = {
        'epoch': epoch,
        'best_val_loss': best_val_loss,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
    }
    torch.save(checkpoint, os.path.join(checkpoint_path, 'checkpoint.pth'))
    print(f"Checkpoint saved at epoch {epoch+1} with validation loss {best_val_loss}")

# Function to save the best model for inference
def save_best_model_for_inference(model, tokenizer, best_model_path):
    os.makedirs(best_model_path, exist_ok=True)
    model.save_pretrained(best_model_path)
    tokenizer.save_pretrained(best_model_path)
    print(f"Best model saved for inference at {best_model_path}")

start_epoch = 0
best_val_loss = float('inf')

# If a checkpoint exists, load it; otherwise, initialize the model and tokenizer from scratch
if os.path.exists(os.path.join(checkpoint_path, 'checkpoint.pth')):
    print("Loading checkpoint...")
    model, optimizer, scheduler, start_epoch, best_val_loss = load_checkpoint(model, optimizer, scheduler, tokenizer, checkpoint_path, best_model_path)
    model = model.to(DEVICE)
    print(f"Checkpoint loaded. Resuming training from epoch {start_epoch + 1}")
else:
    print("No checkpoint found. Starting training from scratch.")

def train_and_save_best_model(model, tokenizer, train_dataloader, val_dataloader, optimizer, scheduler, epochs, device, start_epoch=0, best_val_loss=float('inf')):
    training_stats = []

    for epoch_i in range(start_epoch, epochs):
        print(f'\n======== Epoch {epoch_i + 1} / {epochs} ========')
        total_train_loss = 0
        model.train()

        progress_bar = tqdm(train_dataloader, desc="Training", leave=False)

        for step, batch in enumerate(progress_bar):
            b_input_ids, b_attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)

            model.zero_grad()

            loss = model(b_input_ids, attention_mask=b_attention_mask, labels=b_input_ids).loss
            loss = loss / GRADIENT_ACCUMULATION_STEPS
            total_train_loss += loss.item()
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
                optimizer.step()
                scheduler.step()

            progress_bar.set_postfix({'Loss': f"{loss.item():.4f}"}, refresh=True)

        avg_train_loss = total_train_loss / len(train_dataloader)

        print("\nRunning Validation...")
        model.eval()
        total_eval_loss = 0
        progress_bar = tqdm(val_dataloader, desc="Validating", leave=False)

        for step, batch in enumerate(progress_bar):
            b_input_ids, b_attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
            with torch.no_grad():
                outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_input_ids)
                loss = outputs.loss
                total_eval_loss += loss.item()

            progress_bar.set_postfix({'Loss': f"{loss.item():.4f}"}, refresh=True)

        avg_val_loss = total_eval_loss / len(val_dataloader)
        print(f"\n  Validation Loss: {avg_val_loss:.2f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            save_checkpoint(model, optimizer, scheduler, epoch_i, best_val_loss)
            save_best_model_for_inference(model, tokenizer, best_model_path)  # Save best model for inference
            print("New best model saved!")

        training_stats.append(
            {
                'Epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Validation Loss': avg_val_loss,
            }
        )

    print("\nTraining complete!")

    df_stats = pd.DataFrame(data=training_stats)
    df_stats = df_stats.set_index('Epoch')
    print(df_stats)

    sns.set(style='whitegrid', palette='deep', font_scale=1.1, rc={"figure.figsize": [8, 6]})
    plt.plot(df_stats['Training Loss'], 'b-o', label='Training')
    plt.plot(df_stats['Validation Loss'], 'g-o', label='Validation')
    plt.title('Training & Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.xticks(list(range(1, epochs+1)))
    plt.show()

train_and_save_best_model(model, tokenizer, train_dataloader, val_dataloader, optimizer, scheduler, EPOCHS, DEVICE, start_epoch, best_val_loss)

Loading checkpoint...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Checkpoint loaded. Resuming training from epoch 1



Training:   0%|          | 0/28125 [00:00<?, ?it/s]

In [5]:
model_path = "/content/drive/My Drive/ChefAI/best_model_for_inference"

# Load trained model and tokenizer from the saved model path
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Resize token embeddings
#model.resize_token_embeddings(len(tokenizer))

# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_recipe(ingredients, model, tokenizer, device, max_length=90, num_beams=5, no_repeat_ngram_size=3, num_return_sequences=1):
    # Prepare the input text with special tokens
    input_text = '<|startofrecipe|>''<|startofingre|>' + ingredients + '<|startofinstruc|>''<|endofrecipe|>'
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

    # Generate recipe using the model
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        temperature=0.8,
        top_k=50,
        top_p=0.95,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_return_sequences=num_return_sequences,
        pad_token_id=tokenizer.pad_token_id,
        early_stopping=True
    )

    # Convert the generated tokens to text
    generated_recipes = [tokenizer.decode(output[i], skip_special_tokens=True) for i in range(num_return_sequences)]

    return generated_recipes

# Test the function with an example
ingredients = "oil, egg, onion"
generated_recipes = generate_recipe(ingredients, model, tokenizer, device)
print(generated_recipes[0])  # Print the first generated recipe

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


oil, egg, onion1. heat oil in frying pan
2. add egg and onion
3. fry until golden brown
4. drain on paper towels
5. serve hot or cold
6. makes 4 to 6 servings.1 1/2 cups flour, 1/4 teaspoon salt, 1 teaspoon baking powder, 2 teaspoons baking soda, 3/4 cup sugar, 1 1/
