**Installing & Importing Required Libraries**

In [1]:
!pip install transformers
!pip install transformers[torch]

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1


In [51]:
import re
import json
import h5py
import tqdm
import numpy as np
import pandas as pd
import joblib as jb
from transformers import GPT2Tokenizer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from transformers import BertForSequenceClassification
import torch
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
import pickle

**Mounting Google Drive for importing the Data Files which will be used in the Tokenization**

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Importing Updated Recipe json File which contains Recipe Data**

In [4]:
df_new = jb.load('/content/drive/MyDrive/BTP: Novel Recipe Generation (Constraint Optimization) 2023/Data/Preprocessed/data_v1.pickle')

**Converting Recipe json File into a DataFrame**

In [5]:
df = pd.DataFrame(df_new)

In [6]:
df.head()

Unnamed: 0,ID,title,ingredients,ingredient_phrase,continent,region,sub_region,instructions
0,2610,Egyptian Lentil Soup,"[carrot, sea salt, water, red lentil, rom toma...","[3 cups water, 1 cup red lentils, 1 roma tomat...",African,Middle Eastern,Egyptian,"[Place 3 cups water, lentils, tomato, carrot, ..."
1,2611,Egyptian Green Beans with Carrots,"[tomato paste, bay leaf, salt black pepper, ve...","[1 tablespoon vegetable oil, 1 large onion , d...",African,Middle Eastern,Egyptian,"[Heat oil in a pot over medium heat., Cook and..."
2,2612,Egyptian Bamia,"[onion, water, salt black pepper, okra, olive ...","[1/4 cup olive oil, 1 large onion , finely cho...",African,Middle Eastern,Egyptian,[Heat olive oil in a large saucepan over mediu...
3,2613,Magpie's Easy Falafel Cakes,"[cream, sauce, cumin, olive oil, cornmeal, sal...","[1/2 small onion , minced, 1 1/2 teaspoons oli...",African,Middle Eastern,Egyptian,[Cook the onions in 1 1/2 teaspoons of olive o...
4,2614,Dukkah,"[sea salt, cumin seed, hazelnut, coriander see...","[2/3 cup hazelnuts, 1/2 cup sesame seeds, 2 ta...",African,Middle Eastern,Egyptian,[Preheat the oven to 350 degrees f. Place the ...


**Formatting the Instructions of the Recipe by Performing operations like removing '\t' from the beginning of the instructions, inserting ';' at the end of each instruction, etc**

In [7]:
list_of_instrns = []
for row in range(0,len(df)):
    instr = df.iloc[row]['instructions']

    strg = ""
    length = len(instr) - 1
    count = 0
    for instruction in instr:
        processed_instr = []
        for j in range(0,len(instruction)):
            if(instruction[j]=='|' or instruction[j]=='\t'):
                continue
            elif(instruction[j]==' '):
                if(instruction[j-1]!='|'):
                   strg = strg + instruction[j]
            elif(instruction[j] == '.') and (j!=len(instruction)-1) and (instruction[j-1].isdigit()==False):
                strg = strg + ' '
                strg = strg + instruction[j]
            elif(instruction[j]>='a' and instruction[j]<='z') or (instruction[j]>='A' and instruction[j]<='Z') :
                 strg =  strg + instruction[j].lower()
            elif(instruction[j] == ','):
                  strg =  strg + ' '
                  strg =  strg + ','
            elif(instruction[j].isdigit()):
                if(instruction[j+1] == '.') or (instruction[j+2] == '.'):
                    continue
                else:
                    strg = strg + instruction[j]

        if(count!=length):
            strg = strg + ' '
            strg = strg + ';'
            strg = strg + ' '


        count = count + 1

    processed_instr.append(strg)
    list_of_instrns.append(processed_instr)

**Deleting the current "instructions" column from the DataFrame and inserting the modified Instructions by Creating the new "instructions" column**

In [8]:
df.drop('instructions', inplace=True, axis=1)

In [9]:
df['instructions'] = list_of_instrns

In [10]:
df.head()

Unnamed: 0,ID,title,ingredients,ingredient_phrase,continent,region,sub_region,instructions
0,2610,Egyptian Lentil Soup,"[carrot, sea salt, water, red lentil, rom toma...","[3 cups water, 1 cup red lentils, 1 roma tomat...",African,Middle Eastern,Egyptian,"[place 3 cups water , lentils , tomato , carro..."
1,2611,Egyptian Green Beans with Carrots,"[tomato paste, bay leaf, salt black pepper, ve...","[1 tablespoon vegetable oil, 1 large onion , d...",African,Middle Eastern,Egyptian,[heat oil in a pot over medium heat ; cook and...
2,2612,Egyptian Bamia,"[onion, water, salt black pepper, okra, olive ...","[1/4 cup olive oil, 1 large onion , finely cho...",African,Middle Eastern,Egyptian,[heat olive oil in a large saucepan over mediu...
3,2613,Magpie's Easy Falafel Cakes,"[cream, sauce, cumin, olive oil, cornmeal, sal...","[1/2 small onion , minced, 1 1/2 teaspoons oli...",African,Middle Eastern,Egyptian,[cook the onions in 1 12 teaspoons of olive oi...
4,2614,Dukkah,"[sea salt, cumin seed, hazelnut, coriander see...","[2/3 cup hazelnuts, 1/2 cup sesame seeds, 2 ta...",African,Middle Eastern,Egyptian,[preheat the oven to 350 degrees f . place the...


In [11]:
df_top5 = df[df['region'].isin(['Italian', 'Mexican', 'South American', 'Canadian', 'Indian Subcontinent'])]

In [12]:
# Counting occurrences of each region
region_counts = df_top5['region'].value_counts()

region_counts

Italian                16574
Mexican                14447
South American          7171
Canadian                6694
Indian Subcontinent     6463
Name: region, dtype: int64

In [13]:
# Number of recipes to sample from each region
sample_size = 6463

# Sample the data
df_balanced = pd.DataFrame()

for region in ['Italian', 'Mexican', 'South American', 'Canadian', 'Indian Subcontinent']:
    df_subset = df[df['region'] == region].sample(n=sample_size, random_state=1)
    df_balanced = pd.concat([df_balanced, df_subset])

# Now df_balanced contains an equal number of recipes from each region
df_balanced

Unnamed: 0,ID,title,ingredients,ingredient_phrase,continent,region,sub_region,instructions
101426,132525,Pepperoni Pizza Twist,"[olive, flour, spinach, garlic clove, egg whit...","[1 package pepperoni , diced, 1 can ripe olive...",European,Italian,Italian,[preheat oven to 375f . mix first seven items ...
92487,123581,Balsamic Fig Vinegar,"[sugar, fig, balsamic vinegar]","[1 -2 cup fresh fig, 1 quart balsamic vinegar,...",European,Italian,Italian,"[bring figs , vinegar , and sugar to a simmer ..."
7874,10491,Sweet and Sour Sicilian Tuna,"[red wine vinegar, sugar, olive oil, tuna stea...","[3 tablespoons olive oil, 1 large onion , cut ...",European,Italian,Italian,[heat olive oil in a heavy bottomed skillet ov...
94837,125932,Darcy's Pasta E Fagioli,"[pancetta, water, garlic clove, cannellini bea...","[1 tablespoon olive oil, 3 ounces pancetta , f...",European,Italian,Italian,[heat the oil in a large pot over medium high ...
94816,125911,Penne a La Vodka (With a Little Kick!),"[garlic clove, red pepper flake, vodka, tomato...","[16 ounces penne pasta, 5 tablespoons butter, ...",European,Italian,Italian,"[cook pasta until al dente , drain and reserve..."
...,...,...,...,...,...,...,...,...
32805,63852,Curried Lentils and Rice,"[spinach, mint, ginger, basmati rice, curry po...","[1/2 cup plain yogurt, 1/4 cup chopped of fres...",Asian,Indian Subcontinent,Indian,[mix mint and yogurt and refrigerate . wash ri...
37092,68140,Indian Curry Soup,"[ginger paste, water, turmeric, gmstoovar dal,...","[50 gmstoovar dal, 1 teaspoon ginger paste, 1 ...",Asian,Indian Subcontinent,Indian,[pressure cook the dal well . melt butter in a...
35880,66928,Pudina Ki Chutney,"[mint leaf, sugar, salt, green chilies, onion,...","[150 g fresh mint leaves , discard the stalks,...",Asian,Indian Subcontinent,Indian,[wash mint leaves and combine all the ingredie...
1612,4222,Egg Kulambu,"[water, coriander, fennel seed, turmeric, tama...","[8 eggs, 1/2 teaspoon fennel seeds, 1/2 teaspo...",Asian,Indian Subcontinent,Indian,[place the eggs into a saucepan in a single la...


In [14]:
# Specify your desired file path
file_path = '/content/drive/MyDrive/BTP: Novel Recipe Generation (Constraint Optimization) 2023/Data/Preprocessed/df_balanced.csv'

# Save the DataFrame
df_balanced.to_csv(file_path, index=False)

**Dividing whole Data into Train and Test part with the Ratio of Train to Test is 0.96 : 0.04**

In [15]:
train,test = train_test_split(df_balanced, train_size=0.96, random_state= 2)

**Displaying the Size of Train and Test Part and Resetting to the Default Index of these portions**

In [16]:
print("Train Portion size is: ",train.shape)
print("Test Portion size is: ",test.shape)

Train Portion size is:  (31022, 8)
Test Portion size is:  (1293, 8)


In [17]:
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

**Defining the function that will be used for Converting the Dataset into Text Data Format so that the the Data can be Tokenize**

In [18]:
def df_to_plaintext_file(input_df, output_file):
    print("Writing to", output_file)
    with open(output_file, 'w', encoding="utf-8") as f:
        for index, row in input_df.iterrows():
            title = row.title
            instructions = row.instructions[0].split('.')[:-1]
            ingredients = row.ingredient_phrase
            keyword = row.ingredients

            if index%40000==0:
                print(index)
                print("ingreds --->",ingredients)
                print("keywords --->",keyword)

            res = "<RECIPE_START> <INPUT_START> " + " <NEXT_INPUT> ".join(keyword) + " <INPUT_END> <TITLE_START> " + \
              title + "<TITLE_END> <INGR_START> " + \
              " <NEXT_INGR> ".join(ingredients) + " <INGR_END> <INSTR_START> " + " <NEXT_INSTR> ".join(instructions) + " <INSTR_END> <RECIPE_END>"
            f.write("{}\n".format(res))

In [19]:
def bert_preprocess(row):
    combined_text = row['title'] + ' ' + ' '.join(row['ingredients']) + ' ' + ' '.join(row['instructions'])
    return combined_text

**Saving the Processed Train and Test Files to the Custom Path**

In [20]:
df_to_plaintext_file(train, '/content/drive/MyDrive/BTP: Novel Recipe Generation (Constraint Optimization) 2023/Data/Preprocessed/train_temp_top5Balanced.txt')
df_to_plaintext_file(test, '/content/drive/MyDrive/BTP: Novel Recipe Generation (Constraint Optimization) 2023/Data/Preprocessed/test_temp_top5Balanced.txt')

Writing to /content/drive/MyDrive/BTP: Novel Recipe Generation (Constraint Optimization) 2023/Data/Preprocessed/train_temp_top5Balanced.txt
0
ingreds ---> ['500 g lemons', '4 tablespoons salt', '150 g sugar', '1 pinch hing', '1 tablespoon red chili powder', '1 tablespoon garam masala']
keywords ---> ['garam masala', 'red chili powder', 'lemon', 'sugar', 'salt', 'hing']
Writing to /content/drive/MyDrive/BTP: Novel Recipe Generation (Constraint Optimization) 2023/Data/Preprocessed/test_temp_top5Balanced.txt
0
ingreds ---> ['3 guajillo chilies , stemmed and seeded', '1 ancho chili , stemmed and seeded', '1 medium white onion , peeled and halved', '4 garlic cloves , peeled', '4 medium tomatillos , husked and rinsed', '1 tomatoes , halved ( either green or red tomatoes will do )', '1/4 cup extra virgin olive oil', '1 teaspoon whole black peppercorn', '5 whole cloves', '2 tablespoons lard', '2 tablespoons masa harina or 2 tablespoons cornmeal']
keywords ---> ['guajillo chilies', 'extra virgi

**Initializing the GPT2 Tokenizer and Adding special Tokens defined by us to Define the different parts of the Recipe like its title, constituting ingredeints, etc**

In [21]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", do_lower_case=False)
# tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-125M')
special_tokens = {
    "additional_special_tokens": ['<RECIPE_START>',
                                  '<INPUT_START>',
                                  '<NEXT_INPUT>',
                                  '<INPUT_END>',
                                  '<INGR_START>',
                                  '<NEXT_INGR>',
                                  '<INGR_END>',
                                  '<INSTR_START>',
                                  '<NEXT_INSTR>',
                                  '<INSTR_END>',
                                  '<TITLE_START>'
                                  ,'<TITLE_END>'
                                  ,'<RECIPE_END>'
    ]
}

tokenizer.add_special_tokens(special_tokens)

end_token_id = tokenizer.convert_tokens_to_ids(["<RECIPE_END>"])[0]

hf = h5py.File("/content/drive/MyDrive/BTP: Novel Recipe Generation (Constraint Optimization) 2023/Data/Preprocessed/data_temp.h5", "w")
for filename in ["/content/drive/MyDrive/BTP: Novel Recipe Generation (Constraint Optimization) 2023/Data/Preprocessed/test_temp_top5Balanced", "/content/drive/MyDrive/BTP: Novel Recipe Generation (Constraint Optimization) 2023/Data/Preprocessed/train_temp_top5Balanced"]:
    out_np = []
    data = open(filename+".txt", "r")
    num = 0
    rows = 0
    last=[]
    for line in data:
        num+=1
        if num%10000 == 0:
            print("Read "+str(num)+" Written: "+str(rows))

        text_tokens = tokenizer.tokenize(line)
        # the tokens supported by gpt2 are 1024 for gpt2 medium. so if the recipe is exceeds this length it wont fit in the model and will generate errors.
        if len(text_tokens) > 1024:
            continue

        text_tokens_ids = tokenizer.convert_tokens_to_ids(text_tokens)

        if (len(last) + len(text_tokens_ids)) <= 1024:
            last+=text_tokens_ids
        else:
            while len(last) < 1024:
                last.append(end_token_id)
            out_np.append(last)
            last=text_tokens_ids
            rows+=1
    out_mat = np.matrix(out_np)
    print(out_mat.shape)
    hf.create_dataset(filename, data=out_mat)
hf.close()

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

(447, 1024)
Read 10000 Written: 3523
Read 20000 Written: 7074
Read 30000 Written: 10602
(10966, 1024)


**Displaying the Final Length of Tokenizer**

In [22]:
len(tokenizer)

50270

### BERT Embeddings

In [23]:
df_balanced['combined_text'] = df_balanced.apply(bert_preprocess, axis=1)

In [24]:
label_encoder = LabelEncoder()
df_balanced['region_labels'] = label_encoder.fit_transform(df_balanced['region'])

In [25]:
tokenizerBert = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [26]:
def tokenize_data(text):
    return tokenizerBert.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        return_attention_mask=True,
        truncation=True
    )

df_balanced['tokenized_data'] = df_balanced['combined_text'].apply(tokenize_data)

In [27]:
file_path_csv = '/content/drive/MyDrive/BTP: Novel Recipe Generation (Constraint Optimization) 2023/Data/Preprocessed/df_top5Balanced.csv'
df_balanced.to_csv(file_path_csv, index=False)

In [28]:
# Extracting the relevant data for the dataset
input_ids = df_balanced['tokenized_data'].apply(lambda x: x['input_ids'])
attention_masks = df_balanced['tokenized_data'].apply(lambda x: x['attention_mask'])

# Converting to lists
input_ids = [torch.tensor(ids[0]) for ids in input_ids]
attention_masks = [torch.tensor(mask[0]) for mask in attention_masks]
labels = torch.tensor(df_balanced['region_labels'].values)

In [29]:
class RecipeDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx].unsqueeze(0),  # Add batch dimension
            'attention_mask': self.attention_masks[idx].unsqueeze(0),  # Add batch dimension
            'labels': self.labels[idx].unsqueeze(0)  # Add batch dimension
        }

In [30]:
train_dataset = RecipeDataset(input_ids, attention_masks, labels)

In [31]:
num_labels = len(df_balanced['region'].unique())

In [32]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
!pip install transformers[torch]
!pip install accelerate -U



In [34]:
# Check and use GPU if available
if torch.cuda.is_available():
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")
    model.to(device)
else:
    print("No GPU available, using CPU instead.")
    device = torch.device("cpu")

GPU available: Tesla T4


In [35]:
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/BTP: Novel Recipe Generation (Constraint Optimization) 2023/Models/bert_fine_tuned_top5balanced',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,  # Regularization
    save_strategy="no",  # Disable saving checkpoints
    logging_strategy="no",  # Disable logging
    logging_dir='/content/drive/MyDrive/BTP: Novel Recipe Generation (Constraint Optimization) 2023/Models/bert_fine_tuned_top5balanced_logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)


In [36]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=8080, training_loss=1.6140326471612005, metrics={'train_runtime': 663.8522, 'train_samples_per_second': 97.356, 'train_steps_per_second': 12.171, 'total_flos': 33213526459860.0, 'train_loss': 1.6140326471612005, 'epoch': 2.0})

In [37]:
model.save_pretrained('/content/drive/MyDrive/BTP: Novel Recipe Generation (Constraint Optimization) 2023/Models/bert_fine_tuned_top5_balanced')
torch.save(model, '/content/drive/MyDrive/BTP: Novel Recipe Generation (Constraint Optimization) 2023/Models/bert_fine_tuned/entire_model_top5_balanced.pth')

In [38]:
vector_size = model.config.hidden_size
print(f"The vector size of the model is: {vector_size}")

The vector size of the model is: 768


In [53]:
# Number of recipes per batch
batch_size = 100  # Adjust as needed

# Divide the DataFrame into batches
batches = np.array_split(df_balanced, len(df_balanced) // batch_size)

# Attach the hook to the model's specific layer (e.g., last layer)
handle = model.bert.encoder.layer[-1].output.dense.register_forward_hook(hook_function)

# Process each batch
for batch_idx, batch in enumerate(batches):
    recipe_embeddings = {}

    # Reset the embeddings list for each batch
    embeddings = []

    # Forward pass for each recipe in the batch
    for idx, row in batch.iterrows():
        inputs = tokenizerBert(row['combined_text'], return_tensors='pt', truncation=True, padding='max_length', max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            model(**inputs)

        # Extract the output from the last layer
        last_layer_output = embeddings[-1]
        embedding = last_layer_output.mean(dim=1).cpu().numpy()
        recipe_embeddings[row['ID']] = embedding

    batch_file_path = f'/content/drive/MyDrive/BTP: Novel Recipe Generation (Constraint Optimization) 2023/Data/Preprocessed/embeddings_batch_{batch_idx}.pkl'
    with open(batch_file_path, 'wb') as f:
        pickle.dump(recipe_embeddings, f)

    # Clear the memory
    del recipe_embeddings

# Don't forget to remove the hook after processing all batches
handle.remove()

**Displaying the Final Number of Recipes Downsampled**

In [42]:
t = []
with open('/content/drive/MyDrive/BTP (personal)/Data/Preprocessed/train_temp.txt') as file1:
    for f in file1:
        t.append(f)

In [43]:
print('No of recipes downsampled for prototyping: ',len(t))

No of recipes downsampled for prototyping:  113359
