**Installing & Importing Required Libraries**

In [None]:
# Colab setup: install the Hugging Face `transformers` package, which provides
# the GPT2Tokenizer imported in the next cell.
# NOTE(review): the version is not pinned, so reruns may pick up a newer release.
!pip install transformers

In [None]:
import re
import json
import h5py
import tqdm
import numpy as np
import pandas as pd
import joblib as jb
from transformers import GPT2Tokenizer
from sklearn.model_selection import train_test_split

**Mounting Google Drive to access the data files that will be used for tokenization**

In [None]:
# Colab-only: mount Google Drive at /content/drive so the project files under
# /content/drive/MyDrive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

**Loading the updated recipe data file (a pickle stored on Drive) which contains the recipe data**

In [None]:
# Load the serialized recipe data from Drive with joblib (imported as `jb`).
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load this file from a trusted source.
df_new = jb.load('/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/data_v1.pickle')

**Converting the loaded recipe data into a DataFrame**

In [None]:
# Wrap the loaded object in a DataFrame.
# Later cells read the columns `title`, `instructions`, `ingredient_phrase` and
# `ingredients`, so the loaded object presumably provides them — TODO confirm.
df = pd.DataFrame(df_new)

In [None]:
# Preview the first rows of the freshly built DataFrame as a sanity check.
df.head()

**Formatting the instructions of each recipe by performing operations like removing '\t' and '|' characters, lower-casing the letters, and inserting ';' between consecutive instructions of the same recipe**

In [None]:
def _clean_instruction(instruction):
    """Normalise one raw instruction string, character by character.

    Rules (applied in this order for every character):
      * '|' and tab characters are dropped.
      * A space is kept unless the character right before it is '|'.
      * A '.' that is neither the last character nor preceded by a digit is
        emitted as ' .' (space-separated); any other '.' is dropped.
      * ASCII letters are lower-cased.
      * A ',' is emitted as ' ,'.
      * A digit is dropped when a '.' occurs within the next two characters
        (this removes step numbers such as "1." or "12."), otherwise kept.
      * Every other character is dropped.

    Args:
        instruction: the raw instruction text (a string).

    Returns:
        The normalised instruction as a string.
    """
    pieces = []
    last_idx = len(instruction) - 1
    for j, ch in enumerate(instruction):
        # At j == 0 there is no previous character. The original code read
        # instruction[-1] here, i.e. it silently wrapped around to the LAST
        # character of the string.
        prev = instruction[j - 1] if j > 0 else ''

        if ch == '|' or ch == '\t':
            continue
        elif ch == ' ':
            if prev != '|':
                pieces.append(ch)
        elif ch == '.' and j != last_idx and not prev.isdigit():
            pieces.append(' .')
        elif ('a' <= ch <= 'z') or ('A' <= ch <= 'Z'):
            pieces.append(ch.lower())
        elif ch == ',':
            pieces.append(' ,')
        elif ch.isdigit():
            # Slicing never raises, unlike instruction[j+1] / instruction[j+2],
            # which crashed with IndexError for digits in the last two positions.
            if '.' not in instruction[j + 1:j + 3]:
                pieces.append(ch)
    return ''.join(pieces)


# Build, for every recipe, a one-element list holding all of its cleaned
# instructions joined by ' ; '. The one-element-list shape is what the rest of
# the notebook expects (it reads `row.instructions[0]`).
# Assumes each `instructions` cell is a list of strings — TODO confirm.
list_of_instrns = []
for instr in df['instructions']:
    strg = ' ; '.join(_clean_instruction(instruction) for instruction in instr)
    # A fresh list per row: previously a recipe with no instructions reused
    # (or never defined) `processed_instr`.
    list_of_instrns.append([strg])

**Deleting the current "instructions" column from the DataFrame and inserting the modified Instructions by Creating the new "instructions" column**

In [None]:
# Remove the raw `instructions` column in place so the cleaned version can be
# attached under the same name. Not re-runnable: once the column is gone,
# running this cell again raises KeyError.
df.drop(columns=['instructions'], inplace=True)

In [None]:
# Attach the cleaned instructions as a new `instructions` column; each cell is
# a one-element list containing the recipe's formatted instruction string.
df['instructions'] = list_of_instrns

In [None]:
# Preview the DataFrame again to check the rebuilt `instructions` column.
df.head()

**Dividing the whole data into train and test parts with a train-to-test ratio of 0.96 : 0.04**

In [None]:
# Randomly split the recipes: 96% for training, the remainder for testing.
# random_state=2 fixes the shuffle so the split is reproducible across runs.
train,test = train_test_split(df, train_size=0.96, random_state= 2)

**Displaying the Size of Train and Test Part and Resetting to the Default Index of these portions**

In [None]:
# Report the (rows, columns) shape of each split.
print("Train Portion size is: ",train.shape)
print("Test Portion size is: ",test.shape)

In [None]:
# Replace the row labels inherited from `df` with a fresh 0..n-1 index (the old
# labels are discarded, not kept as a column). df_to_plaintext_file uses the
# index for its `index % 40000` progress print.
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

**Defining the function that will be used for converting the dataset into plain-text format so that the data can be tokenized**

In [None]:
def df_to_plaintext_file(input_df, output_file):
    """Serialise every recipe in ``input_df`` as one tagged line of text.

    Each row is written to ``output_file`` (UTF-8, overwritten) as a single
    line of the form::

        <RECIPE_START> <INPUT_START> kw <NEXT_INPUT> kw <INPUT_END>
        <TITLE_START> title<TITLE_END> <INGR_START> ingr <NEXT_INGR> ingr
        <INGR_END> <INSTR_START> step <NEXT_INSTR> step <INSTR_END> <RECIPE_END>

    Args:
        input_df: DataFrame with the columns ``title`` (string),
            ``instructions`` (sequence whose first element is the instruction
            text), ``ingredient_phrase`` (iterable of strings) and
            ``ingredients`` (iterable of strings, used as the input keywords).
        output_file: path of the text file to create.

    Notes:
        * The instruction text is split on '.', and the piece after the final
          '.' is discarded.
        * There is no space between the title and ``<TITLE_END>``.
        * Rows whose index is a multiple of 40000 are echoed to stdout as a
          progress indicator.
    """
    print("Writing to", output_file)
    with open(output_file, 'w', encoding="utf-8") as out:
        for idx, record in input_df.iterrows():
            recipe_title = record.title
            # Everything after the last '.' is dropped by the [:-1] slice.
            steps = record.instructions[0].split('.')[:-1]
            ingredient_lines = record.ingredient_phrase
            keywords = record.ingredients

            if idx % 40000 == 0:
                print(idx)
                print("ingreds --->", ingredient_lines)
                print("keywords --->", keywords)

            segments = [
                "<RECIPE_START> <INPUT_START> ",
                " <NEXT_INPUT> ".join(keywords),
                " <INPUT_END> <TITLE_START> ",
                recipe_title,
                "<TITLE_END> <INGR_START> ",
                " <NEXT_INGR> ".join(ingredient_lines),
                " <INGR_END> <INSTR_START> ",
                " <NEXT_INSTR> ".join(steps),
                " <INSTR_END> <RECIPE_END>",
            ]
            out.write("{}\n".format("".join(segments)))

**Saving the Processed Train and Test Files to the Custom Path**

In [None]:
# Serialise both splits to one-recipe-per-line text files on Drive; the
# tokenization cell below reads these same `train_temp` / `test_temp` files.
df_to_plaintext_file(train, '/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/train_temp.txt')
df_to_plaintext_file(test, '/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/test_temp.txt')

**Initializing the GPT2 Tokenizer and adding the special tokens defined by us to mark the different parts of a recipe, such as its title, its constituent ingredients, etc.**

In [None]:
# Start from the pretrained "gpt2" tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", do_lower_case=False)
# Alternative checkpoint, left disabled:
# tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-125M')

# Structural markers used by df_to_plaintext_file. The order is kept fixed on
# purpose: the tokenizer assigns new ids in the order the tokens are listed.
special_tokens = dict(
    additional_special_tokens=[
        '<RECIPE_START>',
        '<INPUT_START>', '<NEXT_INPUT>', '<INPUT_END>',
        '<INGR_START>', '<NEXT_INGR>', '<INGR_END>',
        '<INSTR_START>', '<NEXT_INSTR>', '<INSTR_END>',
        '<TITLE_START>', '<TITLE_END>',
        '<RECIPE_END>',
    ]
)
tokenizer.add_special_tokens(special_tokens)

# Id of <RECIPE_END>; the packing step uses it to pad rows to full width.
recipe_end_ids = tokenizer.convert_tokens_to_ids(["<RECIPE_END>"])
end_token_id = recipe_end_ids[0]

# Tokenise the serialised recipes and pack them into fixed-width rows stored in
# an HDF5 file, one dataset per split.
#
# Packing scheme: consecutive recipes are concatenated into the current row
# while they fit; when the next recipe would overflow, the row is padded with
# the <RECIPE_END> id up to MAX_SEQ_LEN, stored, and the recipe starts a new row.

# GPT-2's context window: a row (and therefore a single recipe) can hold at
# most this many tokens.
MAX_SEQ_LEN = 1024

# The context manager closes the HDF5 file even if tokenisation fails midway.
with h5py.File("/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/data_temp.h5", "w") as hf:
    for filename in ["/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/test_temp", "/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/train_temp"]:
        out_np = []   # completed rows, each exactly MAX_SEQ_LEN ids long
        last = []     # row currently being filled
        num = 0       # lines read
        rows = 0      # rows completed

        # The file was written as UTF-8, so read it back as UTF-8; `with`
        # closes the handle, which the original never did.
        with open(filename + ".txt", "r", encoding="utf-8") as data:
            for line in data:
                num += 1
                if num % 10000 == 0:
                    print("Read "+str(num)+" Written: "+str(rows))

                text_tokens = tokenizer.tokenize(line)
                # A recipe longer than the context window cannot fit in any
                # row, so it is skipped.
                if len(text_tokens) > MAX_SEQ_LEN:
                    continue

                text_tokens_ids = tokenizer.convert_tokens_to_ids(text_tokens)

                if (len(last) + len(text_tokens_ids)) <= MAX_SEQ_LEN:
                    last += text_tokens_ids
                else:
                    last += [end_token_id] * (MAX_SEQ_LEN - len(last))
                    out_np.append(last)
                    last = text_tokens_ids
                    rows += 1

        # Flush the final, partially filled row. Previously it was discarded,
        # silently dropping the last recipes of every split.
        if last:
            last += [end_token_id] * (MAX_SEQ_LEN - len(last))
            out_np.append(last)
            rows += 1

        # Plain 2-D ndarray instead of the discouraged np.matrix; the reshape
        # keeps the (rows, MAX_SEQ_LEN) layout even when there are no rows.
        out_mat = np.array(out_np, dtype=np.int64).reshape(-1, MAX_SEQ_LEN)
        print(out_mat.shape)
        # NOTE(review): the dataset is named after the full file path, as in
        # the original; readers of data_temp.h5 must use the same name.
        hf.create_dataset(filename, data=out_mat)

**Displaying the Final Length of Tokenizer**

In [None]:
# Size of the tokenizer after the special tokens were added; presumably the
# model's embedding matrix must be resized to this value — TODO confirm in the
# training notebook.
len(tokenizer)

**Displaying the Final Number of Recipes Downsampled**

In [None]:
# Read the training text file back; every element of `t` is one line of the
# file, i.e. one serialised recipe (including its trailing newline).
t = []
with open('/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/train_temp.txt') as file1:
    t.extend(file1)

In [None]:
# Number of lines in train_temp.txt, i.e. the number of training recipes written.
print('No of recipes downsampled for prototyping: ',len(t))