**Downloading, Installing & Importing Required Libraries**

In [None]:
# Packages and Libraries Required for training the model and working with the dataset
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import files
import matplotlib.pyplot as plt

# Packages and Libraries Required for implementing Utility/helper functions.
import os
import gc
import re
import csv
import sys
import h5py
import nltk
import math
import json
import glob
import time
import torch
import shutil
import pickle
import string
import random
import pickle
import zipfile
import pathlib
import logging
import argparse
import platform
import itertools
import seaborn as sns
import tensorflow as tf
from matplotlib import pyplot
from tqdm import tqdm, trange
from nltk.stem.porter import *
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
from nltk.stem import PorterStemmer
from sklearn.decomposition import PCA
from transformers import GPT2Tokenizer
from keras.layers import Bidirectional
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import SpectralClustering
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)

In [None]:
# NOTE(review): the import cell above already imports `transformers`, so on a
# fresh kernel where the package is not preinstalled this install would have
# to run before that cell — confirm the intended cell order.
!pip install transformers

In [None]:
# Download the NLTK data packages this notebook relies on. The WordNet
# lemmatizer instantiated in a later cell needs the "wordnet" data.
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('omw-1.4')

**Mounting Google Drive for importing the Data Files which will be used in the Tokenization**

In [None]:
# Mount Google Drive at /content/drive so that the data files referenced under
# /content/drive/MyDrive/... in the cells below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

**Initializing the WordNet Lemmatizer**

In [None]:
# Shared module-level lemmatizer; clean_ingredients() reads this global.
lemmatizer = WordNetLemmatizer() 

**Importing csv File which contains Unique Recipe IDs**

In [None]:
# Ingredients table; its 'recipe_no' column supplies the recipe ids extracted
# in the next cell. The same file is passed to load_dataset() later on.
df=pd.read_csv('/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/RecipeDB_v1/Recipe_correct_ndb_updated_v1.csv')
df

**Fetching Unique Recipe IDs**

In [None]:
# Distinct values of the 'recipe_no' column.
recipeIds = list(df['recipe_no'].unique())
print(recipeIds)
# String form of each id; a later cell compares these against the
# 'Recipe_id' values of the recipe JSON records.
recipeIdslistStringForm = [str(recipe_id) for recipe_id in recipeIds]

**Displaying the Number of Unique Recipe Ids**

In [None]:
# Sanity check: number of distinct recipe ids found in the ingredients CSV.
print("Number of Unique Recipe Ids:",len(recipeIdslistStringForm))

**Importing json File which contains Recipe Ids with their Instructions**

In [None]:
# Load the raw recipe records. The filtering cell below iterates over `data`
# and reads a 'Recipe_id' key from every record.
with open("/content/drive/MyDrive/Final_Model_Rata2_Recipegen/recipe_db_data.json") as data_file:
    data = json.load(data_file)

**Removing Recipe IDs from json data which are Not Present in Unique Recipe Ids csv file**

In [None]:
# Keep only the recipes whose id also appears in the ingredients CSV.
# The list is rebuilt instead of popped from while being enumerated: removing
# an item during iteration shifts the remaining items left, so the record
# directly after every removed one would never be checked.
valid_recipe_ids = set(recipeIdslistStringForm)  # set => O(1) membership tests
# Slice assignment filters the existing list object in place.
data[:] = [recipe for recipe in data if recipe['Recipe_id'] in valid_recipe_ids]

**Saving the Generated json file which Contains same Unique Recipes**

In [None]:
# Write the filtered records to the notebook's working directory and hand the
# file to google.colab's files.download().
# NOTE(review): the next section reads data_v1.json from a Google Drive folder,
# not from this local path — presumably the downloaded file is uploaded there
# by hand; confirm, because such a manual step breaks Restart & Run All.
with open("data_v1.json", "w") as final:
    json.dump(data, final)
files.download('data_v1.json')

**Pre-Processing Steps Start from Here**

**Opening the json file that was Dumped above**

In [None]:
# Re-load the filtered recipe JSON from Drive. The context manager closes the
# file handle even when json.load() raises.
with open('/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/data_v1.json') as f:
    data_new = json.load(f)

**Displaying the Total Number of Recipes**

In [None]:
# data_new holds the records re-loaded from the Drive copy of data_v1.json.
print("Total Number of Recipes are: ",len(data_new))

**Defining the function that merges each recipe's ingredients, title and region details with its instructions, keyed by Recipe ID**

In [None]:
def load_dataset(ingredients_path,steps_path, title_path):
    """Merge ingredients, title/region details and instruction steps per recipe.

    Parameters
    ----------
    ingredients_path : str
        CSV with the columns ``recipe_no``, ``ingredient`` and
        ``ingredient_Phrase`` (several rows may share one ``recipe_no``).
    steps_path : str
        JSON file containing a list of objects with a ``Recipe_id`` (converted
        with ``int``) and a ``steps`` string that is split on ``';'``.
    title_path : str
        CSV with the columns ``Recipe_id``, ``Recipe_title``, ``Continent``,
        ``Region`` and ``Sub_region``. When an id occurs more than once, the
        first row wins.

    Returns
    -------
    list of dict
        One dict per recipe id found in the ingredients CSV, with the keys
        ``ID``, ``title``, ``ingredients``, ``ingredient_phrase``,
        ``continent``, ``region``, ``sub_region`` and ``instructions``.
        A recipe is left out when it has no title row or no steps entry, when
        its title is missing/empty, or when it has no non-missing ingredient.
    """
    print("Loading all required files..\n")
    df_titles = pd.read_csv(title_path)
    ingredients = pd.read_csv(ingredients_path)
    with open(steps_path) as json_file:
        steps = json.load(json_file)

    print("\n\nCreating steps dict..\n")
    # Later entries overwrite earlier ones that share the same id.
    steps_dic = {int(entry['Recipe_id']): entry['steps'].split(';') for entry in steps}

    print("\n\nCreating title dict..\n")
    title_dic = {}
    continet_dict = {}
    region_dict = {}
    sub_region_dict = {}
    title_rows = zip(
        df_titles['Recipe_id'].tolist(),
        df_titles['Recipe_title'].tolist(),
        df_titles['Continent'].tolist(),
        df_titles['Region'].tolist(),
        df_titles['Sub_region'].tolist(),
    )
    for recipe_id, title, continent, region, sub_region in title_rows:
        if recipe_id in title_dic:
            # Keep the first row seen for a repeated id.
            continue
        title_dic[recipe_id] = title
        continet_dict[recipe_id] = continent
        region_dict[recipe_id] = region
        sub_region_dict[recipe_id] = sub_region

    print("\n\nCreating ingredients dict..\n")
    recipe_ids = ingredients['recipe_no'].tolist()
    ing = ingredients['ingredient'].tolist()
    ing_phrase = ingredients['ingredient_Phrase'].tolist()

    # Every id gets an (initially empty) list, so ids whose cells are all
    # missing still have an entry and are rejected by the final filter.
    ingredient_dic = {recipe_id: [] for recipe_id in recipe_ids}
    ing_phrase_dic = {recipe_id: [] for recipe_id in recipe_ids}
    for recipe_id, name, phrase in zip(recipe_ids, ing, ing_phrase):
        # Empty CSV cells are read back as NaN; skip them.
        if pd.notna(name):
            ingredient_dic[recipe_id].append(name)
        if pd.notna(phrase):
            ing_phrase_dic[recipe_id].append(phrase)

    print("\nCreating data and validating..\n")
    dataset = []
    for recipe_id in list(set(recipe_ids)):
        try:
            recipe = {
                'ID': recipe_id,
                'title': title_dic[recipe_id],
                'ingredients': ingredient_dic[recipe_id],
                'ingredient_phrase': ing_phrase_dic[recipe_id],
                'continent': continet_dict[recipe_id],
                'region': region_dict[recipe_id],
                'sub_region': sub_region_dict[recipe_id],
                'instructions': steps_dic[recipe_id],
            }
        except KeyError:
            # No title row or no steps entry for this id: skip the recipe.
            continue

        # An empty title cell arrives as a float NaN, on which len() would
        # raise TypeError and abort the whole merge; treat it as "no title".
        if pd.isna(recipe['title']):
            continue

        if len(recipe['title']) != 0 and len(recipe['instructions']) != 0 and len(recipe['ingredients']) != 0:
            dataset.append(recipe)

    print("\n COMPLETED")

    return dataset

**Defining the Path of the Files required for performing the Merging Operation**

In [None]:
# Inputs for load_dataset(). steps_path is the same Drive file that was loaded
# into data_new above; ingredients_path is the same CSV that was read into df
# earlier in the notebook.
steps_path = '/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/data_v1.json'
titles_path = '/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/Recipes(6).csv'
ingredients_path = '/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/RecipeDB_v1/Recipe_correct_ndb_updated_v1.csv'

**Calling the above method for Performing the corresponding operation**

In [None]:
# NOTE: this rebinds `data`. Until now it held the raw JSON records; from here
# on it is the merged list of per-recipe dicts returned by load_dataset().
data = load_dataset(ingredients_path,steps_path, titles_path)

**Displaying the Single Recipe Data to Analyze and Decide what Pre-Processing steps should be applied on the Recipes Data**

In [None]:
# Show the first merged recipe dict as a sample of the record layout.
data[0]

**Defining Pre-Processing Methods and Applying them on the Recipe Data**

**Method to Lemmatize the Ingredients of the Recipe**

In [None]:
def clean_ingredients(l):
  """Lower-case, lemmatize and de-duplicate a list of ingredient names.

  Parameters
  ----------
  l : list of str
      Ingredient names of a single recipe.

  Returns
  -------
  list of str
      The distinct normalised names, in order of first appearance.

  Relies on the module-level ``lemmatizer`` object.
  """
  normalised = (lemmatizer.lemmatize(ele.lower()) for ele in l)
  # dict.fromkeys() de-duplicates while keeping first-seen order. A round trip
  # through set() yields an order that depends on string hashing, which is
  # randomised per interpreter run, so the saved data would differ run to run.
  return list(dict.fromkeys(normalised))

# Swap every recipe's ingredient list for its cleaned counterpart.
for recipe in data:
  recipe['ingredients'] = clean_ingredients(recipe['ingredients'])

**Method to Fix the Punctuation of the Instructions of the Recipes**

In [None]:
def fix_punctuation(l):
  """Remove the whitespace character that precedes a punctuation mark.

  For every string in ``l``, a single whitespace character directly before one
  of ``? . ! " ,`` is dropped when that mark is itself followed by whitespace
  or by the end of the string.

  Returns a new list; the input list is not modified.
  """
  space_before_mark = r'\s([?.!",](?:\s|$))'
  return [re.sub(space_before_mark, r'\1', text) for text in l]

punc_fix_data = []  # not used in this cell
# Normalise punctuation spacing in every recipe's instruction list.
for recipe in data:
  recipe['instructions'] = fix_punctuation(recipe['instructions'])

**Method to capitalize the first word of each sentence and remove the extra space at the start of the Instructions of the Recipe**

In [None]:
# Matches a word that follows ". ", "? " or "! ", or the (possibly empty) word
# at the very start of the string.
p = re.compile(r'((?<=[\.\?!]\s)(\w+)|(^\w+)|(^\w*))')

def cap(match):
    """Return the matched text with str.capitalize() applied.

    Note that capitalize() also lower-cases every character after the first.
    """
    word = match.group()
    return word.capitalize()

def fix_caps(l):
  """Strip leading whitespace and capitalise sentence-initial words.

  Each string in ``l`` is left-stripped, then every match of ``p`` is replaced
  by ``cap``. Returns a new list; the input list is not modified.
  """
  return [p.sub(cap, text.lstrip()) for text in l]


# Apply the capitalisation / left-strip fix to every recipe's instructions.
for recipe in data:
  recipe['instructions'] = fix_caps(recipe['instructions'])

**Displaying the Single Recipe Data after Applying the above Pre-Processing Methods**

In [None]:
# Show the first recipe again to inspect the effect of the steps above.
data[0]

**Placing '.' (full stop) after instructions which do not end with the '.' (full stop)**

In [None]:
# Make every instruction end with a full stop (the lists are updated in place).
for recipe in data:
  steps = recipe['instructions']
  for idx, step in enumerate(steps):
    # Strip trailing whitespace *before* testing the ending: a step such as
    # "Mix well. " would otherwise fail the endswith check, get stripped and
    # then receive a second period ("Mix well..").
    step = step.rstrip()
    if not step.endswith('.'):
      step += '.'
    steps[idx] = step

**Saving the Final Pre-Processed File**

In [None]:
# Persist the pre-processed recipe list to Drive, using the highest pickle
# protocol available in the running interpreter.
with open('/content/drive/MyDrive/Monsoon22_conditional_recipe_gen/data_v1.pickle', 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)