## Question 1 (A)

### Taking reference from
- https://www.geeksforgeeks.org/implementing-web-scraping-python-beautiful-soup/

First of all, we need to install all the necessary libraries.

In [None]:
!pip install requests 
!pip install html5lib 
!pip install bs4 
!pip install pandas 
!pip install lxml 
!pip install spacy 
!pip install scikit-learn spacy 
!pip install transformers 
!pip install torch 
!pip install matplotlib

# Question 1 (a)

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import random

def parse_sitemap(file_path):
    """Return every ``<loc>`` element found in a sitemap XML file.

    Args:
        file_path: Path to a sitemap ``.xml`` file on disk.

    Returns:
        The result of ``soup.find_all('loc')``: the ``<loc>`` tags of the
        document (empty when the file contains none).
    """
    # Open in binary mode so the XML parser decides the encoding from the
    # document itself instead of the platform's default text encoding.
    with open(file_path, "rb") as fp:
        soup = BeautifulSoup(fp, 'xml')  # Use 'xml' parser for XML files
    return soup.find_all('loc')

def get_recipe_urls(locations):
    """Pick the recipe page URLs out of a sequence of sitemap entries.

    Args:
        locations: Iterable of objects exposing a ``text`` attribute that
            holds a URL.

    Returns:
        A list with the ``text`` of every entry that contains
        ``'/recipe/'``, in the order the entries were given.
    """
    return [entry.text for entry in locations if '/recipe/' in entry.text]

# Sitemap files saved on disk; each one lists page URLs inside <loc> tags.
sitemaps = [
    "C:/Users/adity/Desktop/coding/ComputationalGastronomy(CGAS)/Assignment1/site_map/sitemap_1.xml",
    "C:/Users/adity/Desktop/coding/ComputationalGastronomy(CGAS)/Assignment1/site_map/sitemap_2.xml",
    "C:/Users/adity/Desktop/coding/ComputationalGastronomy(CGAS)/Assignment1/site_map/sitemap_3.xml",
    "C:/Users/adity/Desktop/coding/ComputationalGastronomy(CGAS)/Assignment1/site_map/sitemap_4.xml"
]

# Upper bound on the number of recipe URLs kept for scraping.
MAX_RECIPE_URLS = 11000

urlArray = []

# Collect the recipe URLs of every sitemap, in file order.
for sitemap in sitemaps:
    locations = parse_sitemap(sitemap)
    urlArray.extend(get_recipe_urls(locations))

print(f"Total number of recipe URLs found: {len(urlArray)}")
print(f"First 100 URLs: {urlArray[:100]}")

urlArray = urlArray[:MAX_RECIPE_URLS]  # Keep at most MAX_RECIPE_URLS recipes
print("Taking this many urls : ", len(urlArray))

# Assign IDs only after truncating, so idArray[i] is the ID of urlArray[i]
# and both lists always have the same length.
idArray = list(range(1, len(urlArray) + 1))

> In the above code snippet, we start the web scraping by reading each sitemap file and parsing it using **BeautifulSoup** with the xml parser. All the `<loc>` tags are
returned, and only the URLs containing `/recipe/` are kept. We then assign a unique ID to each URL by creating a list of integers from 1 to the length of the array, and print
the total number of recipe URLs found along with the first 100 URLs as a preview.

In [None]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

import time
import random
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Output file and column order of the scraped recipe table.
OUTPUT_CSV = 'Q1_a.csv'
COLUMNS = ['Recipe Name', 'Recipe URL', 'Ingredients', 'Instructions', 'Prep Time', 'Cook Time', 'Total Time', 'Servings', 'Yield']

# Seconds to wait for a response; without a timeout a single stalled
# connection would block the whole run.
REQUEST_TIMEOUT = 30


def _detail_value(details, label):
    """Return the stripped text of the value div that follows `label`.

    `details` is the recipe details container (or None when the page has
    none). "N/A" is returned whenever the container, the label div or the
    value div is missing.
    """
    if details is None:
        return "N/A"
    label_div = details.find('div', string=label)
    if label_div is None:
        return "N/A"
    value_div = label_div.find_next('div', class_='mm-recipes-details__value')
    if value_div is None:
        return "N/A"
    return value_div.text.strip()


def _scrape_recipe(url):
    """Download one recipe page and return its fields as a dict keyed by COLUMNS.

    Raises:
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with an error status code.
    """
    r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()  # Raise exception for bad status codes
    soup = BeautifulSoup(r.content, 'html5lib')

    # Recipe name: text of the first <h1>.
    heading = soup.find('h1')
    recipe_name = heading.text.replace('\n', '') if heading is not None else "N/A"

    # Ingredients: one entry per list item, built from its non-empty <span>s.
    ingredient_items = soup.find_all('li', attrs={'class': 'mm-recipes-structured-ingredients__list-item'})
    ingredients = []
    for item in ingredient_items:
        parts = [span.text.strip() for span in item.find_all('span')]
        ingredients.append(' '.join(part for part in parts if part))

    # Instructions: one paragraph, steps separated by a single space.
    step_items = soup.find_all('li', attrs={'class': 'comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI'})
    steps = []
    for item in step_items:
        if item.p is None:
            # A step without a <p> has no text to extract.
            continue
        step_text = item.p.text.replace('\n', '').strip()
        if step_text:
            steps.append(step_text)

    # Additional recipe details (Prep time, Cook time, etc.)
    details = soup.find('div', class_='mm-recipes-details__content')

    return {
        'Recipe Name': recipe_name,
        'Recipe URL': url,
        'Ingredients': ', '.join(ingredients),
        'Instructions': ' '.join(steps) if steps else "N/A",
        'Prep Time': _detail_value(details, 'Prep Time:'),
        'Cook Time': _detail_value(details, 'Cook Time:'),
        'Total Time': _detail_value(details, 'Total Time:'),
        'Servings': _detail_value(details, 'Servings:'),
        'Yield': _detail_value(details, 'Yield:'),
    }


# Start the CSV with only the header row. Every recipe is then appended as
# soon as it is scraped, so an interrupted run keeps its progress without
# rewriting the whole file on each iteration.
pd.DataFrame(columns=COLUMNS).to_csv(OUTPUT_CSV, index=False)

scraped_rows = []
for currUrl in urlArray:
    try:
        recipe_row = _scrape_recipe(currUrl)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {currUrl}: {e}")
        continue  # Skip to the next URL if there's an error

    scraped_rows.append(recipe_row)
    pd.DataFrame([recipe_row], columns=COLUMNS).to_csv(OUTPUT_CSV, mode='a', header=False, index=False)

    print(f"Processed: {currUrl}")

    # Optional delay between requests (currently disabled):
    # time.sleep(random.uniform(1, 4))

# Full table of everything scraped in this run.
df = pd.DataFrame(scraped_rows, columns=COLUMNS)

print(f"Data saved to {OUTPUT_CSV}. Total recipes processed: {len(df)}")

## Question 1 (B)
1. Assigning recipe IDs is already done using ```'ID': range(1, len(urlArray) + 1) ```
2. Fetching non-duplicate ingredient entries (already non-duplicate).
    - Stored in the form recipe ID - ingredient
3. Using NER (Named Entity Recognition) to get the actual ingredient names from phrases.

In [16]:
# Load the recipe table from 'Q1_a.csv' (the file the scraping cell writes).
df = pd.read_csv('Q1_a.csv')

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Checkpoint shared by the tokenizer and the token-classification model.
NER_CHECKPOINT = "davanstrien/deberta-v3-base_fine_tuned_food_ner"

tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
model = model.to(device)

## Giving ID's

In [None]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('Q1_b.csv')

# Maps each distinct 'Recipe ID' value to the numeric ID assigned to it
recipe_ids = {}
# Next numeric ID to hand out
current_id = 1

def get_recipe_id(recipe_name):
    """Return the numeric ID of `recipe_name`, assigning the next free one if new.

    IDs start at 1 and are handed out in order of first appearance; the
    mapping is kept in the module-level `recipe_ids` dictionary.
    """
    global current_id
    if recipe_name not in recipe_ids:
        recipe_ids[recipe_name] = current_id
        current_id += 1
    return recipe_ids[recipe_name]

# Apply the function to create the new 'ID' column
df['ID'] = df['Recipe ID'].apply(get_recipe_id)

# Keep only the numeric ID and the ingredient name, in that order
df = df[['ID','Ingredient Name']]

# Save the result back to the same CSV file (overwrites the input)
df.to_csv('Q1_b.csv', index=False)

print("Numeric Recipe IDs have been assigned and saved to 'Q1_b.csv'")

The above code reads a CSV file and assigns each recipe a unique numeric ID. It creates a dictionary to track the IDs, which ensures that each distinct value of the `Recipe ID` column gets its own ID. The ```get_recipe_id``` function checks whether a value has already been assigned an ID and, if not, assigns the next available ID. The final DataFrame contains only the ID and Ingredient Name columns and is then saved back to the CSV file.

In [29]:
def find_ingredients(sentence):
    """Return the food words the NER model tags in `sentence`.

    Uses the module-level `tokenizer`, `model` and `device`.

    Args:
        sentence: Text of one ingredient phrase.

    Returns:
        A list of strings in order of appearance, one per word whose
        tokens received a label containing ``-FOOD``. Consecutive sub-word
        pieces of the same word are merged into a single string.
    """
    encoded = tokenizer(sentence, return_tensors="pt").to(device)
    pieces = tokenizer.tokenize(sentence)

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**encoded).logits
    predicted_ids = logits.argmax(-1)[0]
    labels = [model.config.id2label[t.item()] for t in predicted_ids]

    # The encoded input carries one extra token in front of the pieces, so
    # labels[i + 1] belongs to pieces[i]. Zipping from offset 1 keeps that
    # alignment and ignores predictions made on the surrounding special
    # tokens, which previously indexed outside of (or wrapped around) the
    # piece list.
    words = []
    last_food_index = None
    for i, (piece, label) in enumerate(zip(pieces, labels[1:])):
        if '-FOOD' not in label:
            continue
        text = piece.replace('▁', '')
        continues_word = not piece.startswith('▁')
        if continues_word and words and last_food_index == i - 1:
            # A piece without the '▁' word-start marker continues the
            # previous piece (e.g. "all", "-", "purpose" -> "all-purpose").
            words[-1] += text
        else:
            words.append(text)
        last_food_index = i

    # A bare '▁' piece leaves an empty string behind; drop those.
    return [word for word in words if word]

In [None]:
from tqdm import tqdm

# Identify the ID column (fall back to the first column if none is named 'ID')
id_column = 'ID' if 'ID' in df.columns else df.columns[0]
print(f"Using '{id_column}' as the ID column")

# Parallel lists: arr_ids[k] is the recipe that arr_ingredient[k] belongs to
arr_ids = []
arr_ingredient = []

# Get total number of rows for progress reporting
total_rows = len(df)
skipped_rows = 0

# Iterate through the DataFrame with progress bar
for index, row in tqdm(df.iterrows(), total=total_rows, desc="Processing recipes"):
    try:
        curr_str = row['Ingredients']
        recipe_id = row[id_column]
    except KeyError as e:
        print(f"\nError processing row {index}: {e}")
        print(f"Row content: {row}")
        skipped_rows += 1
        continue

    # read_csv loads empty (and "N/A") cells as NaN, a float without
    # .split(); such rows carry no ingredient text to analyse.
    if not isinstance(curr_str, str):
        skipped_rows += 1
        continue

    # Run NER on each comma-separated phrase and keep the non-empty results
    new_arr = []
    for phrase in curr_str.split(','):
        temp_str = " ".join(find_ingredients(phrase))
        if temp_str:
            new_arr.append(temp_str.lower())

    # De-duplicate the ingredients within this recipe
    new_set = set(new_arr)
    for ingredient in new_set:
        arr_ids.append(recipe_id)
        arr_ingredient.append(ingredient)

    # Print progress every 100 rows
    if index % 100 == 0 and index > 0:
        print(f"\nProcessed {index} rows. Current recipe ID: {recipe_id}")
        print(f"Extracted ingredients: {', '.join(new_set)[:100]}...")  # Truncate if too long

# Print final statistics
print(f"\nProcessing complete. Total recipes processed: {total_rows - skipped_rows}")
if skipped_rows:
    print(f"Rows skipped (missing column or no ingredient text): {skipped_rows}")
print(f"Total unique ingredients extracted: {len(set(arr_ingredient))}")

This code determines the ID column and uses ```tqdm``` to display a progress bar during iteration. For each row, the ingredient string is split on commas, each piece is passed through ```find_ingredients```, and the extracted names are added to a set so that uniqueness is maintained. For each unique ingredient, the code adds the corresponding recipe ID and ingredient name to separate lists. Every 100 rows, it prints a progress update, including the current recipe ID and a preview of the extracted ingredients. The loop body is wrapped in a try/except block in order to handle potential KeyError exceptions and log any problematic rows. Finally, it prints statistics including the total number of processed recipes and the total number of unique extracted ingredients.

In [24]:
# One row per (recipe, ingredient) pair, indexed and ordered by recipe ID.
new_dict = {
    'Recipe ID': arr_ids,
    'Ingredient Name': arr_ingredient,
}

new_df = pd.DataFrame(data=new_dict)
new_df = new_df.set_index(keys='Recipe ID')
sorted_df = new_df.sort_values(by='Recipe ID')
sorted_df.to_csv(path_or_buf='Q1_b.csv')

## Q1 (c)

In [32]:
# Fixed-stride sample: starting from row label 2, take every 27th row
# (29, 56, 83, ...) for up to SAMPLE_SIZE rows.
SAMPLE_SIZE = 100
SAMPLE_STEP = 27

idx = 2
random_100_id = []
random_100_ingredient = []
for indx in range(SAMPLE_SIZE):
    idx += SAMPLE_STEP
    if idx not in df.index:
        # The table has fewer rows than the stride needs; stop cleanly
        # instead of failing with a KeyError.
        print(f"Row {idx} does not exist; stopping after {indx} sampled rows.")
        break

    curr_str = df['Ingredient Name'][idx]
    if not isinstance(curr_str, str):
        # Empty cells are loaded as NaN and have no text to split.
        continue

    # Run NER on each comma-separated phrase and keep the non-empty results
    new_arr = []
    for phrase in curr_str.split(','):
        temp_arr = find_ingredients(phrase)
        if len(temp_arr) > 0:
            new_arr.append(' '.join(temp_arr).lower())

    # De-duplicate within the row, then record one (ID, ingredient) pair each
    for ingredient in set(new_arr):
        random_100_id.append(df['ID'][idx])
        random_100_ingredient.append(ingredient)

In [33]:
# One row per sampled (recipe, ingredient) pair, indexed and ordered by recipe ID.
random_100_dict = {
    'Recipe ID': random_100_id,
    'Ingredient Name': random_100_ingredient,
}
random_100_df = pd.DataFrame(data=random_100_dict)
random_100_df = random_100_df.set_index(keys='Recipe ID')
sorted_random_100_df = random_100_df.sort_values(by='Recipe ID')

sorted_random_100_df.to_csv(path_or_buf='Q1_b_random_100.csv')

Now we select a sample of 100 rows, process the ingredients from each selected row, and store the results. We start with an ```idx``` of 2 and increment it by 27 in each iteration. For each selected row, we split the Ingredient Name on commas and run ```find_ingredients``` on each piece. Unique ingredients are stored in a set to avoid duplicates, and the final unique ingredients are converted back into a list. Finally, we append the corresponding recipe ID and each unique ingredient to the random_100_id and random_100_ingredient lists, respectively. This process is repeated for 100 iterations, resulting in 100 sampled recipe IDs and their associated unique ingredients.