In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import nltk
from nltk.translate.bleu_score import corpus_bleu
nltk.download('punkt')
import random
import os
import re
import pandas as pd

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
# Assuming the files are named consistently and located in the directories provided,
# let's first define the file paths for each cuisine's data file.
# Please note that you will need to adjust the paths according to your actual directory structure on Google Colab.

# Define the paths to the directories containing the CSV files
output_df_dir = '/content/drive/MyDrive/BTP_Dev/Output_DF'
final_df_dir = '/content/drive/MyDrive/BTP_Dev/Final_DF'

# List the filenames based on the provided screenshot
file_names = [
    'NovelRecipesGenerated_CANADIAN.csv',
    'NovelRecipesGenerated_SOUTH AMERICAN.csv',
    'NovelRecipesGenerated_INDIAN SUBCONTINENT.csv',
    'NovelRecipesGenerated_MEXICAN.csv',
    'NovelRecipesGenerated_ITALIAN.csv'
]

# Initialize an empty dataframe to combine all the recipes
combined_recipes = pd.DataFrame()

# Loop through each file and append its contents to the combined dataframe
for file_name in file_names:
    # Construct the full file path for the output and final directories
    output_file_path = os.path.join(output_df_dir, file_name)
    final_file_path = os.path.join(final_df_dir, file_name)

    # Check if the file exists in the output directory
    if os.path.isfile(output_file_path):
        # Read the CSV file and combine
        recipes_df = pd.read_csv(output_file_path)
        combined_recipes = combined_recipes.append(recipes_df, ignore_index=True)
    # Check if the file exists in the final directory
    elif os.path.isfile(final_file_path):
        # Read the CSV file and combine
        recipes_df = pd.read_csv(final_file_path)
        combined_recipes = combined_recipes.append(recipes_df, ignore_index=True)
    else:
        print(f"File not found: {file_name}")

# Now let's preprocess the data to make it human-readable
# We will define a function to clean up and structure the recipe text

def clean_recipe(row):
    # Combining the 'Random Ingredients', 'Recipe Title', 'Ingredient Phrases', 'Recipe Instructions' into a formatted string
    formatted_recipe = f"Title: {row['Recipe Title']}\n\n"
    formatted_recipe += "Ingredients:\n"
    if not pd.isna(row['Random Ingredients']):
        ingredients = row['Random Ingredients'].split(';')
        for ingredient in ingredients:
            if ingredient.strip():
                formatted_recipe += f"- {ingredient.strip().capitalize()}\n"
    if not pd.isna(row['Ingredient Phrases']):
        ingredient_phrases = row['Ingredient Phrases'].split('|')
        for phrase in ingredient_phrases:
            if phrase.strip():
                formatted_recipe += f"- {phrase.strip().capitalize()}\n"
    formatted_recipe += "\nInstructions:\n"
    if not pd.isna(row['Recipe Instructions']):
        instructions = row['Recipe Instructions'].split('.')
        for instruction in instructions:
            if instruction.strip():
                formatted_recipe += f"{instruction.strip().capitalize()}.\n"
    return formatted_recipe

# Apply the cleaning function to each row in the dataframe
combined_recipes['Formatted Recipe'] = combined_recipes.apply(clean_recipe, axis=1)

# Show the cleaned and formatted recipes
combined_recipes['Formatted Recipe'].head()



  combined_recipes = combined_recipes.append(recipes_df, ignore_index=True)
  combined_recipes = combined_recipes.append(recipes_df, ignore_index=True)
  combined_recipes = combined_recipes.append(recipes_df, ignore_index=True)
  combined_recipes = combined_recipes.append(recipes_df, ignore_index=True)
  combined_recipes = combined_recipes.append(recipes_df, ignore_index=True)


0    Title:  Barley And Chocolate Biscotti \n\n\nIn...
1    Title:  Hot and Spicy Creamy Corn Dip \n\n\nIn...
2    Title:  Wild Rice Paneer \n\n\nIngredients:\n-...
3    Title:  Smoked Bacon Penne \n\n\nIngredients:\...
4    Title:  Mexican Orzo for Two \n\n\nIngredients...
Name: Formatted Recipe, dtype: object

In [5]:
combined_recipes

Unnamed: 0,Random Ingredients,Recipe Title,Ingredient Phrases,Recipe Instructions,Formatted Recipe
0,"milk chocolate chip,parmesan cheese,vegetable,...",Barley And Chocolate Biscotti \n,1 1/2 cups uncooked long-grain barley | 2 ...,bring a large pot of lightly salted water to...,Title: Barley And Chocolate Biscotti \n\n\nIn...
1,"cumin powder,cream corn,garlic red chile paste...",Hot and Spicy Creamy Corn Dip \n,"32 ounces cream-style corn, drained | 1/4 ...",mix the dried corn with the cream cheese and...,Title: Hot and Spicy Creamy Corn Dip \n\n\nIn...
2,"pizza dough,savory,wild rice,vanilla essence,b...",Wild Rice Paneer \n,1 large raw wild rice | 1 package cooked b...,brown rice in microwave . drain and rins...,Title: Wild Rice Paneer \n\n\nIngredients:\n-...
3,"penne,pepper flake;",Smoked Bacon Penne \n,"1 lb smoked bacon, chopped | 1/2 teaspoon ...",heat frying pan to 350f . place chopped ...,Title: Smoked Bacon Penne \n\n\nIngredients:\...
4,"cheddar cheese,four cheese mexican,nori,grain ...",Mexican Orzo for Two \n,2 lbs uncooked long grain white rice | 1/2...,"cook rice in boiling, salted water according...",Title: Mexican Orzo for Two \n\n\nIngredients...
5,"raspberry syrup,olive oil,adobo sauce,black be...",Easy Crockpot Mexican Rice & Beans \n,4 cups long grain rice | 16 ounces black b...,heat a medium saucepan over medium heat . ...,Title: Easy Crockpot Mexican Rice & Beans \n\...
6,"dark sesame oil,ricotta cheese,dijon mustard,l...",Baked Lobster Bites \n,500 g lobster tails | 125 g ricotta cheese...,preheat oven to 200c degrees . make a we...,Title: Baked Lobster Bites \n\n\nIngredients:...
7,"radish,beef roast,pork,baby carrot,salmon fillet;",Italian Salmon Filet \n,3 ribs roast | 1 cup cubed pork breast or ...,lightly salt and pepper the meat . place...,Title: Italian Salmon Filet \n\n\nIngredients...
8,"golden syrup,grain rice;",Quick Breakfast Burritos \n,3 tablespoons golden syrup | 3 cups uncook...,put rice in a steamer basket and bring water...,Title: Quick Breakfast Burritos \n\n\nIngredi...
9,"white chocolate chip,bamboo skewer;",Chocolate Bamboo Skewer for Diabetic \n\n,1 1/2 lbs bamboo skewer | 1/4 cup white ch...,tie the skewer with a bamboo skewer and heat...,Title: Chocolate Bamboo Skewer for Diabetic \...


In [8]:
combined_recipes.to_csv('/content/combined_recipes.csv')

In [None]:
def read_and_preprocess(file_path):
    """Read and preprocess the data from a file, preserving special tokens."""
    with open(file_path, 'r') as file:
        data = file.readlines()
    return data

def sample_data(data, num_samples=50):
    """Sample a specific number of data points from the data."""
    return random.sample(data, min(num_samples, len(data)))

def calculate_bleu_scores(candidates, references):
    """Calculate BLEU scores for each set of candidates against the references."""
    tokenized_candidates = [nltk.word_tokenize(c.lower()) for c in candidates]
    tokenized_references = [[nltk.word_tokenize(ref.lower())] for ref in references]
    return corpus_bleu(tokenized_references, tokenized_candidates)

# Reading and preprocessing the training data
train_file = '/content/drive/MyDrive/BTP_Dev/Dataset/train_temp.txt'
train_data = read_and_preprocess(train_file)

# Paths to the generated recipe files
generated_files = [
    '/content/drive/MyDrive/BTP_Dev/Output_DF/NovelRecipesGenerated_CHINESE_processed.txt',
    '/content/drive/MyDrive/BTP_Dev/Output_DF/NovelRecipesGenerated_INDIAN_processed.txt',
    '/content/drive/MyDrive/BTP_Dev/Output_DF/NovelRecipesGenerated_ITALIAN_processed.txt',
    '/content/drive/MyDrive/BTP_Dev/Output_DF/NovelRecipesGenerated_MEXICAN_processed.txt',
    '/content/drive/MyDrive/BTP_Dev/Output_DF/NovelRecipesGenerated_SOUTHERN_processed.txt'
]

# Sampling and calculating BLEU scores
bleu_scores = {}
for file_path in generated_files:
    generated_data = read_and_preprocess(file_path)
    sampled_train_data = sample_data(train_data)
    sampled_generated_data = sample_data(generated_data)
    score = calculate_bleu_scores(sampled_generated_data, sampled_train_data)
    bleu_scores[file_path] = score

# Printing BLEU scores
for file, score in bleu_scores.items():
    print(f"{file}: BLEU Score = {score}")

/content/drive/MyDrive/BTP_Dev/Output_DF/NovelRecipesGenerated_CHINESE_processed.txt: BLEU Score = 4.621967646086874e-12
/content/drive/MyDrive/BTP_Dev/Output_DF/NovelRecipesGenerated_INDIAN_processed.txt: BLEU Score = 1.8546376311654376e-11
/content/drive/MyDrive/BTP_Dev/Output_DF/NovelRecipesGenerated_ITALIAN_processed.txt: BLEU Score = 2.6844903311191096e-12
/content/drive/MyDrive/BTP_Dev/Output_DF/NovelRecipesGenerated_MEXICAN_processed.txt: BLEU Score = 8.975555254625472e-09
/content/drive/MyDrive/BTP_Dev/Output_DF/NovelRecipesGenerated_SOUTHERN_processed.txt: BLEU Score = 1.1008823788114727e-18
