Import all libraries for data work.

In [None]:
import json
import pandas as pd

In [None]:
# Colab-specific helper used to attach Google Drive to this runtime.
from google.colab import drive

# Mount Drive at /content/drive; every data path used later in the notebook
# lives under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


## **USDA FOOD MACRO DATA WORK.**

Check the dataset for missing macro values.

In [None]:
# Load the JSON file
# JSON text is UTF-8 by specification, so the encoding is pinned explicitly
# instead of relying on the platform's default locale encoding.
json_file_path = '/content/drive/MyDrive/Colab Notebooks/DS Project food/foundationDownload 2.json'
with open(json_file_path, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Function to check if a nutrient value is missing for a given food ID
def check_missing_nutrients(food_id, nutrient_ids, foods=None):
    """Report which of the requested nutrients have no entry for one food.

    A nutrient counts as "present" when the food's ``foodNutrients`` list
    contains a record whose ``nutrient['id']`` equals the requested id. Only
    the existence of the record is checked, not whether it carries an amount.

    Args:
        food_id: ``fdcId`` of the food to inspect.
        nutrient_ids: Mapping of nutrient name -> nutrient id. A value may
            itself be a dict of ``sub-name -> id``; such a group is flattened
            one level and reported under its sub-names (the group name itself
            does not appear in the result).
        foods: Optional list of food records to search. When omitted, the
            module-level ``raw_data['FoundationFoods']`` is used, which keeps
            existing two-argument calls working.

    Returns:
        dict mapping each (sub-)nutrient name to ``True`` when it is missing
        and ``False`` when it is present, or ``None`` when no food record has
        the given ``fdcId``.
    """
    if foods is None:
        foods = raw_data['FoundationFoods']

    for food_item in foods:
        if food_item['fdcId'] != food_id:
            continue

        # Collect the ids once so each lookup below is a set membership test
        # instead of a fresh scan over the whole nutrient list.
        present_ids = {
            (entry.get('nutrient') or {}).get('id')
            for entry in food_item['foodNutrients']
        }

        missing_nutrients = {}
        for nutrient_name, nutrient_id in nutrient_ids.items():
            if isinstance(nutrient_id, dict):
                for sub_nutrient_name, sub_nutrient_id in nutrient_id.items():
                    missing_nutrients[sub_nutrient_name] = sub_nutrient_id not in present_ids
            else:
                missing_nutrients[nutrient_name] = nutrient_id not in present_ids
        return missing_nutrients

    # No record matched the requested fdcId.
    return None

# Nutrient ids to look for, keyed by a readable name. The nested
# 'types_of_fat' group is flattened by check_missing_nutrients, so its
# sub-names appear directly in the report.
nutrient_ids = {
    'kcal': 1008,
    'protein': 1003,
    'carbohydrate': 1005,
    'fiber': 1079,
    'total_fat': 1004,
    'types_of_fat': {
        'saturated_fat': 1258,
        'monounsaturated_fat': 1292,
        'polyunsaturated_fat': 1293,
        'trans_fat': 1257,
    }
}

# Check missing nutrients for sample food IDs
sample_food_ids = [748323]
missing_nutrients_info = {food_id: check_missing_nutrients(food_id, nutrient_ids) for food_id in sample_food_ids}

# Print the missing nutrient information
for food_id, missing_info in missing_nutrients_info.items():
    print(f"Food ID {food_id}:")
    if missing_info is None:
        # check_missing_nutrients returns None when no record has this fdcId;
        # report that instead of failing on None.items().
        print("  Not found in dataset")
    else:
        for nutrient, is_missing in missing_info.items():
            print(f"  {nutrient}: {'Missing' if is_missing else 'Present'}")
    print()


Food ID 748323:
  kcal: Missing
  protein: Missing
  carbohydrate: Missing
  fiber: Missing
  total_fat: Missing
  saturated_fat: Present
  monounsaturated_fat: Present
  polyunsaturated_fat: Present
  trans_fat: Missing



Clean and adjust dataset to Macro Csv, containing only macronutrient data.

In [None]:
# Load the JSON file
# JSON text is UTF-8 by specification, so the encoding is pinned explicitly
# instead of relying on the platform's default locale encoding.
file_path = '/content/drive/MyDrive/Colab Notebooks/DS Project food/foundationDownload 2.json'
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Initialize a list to hold processed data (one dict per food record)
processed_data = []

# Nutrient IDs for macronutrients, fiber, types of fat, and kcal.
# These are compared against each record's nutrient['nutrient']['id'].
nutrient_ids = {
    'protein': 1003,
    'carbohydrate': 1005,
    'total_fat': 1004,
    'fiber': 1079,
    'kcal': 1008,
    'types_of_fat': {
        'saturated_fat': 1258,
        'monounsaturated_fat': 1292,
        'polyunsaturated_fat': 1293,
        'trans_fat': 1257,
    }
}

# Function to extract nutrients from foodNutrients
def extract_nutrients(food_nutrients, ids=None):
    """Sum the macronutrient amounts found in one food's nutrient list.

    Note: a later cell in this notebook defines another function with the
    same name (macro + micro variant); whichever cell ran last wins.

    Args:
        food_nutrients: Iterable of nutrient records. Each record is read as
            ``{'nutrient': {'id': ...}, 'amount': ...}``; records without an
            id, or with an id that is not tracked, are ignored.
        ids: Optional id mapping with the keys 'protein', 'carbohydrate',
            'total_fat', 'fiber', 'kcal' and a nested 'types_of_fat' dict
            ('saturated_fat', 'monounsaturated_fat', 'polyunsaturated_fat',
            'trans_fat'). Defaults to the module-level ``nutrient_ids`` so
            existing one-argument calls keep working.

    Returns:
        dict with the keys 'Protein', 'Carbohydrate', 'Total Fat', 'Fiber',
        'Kcal', 'Saturated Fat', 'Monounsaturated Fat', 'Polyunsaturated Fat'
        and 'Trans Fat'. Each value is the sum of the matching amounts;
        missing, null and negative amounts count as 0.
    """
    if ids is None:
        ids = nutrient_ids
    fat_ids = ids['types_of_fat']

    # Output column for every tracked id. setdefault keeps the first column
    # if two names share an id, matching a first-match-wins lookup order.
    column_by_id = {}
    for column, nutrient_id in (
        ('Protein', ids['protein']),
        ('Carbohydrate', ids['carbohydrate']),
        ('Total Fat', ids['total_fat']),
        ('Fiber', ids['fiber']),
        ('Kcal', ids['kcal']),
        ('Saturated Fat', fat_ids['saturated_fat']),
        ('Monounsaturated Fat', fat_ids['monounsaturated_fat']),
        ('Polyunsaturated Fat', fat_ids['polyunsaturated_fat']),
        ('Trans Fat', fat_ids['trans_fat']),
    ):
        column_by_id.setdefault(nutrient_id, column)

    nutrients = {
        'Protein': 0, 'Carbohydrate': 0, 'Total Fat': 0, 'Fiber': 0,
        'Kcal': 0, 'Saturated Fat': 0, 'Monounsaturated Fat': 0, 'Polyunsaturated Fat': 0, 'Trans Fat': 0
    }

    for nutrient in food_nutrients:
        nutrient_id = (nutrient.get('nutrient') or {}).get('id')
        column = column_by_id.get(nutrient_id)
        if column is None:
            continue  # not one of the tracked nutrients

        amount = nutrient.get('amount')
        # A missing or null amount, or a negative value, contributes nothing.
        if amount is None or amount < 0:
            amount = 0
        nutrients[column] += amount
    return nutrients

# Order in which the nutrient values appear in every exported row
macro_columns = (
    'Kcal', 'Protein', 'Carbohydrate', 'Fiber', 'Total Fat',
    'Saturated Fat', 'Monounsaturated Fat', 'Polyunsaturated Fat', 'Trans Fat',
)

# Turn every food record into one flat row of macronutrient values
for food_item in data['FoundationFoods']:
    food_id, food_name, food_nutrients = (
        food_item['fdcId'],
        food_item['description'],
        food_item['foodNutrients'],
    )
    nutrients = extract_nutrients(food_nutrients)

    # No total fat found: fall back to the sum of the individual fat types
    if not nutrients['Total Fat']:
        nutrients['Total Fat'] = (
            nutrients['Saturated Fat']
            + nutrients['Monounsaturated Fat']
            + nutrients['Polyunsaturated Fat']
            + nutrients['Trans Fat']
        )

    # No kcal found: estimate it as 4/4/2/9 kcal per unit of
    # protein/carbohydrate/fiber/fat.
    # NOTE(review): fiber is added on top of the carbohydrate term; if the
    # carbohydrate value already includes fiber this double-counts - confirm.
    if not nutrients['Kcal']:
        nutrients['Kcal'] = (
            (nutrients['Protein'] * 4)
            + (nutrients['Carbohydrate'] * 4)
            + (nutrients['Fiber'] * 2)
            + (nutrients['Total Fat'] * 9)
        )

    processed_data.append({
        'ID': food_id,
        'Food Name': food_name,
        **{column: nutrients[column] for column in macro_columns},
    })

# Build the table from the collected rows and write it to Drive (no index column)
df = pd.DataFrame(processed_data)
output_file = '/content/drive/MyDrive/Colab Notebooks/DS Project food/FoodMacroDetailedInfo.csv'
df.to_csv(output_file, index=False)

print(f"Data has been saved to {output_file}")


Data has been saved to /content/drive/MyDrive/Colab Notebooks/DS Project food/FoodMacroDetailedInfo.csv


Clean and adjust dataset to Macro Csv, containing both macronutrient and micronutrient data.
Note: This is not used in final recommendation system, but is a very useful dataset otherwise!

In [None]:
# Load the JSON file
# JSON text is UTF-8 by specification, so the encoding is pinned explicitly
# instead of relying on the platform's default locale encoding.
file_path = '/content/drive/MyDrive/Colab Notebooks/DS Project food/foundationDownload 2.json'
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Initialize a list to hold processed data (one dict per food record)
processed_data = []

# Nutrient IDs for macronutrients, vitamins, minerals, and fiber.
# Note: this rebinds the name `nutrient_ids` with a different set of keys
# ('fat' instead of 'total_fat', plus the 'vitamins'/'minerals' id lists).
nutrient_ids = {
    'protein': 1003,
    'carbohydrate': 1005,
    'fat': 1004,
    'fiber': 1079,
    'vitamins': [1162, 1165, 1166, 1177, 1184, 1185, 1106, 1107, 1108],  # Add more vitamin IDs as needed
    'minerals': [1087, 1089, 1090, 1091, 1092, 1093, 1095, 1101, 1103]   # Add more mineral IDs as needed
}

# Function to extract nutrients from foodNutrients
def extract_nutrients(food_nutrients, ids=None):
    """Collect macro totals plus vitamin/mineral listings for one food.

    Note: this redefines ``extract_nutrients`` from the earlier macro-only
    cell; after this cell runs, the name refers to this variant.

    Args:
        food_nutrients: Iterable of nutrient records. Each record is read as
            ``{'nutrient': {'id', 'name', 'unitName'}, 'amount': ...}``.
        ids: Optional id mapping with the keys 'protein', 'carbohydrate',
            'fat', 'fiber' (single ids) and 'vitamins', 'minerals'
            (collections of ids). Defaults to the module-level
            ``nutrient_ids`` so existing one-argument calls keep working.

    Returns:
        dict with numeric totals under 'Protein', 'Carbohydrate', 'Fat' and
        'Fiber', and comma-joined ``"name: amount unit"`` strings under
        'Vitamins' and 'Minerals'. A missing or null amount is treated as 0.
        Unlike the macro-only variant, negative amounts are NOT clamped here.
    """
    if ids is None:
        ids = nutrient_ids

    # Single-id nutrients that are summed; first name wins on a shared id.
    total_column_by_id = {}
    for column, key in (
        ('Protein', 'protein'),
        ('Carbohydrate', 'carbohydrate'),
        ('Fat', 'fat'),
        ('Fiber', 'fiber'),
    ):
        total_column_by_id.setdefault(ids[key], column)

    # Sets give constant-time membership checks inside the loop.
    vitamin_ids = set(ids['vitamins'])
    mineral_ids = set(ids['minerals'])

    nutrients = {'Protein': 0, 'Carbohydrate': 0, 'Fat': 0, 'Fiber': 0}
    vitamins = []
    minerals = []

    for nutrient in food_nutrients:
        info = nutrient.get('nutrient') or {}
        nutrient_id = info.get('id')

        amount = nutrient.get('amount')
        if amount is None:  # key absent or explicit null
            amount = 0

        if nutrient_id in total_column_by_id:
            nutrients[total_column_by_id[nutrient_id]] += amount
        elif nutrient_id in vitamin_ids:
            vitamins.append(f"{info['name']}: {amount} {info['unitName']}")
        elif nutrient_id in mineral_ids:
            minerals.append(f"{info['name']}: {amount} {info['unitName']}")

    nutrients['Vitamins'] = ', '.join(vitamins)
    nutrients['Minerals'] = ', '.join(minerals)
    return nutrients

# Order in which the nutrient fields appear in every exported row
micro_macro_columns = ('Protein', 'Carbohydrate', 'Fat', 'Vitamins', 'Minerals', 'Fiber')

# Turn every food record into one flat row of macro and micro values
for food_item in data['FoundationFoods']:
    food_id, food_name, food_nutrients = (
        food_item['fdcId'],
        food_item['description'],
        food_item['foodNutrients'],
    )
    nutrients = extract_nutrients(food_nutrients)

    processed_data.append({
        'ID': food_id,
        'Food Name': food_name,
        **{column: nutrients[column] for column in micro_macro_columns},
    })

# Build the table from the collected rows and write it to Drive (no index column)
df = pd.DataFrame(processed_data)
output_file = '/content/drive/MyDrive/Colab Notebooks/DS Project food/FoodMicroMacroInfo.csv'
df.to_csv(output_file, index=False)

print(f"Data has been saved to {output_file}")


Data has been saved to /content/drive/MyDrive/Colab Notebooks/DS Project food/FoodMicroMacroInfo.csv


## **1M Recipe Dataset Work.**

Shorten the recipe dataset and remove bloat (and some data entries).

In [None]:
# Load the CSV file
file_path = '/content/drive/MyDrive/Colab Notebooks/DS Project food/preprocessed_layer1.csv'
try:
    # Attempt to load the file directly
    df = pd.read_csv(file_path)
    print("DataFrame loaded successfully.")

    # Remove the first column (selected by position, not by name)
    df = df.iloc[:, 1:]

    # Keep only the 'test' partitions
    df = df[df['partition'] == 'test']

    # Remove the 'ID', 'partition' and 'image_url' columns
    df = df.drop(columns=['ID', 'partition', 'image_url'])

    # Save the adjusted DataFrame to a new CSV file
    output_file = '/content/drive/MyDrive/Colab Notebooks/DS Project food/1M_Recepies_Adjusted.csv'
    df.to_csv(output_file, index=False)

    print(f"Data has been saved to {output_file}")
except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise so the cell fails visibly. Swallowing the error would let the
    # following cells run against a stale `df` or a missing output file.
    raise


DataFrame loaded successfully.
Data has been saved to /content/drive/MyDrive/Colab Notebooks/DS Project food/1M_Recepies_Adjusted.csv


In [None]:
# Display the trimmed recipe DataFrame left in `df` by the previous cell.
df

Unnamed: 0,food_title,ingredients,instructions
1,Crunchy Onion Potato Bake,2 12 cups milk /t 1 12 cups water /t 14 cup bu...,Preheat oven to 350 degrees Fahrenheit. /t Spr...
10,"Leek, Potato, and Bacon Casserole","2 leeks, white and light green (about 6 ounces...",Preheat the oven 325 degrees F. /t Butter an 8...
27,German Potato Dumplings,1 cup mashed potatoes /t 1 cup flour /t 1 bunc...,Mix the mashed potatoes together with the flou...
32,Calico Beans,1 lb ground beef /t 12 lb bacon /t 1 cup onion...,Brown beef and bacon; drain. /t Add onion cook...
56,Potato-Leek Soup,1 Tbs. very thinly sliced green onion tops /t ...,"In large pot, heat oil over medium heat. /t Ad..."
...,...,...,...
303245,Aunt Carol's Apple Pie,2 pounds Granny Smith apples /t 1 cup white su...,"Peel and slice apples. /t Toss with sugars, ci..."
303253,Pinwheels,2 bars 8 Ounce Each Cream Cheese At Room Tempe...,Mix the cream cheese and dressing until smooth...
303274,Irish Bread,1 cup raisins /t 1 cup dried currants /t 4 cup...,Preheat oven to 350 degrees F (175 degrees C)....
303282,Easy Olive Spread,1 tub (8 oz.) PHILADELPHIA Cream Cheese Spread...,Mix cream cheese spread and olives; cover. /t ...


Divide the ingredient list into separate rows, one for each ingredient needed for the recipe.

In [None]:
# Load the data: the trimmed recipe CSV (one row per recipe, with all
# ingredients still packed into a single '/t'-delimited string column).
recipes_path = '/content/drive/MyDrive/Colab Notebooks/DS Project food/1M_Recepies_Adjusted.csv'  # Update this path to where your dataset is stored
recipes_df = pd.read_csv(recipes_path)

# Function to split the ingredients on '/t' and clean them
def split_and_clean_ingredients(ingredients):
    """Split a '/t'-delimited ingredient string into a list of entries.

    Args:
        ingredients: The raw ingredient cell. Expected to be a string; any
            other value (e.g. NaN or None from an empty CSV cell) is treated
            as "no ingredients".

    Returns:
        list of ingredient strings with surrounding whitespace removed.
        Entries that are empty after stripping (from a trailing or doubled
        delimiter) are dropped, so no blank ingredient rows are produced
        downstream. Returns an empty list for non-string input.
    """
    if not isinstance(ingredients, str):
        return []
    # NOTE(review): the bare '/t' token would also split text such as
    # "w/tomato"; confirm the delimiter never occurs inside an ingredient.
    stripped_parts = (part.strip() for part in ingredients.split('/t'))
    return [part for part in stripped_parts if part]

# Apply the function to split ingredients
recipes_df['ingredients_list'] = recipes_df['ingredients'].apply(split_and_clean_ingredients)

# Explode the DataFrame so each ingredient gets its own row (other recipe
# columns are repeated), rename the exploded column to 'ingredient', and drop
# the original packed 'ingredients' column. Chained so nothing is mutated
# in place and the cell can be re-run safely.
exploded_df = (
    recipes_df
    .explode('ingredients_list')
    .rename(columns={'ingredients_list': 'ingredient'})
    .drop(columns=['ingredients'])
)

# Optional: Save the exploded dataframe to a new CSV for further analysis or processing
output_path = '/content/drive/MyDrive/Colab Notebooks/DS Project food/Exploded_Recipes.csv'  # Update this path to where you want to save the output
exploded_df.to_csv(output_path, index=False)

# Print the structure and the first few rows of the expanded DataFrame to confirm.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
exploded_df.info()
print(exploded_df.head())


<class 'pandas.core.frame.DataFrame'>
Index: 407822 entries, 0 to 45296
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   food_title    407822 non-null  object
 1   instructions  407822 non-null  object
 2   ingredient    407822 non-null  object
dtypes: object(3)
memory usage: 12.4+ MB
None
                  food_title  \
0  Crunchy Onion Potato Bake   
0  Crunchy Onion Potato Bake   
0  Crunchy Onion Potato Bake   
0  Crunchy Onion Potato Bake   
0  Crunchy Onion Potato Bake   

                                        instructions  \
0  Preheat oven to 350 degrees Fahrenheit. /t Spr...   
0  Preheat oven to 350 degrees Fahrenheit. /t Spr...   
0  Preheat oven to 350 degrees Fahrenheit. /t Spr...   
0  Preheat oven to 350 degrees Fahrenheit. /t Spr...   
0  Preheat oven to 350 degrees Fahrenheit. /t Spr...   

                                    ingredient  
0                               2 12 cups milk  
0      

Further reduce the dataset to only 1000 recipes (a smaller size is convenient for further testing). You can adjust the size of the reduction.

In [None]:
# Load the dataset
file_path = '/content/drive/MyDrive/Colab Notebooks/DS Project food/Exploded_Recipes.csv'
recipes_df = pd.read_csv(file_path)

# Number of distinct recipe titles to keep; change this to resize the sample
MAX_RECIPES = 1000

# Get the first MAX_RECIPES unique food titles (in order of first appearance)
unique_titles = recipes_df.drop_duplicates(subset=['food_title']).head(MAX_RECIPES)

# Keep every ingredient row whose title is one of the selected titles.
# NOTE(review): recipes are identified by title only here, so different
# recipes that share a title are selected together - confirm acceptable.
reduced_recipes_df = recipes_df[recipes_df['food_title'].isin(unique_titles['food_title'])]

# Save the reduced dataset to a new CSV file
reduced_file_path = '/content/drive/MyDrive/Colab Notebooks/DS Project food/Reduced_Recipes.csv'
reduced_recipes_df.to_csv(reduced_file_path, index=False)

print(f"Reduced dataset saved to {reduced_file_path}")


Reduced dataset saved to /content/drive/MyDrive/Colab Notebooks/DS Project food/Reduced_Recipes.csv


Remove duplicate copies of the same recipes that appear in multiple later entries.

In [None]:
# Load the dataset
file_path = '/content/drive/MyDrive/Colab Notebooks/DS Project food/Reduced_Recipes.csv'
recipes_df = pd.read_csv(file_path)

# Remove duplicate (food_title, ingredient) rows and strip the '/t' step
# separator from the instructions.
# The two columns are compared directly rather than through a concatenated
# string key. A concatenated key can collide for different pairs
# ("ab" + "c" == "a" + "bc") and becomes NaN whenever either part is NaN,
# which would merge unrelated rows.
recipes_df = (
    recipes_df
    .drop_duplicates(subset=['food_title', 'ingredient'])
    .assign(
        instructions=lambda frame: frame['instructions'].str.replace('/t', '', regex=False)
    )
)

# Save the cleaned dataset to a new CSV file
cleaned_file_path = '/content/drive/MyDrive/Colab Notebooks/DS Project food/Reduced_Recipescl.csv'
recipes_df.to_csv(cleaned_file_path, index=False)

print("Dataset cleaned and saved successfully.")


Dataset cleaned and saved successfully.


In [None]:
# Display the de-duplicated recipe DataFrame produced by the previous cell.
recipes_df

Unnamed: 0,food_title,instructions,ingredient
0,Crunchy Onion Potato Bake,Preheat oven to 350 degrees Fahrenheit. Spray...,2 12 cups milk
1,Crunchy Onion Potato Bake,Preheat oven to 350 degrees Fahrenheit. Spray...,1 12 cups water
2,Crunchy Onion Potato Bake,Preheat oven to 350 degrees Fahrenheit. Spray...,14 cup butter
3,Crunchy Onion Potato Bake,Preheat oven to 350 degrees Fahrenheit. Spray...,"mashed potatoes, 1 box, homestyle"
4,Crunchy Onion Potato Bake,Preheat oven to 350 degrees Fahrenheit. Spray...,1 (8 ounce) can whole kernel corn (drained)
...,...,...,...
25522,Super Nachos,"In a small bowl, mash the avocado with lemon j...",12 lb ground beef
25523,Super Nachos,"In a small bowl, mash the avocado with lemon j...",1 tablespoon chili powder
25524,Super Nachos,"In a small bowl, mash the avocado with lemon j...",7 ounces tortilla chips
25525,Super Nachos,"In a small bowl, mash the avocado with lemon j...",2 cups shredded cheddar cheese
