In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import re
import ast

In [17]:
# Read the product table (one row per product) from disk
products_df = pd.read_csv('product_info.csv')

# Read the reference table of ingredients and their known allergens
allergens_df = pd.read_csv('known_allergens.csv')

# Build an ingredient -> allergen lookup; if an ingredient appears more than
# once in the reference table, the last row wins
allergen_dict = {
    ingredient: allergen
    for ingredient, allergen in zip(
        allergens_df['Ingredient'], allergens_df['Known Allergens']
    )
}

In [18]:
# Function to parse and clean ingredient lists
def parse_ingredients(ingredient_str):
    """Turn a stringified ingredient list into a flat list of clean names.

    The cell is evaluated with ``ast.literal_eval``. Top-level strings are
    split on commas; nested lists/tuples/sets contribute their string items
    as-is. Every name is whitespace-stripped and empty names are dropped.

    Args:
        ingredient_str: Raw cell value, normally the text of a Python list
            literal such as ``"['Water, Glycerin', 'Aloe']"``. A literal that
            evaluates to a single string is treated as one comma-separated
            entry rather than being iterated character by character.

    Returns:
        list[str]: Cleaned ingredient names. An empty list is returned when
        the value cannot be evaluated (including NaN/None) or when it
        evaluates to something that is not a string or a list/tuple/set.
    """
    try:
        parsed = ast.literal_eval(ingredient_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed input;
        # treat all of them as "no usable ingredient data".
        return []

    if isinstance(parsed, str):
        # A bare string literal: handle it like a one-element list so it is
        # comma-split instead of being walked one character at a time.
        parsed = [parsed]
    elif not isinstance(parsed, (list, tuple, set)):
        # Numbers, None, dicts, ... carry no ingredient names.
        return []

    cleaned = []
    for entry in parsed:
        if isinstance(entry, str):
            pieces = entry.split(',')
        elif isinstance(entry, (list, tuple, set)):
            pieces = entry
        else:
            # Skip stray non-text entries (None, numbers) instead of crashing.
            continue
        for piece in pieces:
            if not isinstance(piece, str):
                continue
            name = piece.strip()
            if name:
                cleaned.append(name)
    return cleaned

# Parse every raw 'ingredients' cell and store the result in a new column
raw_ingredients = products_df['ingredients']
products_df['parsed_ingredients'] = raw_ingredients.apply(parse_ingredients)

In [19]:
# Function to map allergens based on parsed ingredients
def map_allergens(ingredients, allergen_lookup=None):
    """Describe the known allergens among a product's ingredients.

    Args:
        ingredients: Iterable of ingredient names. Matching against the
            lookup is exact (case- and whitespace-sensitive).
        allergen_lookup: Optional mapping of ingredient name -> allergen
            description. When omitted, the module-level ``allergen_dict``
            is used, which preserves the original one-argument behaviour.

    Returns:
        list[str]: One ``"<ingredient>: <allergen>"`` entry per matching
        ingredient, in input order, or ``['No known allergens']`` when
        nothing matches.
    """
    if allergen_lookup is None:
        # Fall back to the notebook-level lookup built from the allergen table.
        allergen_lookup = allergen_dict

    matches = [
        f"{ingredient}: {allergen_lookup[ingredient]}"
        for ingredient in ingredients
        if ingredient in allergen_lookup
    ]
    return matches if matches else ['No known allergens']

# Look up allergens for each product's parsed ingredient list
parsed_ingredients = products_df['parsed_ingredients']
products_df['allergen_info'] = parsed_ingredients.apply(map_allergens)

In [20]:
# Write the enriched product table to a new CSV, omitting the row index
export_path = 'product_info_with_allergens.csv'
products_df.to_csv(export_path, index=False)

# Confirm completion to the notebook reader
print("Data processed and exported to 'product_info_with_allergens.csv'")

Data processed and exported to 'product_info_with_allergens.csv'
