In [1]:
import pandas as pd

# Read the ingredient list: every line of the text file is one raw ingredient.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open("unique_ingredients.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# One-column frame; surrounding whitespace (including the trailing newline
# kept by readlines) is stripped from every entry.
df_start = pd.DataFrame(lines, columns=["raw_ingr"])
df_start["raw_ingr"] = df_start["raw_ingr"].str.strip()

# A blank line is not an ingredient: drop empty entries so they are not
# processed downstream, and renumber the index.
df_start = df_start[df_start["raw_ingr"] != ""].reset_index(drop=True)

# Display the first few rows
print(df_start.head())

                     raw_ingr
0             fuyu persimmons
1  whole crisp corn tortillas
2              cilantro paste
3            gluten-free oats
4      white chocolate wafers


In [None]:
import pickle
import pandas as pd

# NOTE(review): absolute, machine-specific path - the cell only runs on a
# machine where this exact file exists.
file_path = "/Users/leventedobak/Downloads/archive/ingr_map.pkl"

# SECURITY: unpickling can execute arbitrary code stored in the file; only
# load pickles that come from a trusted source.
with open(file_path, "rb") as file:
    try:
        content = pd.read_pickle(file)
    except ModuleNotFoundError as e:
        # The pickle references a module that is not installed. Show the start
        # of the raw bytes as a hint, then re-raise: without this, `content`
        # would be unbound and the lookup below would fail with a misleading
        # NameError instead of the real cause.
        print(f"Module not found: {e}")
        file.seek(0)
        raw_data = file.read(100)  # only the bytes that are actually printed
        print("First 100 bytes of raw data for debugging:")
        print(raw_data)  # View the raw content for clues
        raise

# Spot-check a single raw ingredient in the loaded mapping.
print(content[content["raw_ingr"]=="hershey's semi-sweet baking chocolate"])


In [None]:
import pandas as pd

# Define a function to generate formatted examples for each 'replaced' entity
def generate_examples(df, column='replaced', num_examples=2):
    """Build few-shot example strings pairing raw ingredients with a label.

    The rows of ``df`` are grouped by ``column``. From every group at most
    ``num_examples`` rows are sampled with a fixed seed (42), so repeated calls
    on the same data return the same examples.

    Args:
        df: DataFrame that has a 'raw_ingr' column and the ``column`` column.
        column: Name of the column holding the entity label. It is used both
            to group the rows and as the entity written into each example.
        num_examples: Upper bound on the number of rows taken per group;
            smaller groups contribute all of their rows.

    Returns:
        A list of strings, each an "- Input: ..." line followed by an
        indented "Answer: Entity: ..." line, in the order the groups are
        produced by ``groupby``.
    """
    examples = []

    for _, group in df.groupby(column):
        # Take at most `num_examples` rows; never ask for more than exist.
        sampled = group.sample(n=min(num_examples, len(group)), random_state=42)
        # Read the label from `column` (not a hard-coded 'replaced') so the
        # printed entity always matches the column the rows were grouped by.
        examples.extend(
            f"- Input: {raw}\n  Answer: Entity: {label}"
            for raw, label in zip(sampled['raw_ingr'], sampled[column])
        )

    return examples

# Build the few-shot example strings from the loaded ingredient map
examples = generate_examples(df=content, column='replaced', num_examples=2)

# Write every example to the cell output, one after another
for example in examples:
    print(example)

In [2]:
import requests
import json
from json import JSONDecodeError

# Chat endpoint on localhost that extract_ingredient sends its POST requests to.
# NOTE(review): port 11434 with /api/chat looks like a local Ollama server - confirm.
LOCAL_CHAT_URL = "http://localhost:11434/api/chat"

def _collect_chat_content(raw_body):
    """Join the message fragments of a newline-delimited JSON chat reply."""
    pieces = []
    for line in raw_body.splitlines():
        # Blank separator lines are not JSON documents; skip them instead of
        # letting json.loads fail and discard an otherwise valid reply.
        if not line.strip():
            continue
        item = json.loads(line)
        message = item.get("message") if isinstance(item, dict) else None
        if isinstance(message, dict):
            # Lines without a message (or a message without content)
            # contribute no text.
            pieces.append(message.get("content") or "")
    return "".join(pieces).strip()


def extract_ingredient(text, model="llama3", timeout=300):
    """Ask the local chat model for the key ingredient named in ``text``.

    A few-shot prompt is posted to ``LOCAL_CHAT_URL``; the reply body is read
    as newline-delimited JSON and the message fragments are concatenated.

    Args:
        text: Raw ingredient description inserted into the prompt.
        model: Model name sent in the request payload.
        timeout: Seconds to wait for the server before giving up, so a hung
            request cannot stall a long batch run indefinitely.

    Returns:
        The text following the last "Entity:" marker in the reply (the whole
        reply if the marker is absent), stripped of surrounding whitespace.
        ``None`` when the server answers with a non-200 status, the reply is
        not valid JSON, or the request fails (including a timeout); in those
        cases a diagnostic is printed.
    """
    prompt = f"""Task:
    Identify and extract the key ingredient(s) from the following input text. Focus on the main ingredient, typically the shortest or a compound word and most general noun. Respond only with the extracted entity and do not use plural forms in the following format `Entity: ingredient`.

    Examples:
    - Input: medium heads bibb or red leaf lettuce  
      Answer: Entity: lettuce
    - Input: aunt jane's krazy mixed up salt  
      Answer: Entity: salt
    - Input: light cream cheese with chives and onions  
      Answer: Entity: cream cheese
    - Input: fat-free chili  
      Answer: Entity: chili
    - Input: italian-style tomato sauce
      Answer: Entity: tomato sauce
    - Input: tomato sauce with basic and garlic
      Answer: Entity: tomato sauce
    - Input: tomato sauce with roasted red pepper
      Answer: Entity: tomato sauce
    - Input: tomato sauce with italian seasoning
      Answer: Entity: tomato sauce
    - Input: salt-free tomato sauce
      Answer: Entity: tomato sauce
    - Input: duncan hines moist deluxe yellow cake mix
      Answer: Entity: cake mix
    - Input: duncan hines moist deluxe spice cake mix
      Answer: Entity: cake mix
    - Input: zwieback toast
      Answer: Entity: toast
    - Input: zucchini with italian-style tomato sauce
      Answer: Entity: zucchini
    - Input: 15 bean mix
      Answer: Entity: bean
    - Input: 100 proof vodka
      Answer: Entity: vodka
    - Input: 10-minute success rice
      Answer: Entity: rice
    - Input: kikkoman's teriyaki sauce
      Answer: Entity: teriyaki sauce
    - Input: breakstone's sour cream
      Answer: Entity: sour cream
    - Input: hershey's hugs chocolates
      Answer: Entity: chocolate
    - Input: hershey's semi-sweet chocolate chips
      Answer: Entity: chocolate chip
    - Input: hershey's semi-sweet baking chocolate
      Answer: Entity: baking chocolate
    - Input: prego spaghetti sauce
      Answer: Entity: spaghetti sauce
    - Input: pasta sauce
      Answer: Entity: pasta sauce
    - Input: pillsbury pecan swirl quick bread and coffee
      Answer: Entity: bread
    - Input: pillsbury cinnamon swirl quick bread and coffe
      Answer: Entity: bread
    - Input: mozzarella cheese with sun-dried tomatoes and
      Answer: Entity: mozzarella
    - Input: kraft shredded mozzarella cheese with a touch
      Answer: Entity: mozzarella
    - Input: campbell's condensed tomato soup
      Answer: Entity: tomato soup
    - Input: stewed tomatoes with herbs
      Answer: Entity: tomato
    - Input: cajun-style stewed tomatoes
      Answer: Entity: tomato
    - Input: healthy request condensed tomato soup
      Answer: Entity: tomato soup
    - Input: campbell's condensed tomato soup
      Answer: Entity: tomato soup
    - Input: condensed tomato soup with roasted garlic
      Answer: Entity: tomato soup

    Input: {text}

    Answer:"""

    data = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
    }
    headers = {
        "Content-Type": "application/json"
    }

    try:
        # Send POST request; the timeout turns a stalled server into a
        # RequestException handled below instead of blocking forever.
        response = requests.post(
            LOCAL_CHAT_URL, headers=headers, json=data, timeout=timeout
        )

        # Check for HTTP errors
        if response.status_code != 200:
            print(f"Error! Status Code: {response.status_code}")
            print(f"Response Content: {response.text}")
            return None

        # Parse the NDJSON response and stitch the message fragments together
        extracted_content = _collect_chat_content(response.text)

        # Keep only what follows the last "Entity:" marker
        return extracted_content.split("Entity:")[-1].strip()

    except JSONDecodeError as e:
        print(f"JSONDecodeError: {e}")
        print(f"Response Content: {response.text}")
        return None

    except requests.RequestException as e:
        print(f"RequestException: {e}")
        return None




In [6]:
from tqdm import tqdm

# Enable the progress_apply method on pandas objects
tqdm.pandas()

# Keep the original raw ingredient and add the model's answer for each row,
# computed with a progress bar
df = df_start[['raw_ingr']].assign(
    extracted_ingredient=lambda frame: frame['raw_ingr'].progress_apply(extract_ingredient)
)

100%|██████████| 14942/14942 [3:40:01<00:00,  1.13it/s]  


In [7]:
# Preview the first five rows of the raw/extracted ingredient table
print(df.head(n=5))

                     raw_ingr extracted_ingredient
0             fuyu persimmons            persimmon
1  whole crisp corn tortillas                 corn
2              cilantro paste             cilantro
3            gluten-free oats                 oats
4      white chocolate wafers      white chocolate


In [8]:
# Save the raw/extracted ingredient table as a workbook, without the index column
df.to_excel(excel_writer='ingredients.xlsx', index=False)