In [1]:
import pandas as pd
import ast
from langchain_core.documents import Document
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
import chromadb
from langchain_chroma import Chroma

In [2]:
data_list = []
df = pd.read_csv("data/processed_recipes.csv")
df["ingredients"] = df["ingredients"].apply(ast.literal_eval)
df["directions"] = df["directions"].apply(ast.literal_eval)
df["NER"] = df["NER"].apply(ast.literal_eval)

In [3]:
df.shape

(77000, 23)

In [4]:
df.head()

Unnamed: 0,title,ingredients,directions,NER,category,is_nut_free,is_gluten_free,is_dairy_free,is_spicy_food,is_comfort_food,...,is_breakfast,is_lunch,is_dinner,is_no_oven,is_slow_cooker,is_air_fryer,is_one_pot,steps_counts,has_passive_time,is_quick
0,Butter Baked Rice (Oamc),"[1 cup long grain rice, 1 teaspoon salt, 1/3 c...",[Measure rice and salt in a bowl and pour on b...,"[long grain rice, salt, butter, garlic, chicke...",Baking,0,1,0,0,1,...,0,0,1,0,0,0,0,11,1,0
1,Vegan Orange Dream,"[1 12 cups orange juice, chilled, 1 cup light ...","[In a blender, combine everything except the o...","[orange juice, light vanilla soymilk, silk, ho...",Baking,1,1,0,0,0,...,0,0,0,1,0,0,0,3,1,0
2,Baked Lima Beans,"[1 lb. large dried lima beans, soaked overnigh...","[Soak beans overnight and rinse., Cover with c...","[beans, brown sugar, dry mustard, onion, molas...",Baking,1,1,1,0,1,...,0,0,1,0,0,0,0,9,1,0
3,Stromboli,"[3 c. flour, 1 pkg. dry yeast, 2 Tbsp. sugar, ...","[Combine 1 1/4 cups flour, yeast, sugar and sa...","[flour, yeast, sugar, cooking oil, salt, water...",Baking,1,0,0,0,0,...,0,0,0,1,0,0,0,8,1,0
4,Baked Clams,"[3 pounds Littleneck clams, 1 cup cornmeal, 2 ...","[Preheat oven to 350 degrees F., Place clams i...","[Littleneck clams, cornmeal, salt, bread crumb...",Baking,1,1,0,0,1,...,0,0,0,0,0,0,0,9,0,0


In [6]:
df.columns

Index(['title', 'ingredients', 'directions', 'NER', 'category', 'is_nut_free',
       'is_gluten_free', 'is_dairy_free', 'is_spicy_food', 'is_comfort_food',
       'is_light_food', 'is_hearty_food', 'is_healthy', 'is_breakfast',
       'is_lunch', 'is_dinner', 'is_no_oven', 'is_slow_cooker', 'is_air_fryer',
       'is_one_pot', 'steps_counts', 'has_passive_time', 'is_quick'],
      dtype='str')

In [7]:
model_name = "sentence-transformers/all-MiniLM-L12-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name, 
                                   model_kwargs = {'device': "cuda"},
                                    encode_kwargs = {'normalize_embeddings': False})



Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L12-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [8]:
client_settings = chromadb.config.Settings(
    anonymized_telemetry=False,
    is_persistent=True
)
vector_store = Chroma(
    collection_name = "recipe_dataset",
    embedding_function = embeddings,
    persist_directory="data",
    client_settings=client_settings,
)

In [9]:
dietary_type_cols = ['is_nut_free', 'is_gluten_free','is_dairy_free', 'is_spicy_food', 'is_comfort_food', 'is_light_food', 'is_hearty_food', 'is_healthy', 
                     'is_breakfast', 'is_lunch', 'is_dinner', 'is_no_oven',
                     'is_slow_cooker', 'is_air_fryer', 'is_one_pot', 'steps_counts', 'has_passive_time', 'is_quick']

In [11]:
def batch_process_recipes(batch_size = 1000):
    total_docs = len(df)
    # text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
    # chunk_number = 0
    for i in range(0, total_docs, batch_size):
        print(f"{i} docs loaded")
        data = df[i:min(i+batch_size, total_docs)]
        documents = []
        for row_idx, row in data.iterrows():
            page_content = f"# Title: {row['title']}\n"
            page_content += "### Ingredients:\n"
            page_content += "\n".join([f"{i+1}. {ingredient}" for i, ingredient in enumerate(row['ingredients'])])
            page_content += "\n### Directions: \n"
            page_content += "\n".join([f"{i+1}. {step}" for i, step in enumerate(row['directions'])])
            meta_data = {
                "title": row['title'], 
                "doc_id": f"recipe_{row_idx}",
                "category": row['category'],
            }
            if len(row['NER']) > 0:
                meta_data["ingredients"] = row['NER']
            else:
                meta_data["ingredients"] = row['ingredients']
            for dietary_type in dietary_type_cols:
                meta_data[dietary_type] = row[dietary_type]
            document = Document(page_content=page_content, metadata=meta_data)
            documents.append(document)
        # chunks = text_splitter.split_documents(documents)
        # print(f"{i} === {len(chunks)} loaded")
        # for index, chunk in enumerate(chunks):
        #     chunk.metadata["seq_num"] = index + chunk_number
        # chunk_number += len(chunks)
        vector_store.add_documents(documents)

batch_process_recipes()

70000 docs loaded
71000 docs loaded
72000 docs loaded
73000 docs loaded
74000 docs loaded
75000 docs loaded
76000 docs loaded


In [12]:
client = chromadb.PersistentClient(path="data", settings=client_settings)
collection = client.get_collection(name="recipe_dataset")

In [13]:
collection.count()

77000

In [14]:
results = collection.get(
    limit=2,
    include=["documents", "metadatas"] # 'embeddings' is optional and large
)
results

{'ids': ['e6d5a705-b785-45e5-a6b1-d84f98b3b507',
  '1cc3d452-de16-4dc5-8c1f-32f5913e7ff0'],
 'embeddings': None,
 'documents': ['# Title: Butter Baked Rice (Oamc)\n### Ingredients:\n1. 1 cup long grain rice\n2. 1 teaspoon salt\n3. 1/3 cup butter\n4. 1 teaspoon garlic powder\n5. 2 cups chicken stock (or vegetable stock)\n6. 3 teaspoons parsley\n7. 1/4 cup slivered almonds (optional)\n### Directions: \n1. Measure rice and salt in a bowl and pour on boiling water just to cover. Let stand 30 minutes.\n2. Drain and rinse with cold water.\n3. Melt butter in a frying pan and add rice. Saute for 5 minutes, stirring often until most of the butter is absorbed.\n4. Transfer to a greased 1L casserole dish.\n5. Combine garlic powder and chicken stock and pour over rice. Cover and bake at 350 degrees for 45-60 minutes.\n6. Add parsley and fluff with a fork.\n7. Sprinkle with almonds and bake for an additional 5 minutes.\n8. Cool completely, label and freeze.\n9. To serve, thaw overnight and reheat.\