In [1]:
import chromadb
import torch
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings


In [2]:
# Directory of the persistent Chroma database (passed to PersistentClient below).
db_path = "data"
# Name of the collection that is fetched from that database.
collection_name = "recipe_dataset"

In [3]:
# Sentence-transformers model used as the embedding function of the vector store.
model_name = "sentence-transformers/all-MiniLM-L12-v2"
# Prefer the GPU, but fall back to the CPU so this cell still runs on machines
# without CUDA instead of failing while the model is being loaded.
device = "cuda" if torch.cuda.is_available() else "cpu"
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": device},
    # NOTE(review): normalisation is off; presumably this matches how the stored
    # vectors were produced - confirm before changing it.
    encode_kwargs={"normalize_embeddings": False},
)



Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L12-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [5]:
# Client settings: telemetry disabled, persistent storage enabled.
# The same settings object is reused later when the vector store is created.
client_settings = chromadb.config.Settings(
    anonymized_telemetry=False,
    is_persistent=True
)
# Open the database stored under db_path and list its collections as a sanity check.
client = chromadb.PersistentClient(path=db_path, settings=client_settings)
collections = client.list_collections()
collections

[Collection(name=recipe_dataset)]

In [7]:
# Fetch the existing collection by name and display how many records it holds.
collection = client.get_collection(name=collection_name)
collection.count()

77000

In [8]:
# Peek at the first two records to inspect the stored document text and metadata.
results = collection.get(
    limit=2,
    include=["documents", "metadatas"] # 'embeddings' is optional and large
)
results

{'ids': ['e6d5a705-b785-45e5-a6b1-d84f98b3b507',
  '1cc3d452-de16-4dc5-8c1f-32f5913e7ff0'],
 'embeddings': None,
 'documents': ['# Title: Butter Baked Rice (Oamc)\n### Ingredients:\n1. 1 cup long grain rice\n2. 1 teaspoon salt\n3. 1/3 cup butter\n4. 1 teaspoon garlic powder\n5. 2 cups chicken stock (or vegetable stock)\n6. 3 teaspoons parsley\n7. 1/4 cup slivered almonds (optional)\n### Directions: \n1. Measure rice and salt in a bowl and pour on boiling water just to cover. Let stand 30 minutes.\n2. Drain and rinse with cold water.\n3. Melt butter in a frying pan and add rice. Saute for 5 minutes, stirring often until most of the butter is absorbed.\n4. Transfer to a greased 1L casserole dish.\n5. Combine garlic powder and chicken stock and pour over rice. Cover and bake at 350 degrees for 45-60 minutes.\n6. Add parsley and fluff with a fork.\n7. Sprinkle with almonds and bake for an additional 5 minutes.\n8. Cool completely, label and freeze.\n9. To serve, thaw overnight and reheat.\

In [9]:
# Page through the whole Chroma collection and wrap every record in a
# LangChain Document so a BM25 index can be built from the same corpus.
batch_size = 2000
bm25_docs = []
total_docs = collection.count()
# range() drives the paging offset directly; no separate counter has to be
# kept in step with an otherwise unused loop index.
for offset in range(0, total_docs, batch_size):
    batch = collection.get(
        include=["documents", "metadatas"],
        limit=batch_size,
        offset=offset,
    )
    bm25_docs.extend(
        Document(page_content=text, metadata=meta)
        for text, meta in zip(batch["documents"], batch["metadatas"])
    )
    print(f"Loaded {len(bm25_docs)} documents...")

# Fail fast if paging dropped or duplicated records, so an incomplete corpus
# is never silently turned into a BM25 index.
assert len(bm25_docs) == total_docs, (
    f"expected {total_docs} documents, loaded {len(bm25_docs)}"
)

Loaded 2000 documents...
Loaded 4000 documents...
Loaded 6000 documents...
Loaded 8000 documents...
Loaded 10000 documents...
Loaded 12000 documents...
Loaded 14000 documents...
Loaded 16000 documents...
Loaded 18000 documents...
Loaded 20000 documents...
Loaded 22000 documents...
Loaded 24000 documents...
Loaded 26000 documents...
Loaded 28000 documents...
Loaded 30000 documents...
Loaded 32000 documents...
Loaded 34000 documents...
Loaded 36000 documents...
Loaded 38000 documents...
Loaded 40000 documents...
Loaded 42000 documents...
Loaded 44000 documents...
Loaded 46000 documents...
Loaded 48000 documents...
Loaded 50000 documents...
Loaded 52000 documents...
Loaded 54000 documents...
Loaded 56000 documents...
Loaded 58000 documents...
Loaded 60000 documents...
Loaded 62000 documents...
Loaded 64000 documents...
Loaded 66000 documents...
Loaded 68000 documents...
Loaded 70000 documents...
Loaded 72000 documents...
Loaded 74000 documents...
Loaded 76000 documents...
Loaded 77000 doc

In [10]:
# Build the keyword (BM25) retriever from the documents loaded above.
bm25_retriever = BM25Retriever.from_documents(bm25_docs)


In [18]:
import pickle

# Store the pickled BM25 retriever inside the database directory; the location
# is derived from db_path so both stay together if the directory is changed.
index_path = f"{db_path}/bm25_retriever.pkl"
# NOTE(review): this cell was last executed as In [18], i.e. after the cells
# below it. bm25_retriever.k is assigned in a later cell, so on a fresh
# top-to-bottom run the retriever is pickled before that assignment happens.
# Pickle files can execute arbitrary code when loaded - only load this file
# from a trusted location.
with open(index_path, "wb") as f:
    pickle.dump(bm25_retriever, f)

In [11]:
# LangChain wrapper around the persisted Chroma collection. The collection
# name and directory come from the config variables (same values as before)
# so this cell cannot drift away from the client opened earlier.
# NOTE(review): the recorded output shows a warning raised on this call -
# presumably a deprecation notice for this Chroma class; confirm and migrate
# when convenient.
vector_store = Chroma(
    collection_name=collection_name,
    embedding_function=embeddings,
    persist_directory=db_path,
    client_settings=client_settings,
)

  vector_store = Chroma(


In [12]:
# Use the same k (5) for the BM25 retriever and the similarity-search retriever.
# Note: the first line changes k on the existing bm25_retriever object in place.
bm25_retriever.k = 5
semantic_retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k":5})

In [13]:
from langchain_classic.retrievers import EnsembleRetriever

# Combine the semantic and BM25 retrievers into a single hybrid retriever.
# The weights list is positional: 0.6 is listed with semantic_retriever and
# 0.4 with bm25_retriever.
hybrid_retriever = EnsembleRetriever(
    retrievers = [semantic_retriever, bm25_retriever],
    weights = [0.6,0.4]
)

In [14]:
from sentence_transformers import CrossEncoder
# Cross-encoder model used by re_rank() to score retrieved documents against the query.
re_rank_model =  CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

Loading weights:   0%|          | 0/105 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: cross-encoder/ms-marco-MiniLM-L-6-v2
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [15]:
def re_rank(query, query_result, check_score=True):
    """Re-order retrieved documents with the cross-encoder ``re_rank_model``.

    Args:
        query: The user question the documents are scored against.
        query_result: Sequence of retrieved documents; each must expose a
            ``page_content`` attribute.
        check_score: When True, documents whose cross-encoder score is
            negative are dropped. If the best score is itself negative, only
            that single best document is returned so the result is never
            empty. When False, every document is returned in ranked order.

    Returns:
        A list of documents from ``query_result`` in the order produced by
        the cross-encoder. Empty when ``query_result`` is empty.
    """
    # Nothing to rank: return early instead of indexing into an empty ranking.
    if not query_result:
        return []

    passages = [doc.page_content for doc in query_result]
    # Each entry carries "corpus_id" (index into `passages`) and "score".
    ranks = re_rank_model.rank(query, passages)
    ordered = [query_result[entry["corpus_id"]] for entry in ranks]

    # Score filtering (including the single-document fallback) only applies
    # when the caller asked for it.
    if not check_score:
        return ordered

    # Best candidate is already below the threshold: keep just that one.
    if ranks[0]["score"] < 0:
        return ordered[:1]

    return [doc for doc, entry in zip(ordered, ranks) if not entry["score"] < 0]

In [16]:
def get_relevant_docs(query, should_re_rank=False):
    """Run hybrid retrieval for ``query`` and optionally re-rank the hits.

    Prints the query, the titles of the initially retrieved documents and the
    titles of the final documents.

    Args:
        query: Question passed to ``hybrid_retriever``.
        should_re_rank: When True the retrieved documents are passed through
            ``re_rank``; otherwise they are returned as retrieved.

    Returns:
        dict with
        - "doc_ids": ``doc_id`` metadata of every initially retrieved
          document, in retrieval order (not affected by re-ranking);
        - "relevant_documents": the final list of documents.
    """
    print(f"Query: {query}")
    retrieved = hybrid_retriever.invoke(query)

    # Read title and doc_id document by document, in retrieval order.
    title_id_pairs = [
        (hit.metadata['title'], hit.metadata["doc_id"]) for hit in retrieved
    ]
    initial_titles = [title for title, _ in title_id_pairs]
    initial_ids = [doc_id for _, doc_id in title_id_pairs]
    print(f"initial Titles: {initial_titles}")

    final_docs = re_rank(query, retrieved) if should_re_rank else retrieved

    final_titles = [hit.metadata['title'] for hit in final_docs]
    print(f"reranked title: {final_titles}")

    return {
        "doc_ids": initial_ids,
        "relevant_documents": final_docs,
    }

In [17]:
# Smoke test: run hybrid retrieval with re-ranking enabled on a sample question.
get_relevant_docs("How is corn used in Tamale bake recipe?", should_re_rank=True)

Query: How is corn used in Tamale bake recipe?
initial Titles: ['Tamale Bake', 'Corn Bake', 'Corn Bake', 'Escalloped Corn', 'Escalloped Corn', 'Beef Tamale Bake', 'Fiesta Tamale Casserole', 'Noodle Salad', 'Tamale Joe', 'Scalloped Corn']
reranked title: ['Tamale Bake', 'Beef Tamale Bake', 'Tamale Joe', 'Fiesta Tamale Casserole', 'Corn Bake']


{'doc_ids': ['recipe_2398',
  'recipe_2746',
  'recipe_6589',
  'recipe_47273',
  'recipe_45579',
  'recipe_119',
  'recipe_39655',
  'recipe_30699',
  'recipe_39089',
  'recipe_42654'],
 'relevant_documents': [Document(metadata={'is_spicy_food': 1, 'steps_counts': 18, 'is_one_pot': 0, 'is_air_fryer': 0, 'is_light_food': 0, 'is_slow_cooker': 0, 'is_comfort_food': 1, 'is_breakfast': 1, 'is_healthy': 0, 'is_dairy_free': 0, 'ingredients': ['ground beef', 'onion', 'salt', 'chili powder', 'tomatoes', 'corn muffin', 'corn', 'olives', 'Cheddar cheese'], 'doc_id': 'recipe_2398', 'category': 'Baking', 'has_passive_time': 1, 'is_nut_free': 1, 'is_quick': 0, 'is_lunch': 0, 'is_no_oven': 0, 'title': 'Tamale Bake', 'is_dinner': 1, 'is_hearty_food': 1, 'is_gluten_free': 1}, page_content='# Title: Tamale Bake\n### Ingredients:\n1. 1 1/2 lb. ground beef\n2. 1 onion, chopped\n3. 1 tsp. salt\n4. 4 tsp. chili powder\n5. 2 c. canned tomatoes\n6. 1 pkg. Jiffy corn muffin mix\n7. 1 c. corn\n8. 1 can olives,