# Embedding of Loaded data and Pushing to MongoDB

### In this notebook we load restaurant menu data from a CSV file, create vector embeddings with a Hugging Face model, and push them to MongoDB

In [None]:
# !pip install -q langchain langchain-mongodb langchain-chroma langchain-cli langchain-community langchain-core langchain-huggingface langchain-text-splitters jq pymongo

In [None]:
import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation notices emitted by the libraries used in this notebook.
warnings.filterwarnings('ignore')

In [None]:
# Starting with importing the libraries
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEndpoint
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import JSONLoader, PyPDFDirectoryLoader
import os
from dotenv import load_dotenv
load_dotenv()
from langchain.docstore.document import Document
import pandas as pd

In [None]:
# Location of the menu CSV. Set the RESTAURANT_MENUS_CSV environment variable
# (e.g. in the .env file loaded by load_dotenv()) to point at your own copy;
# the original author's local path is kept as the fallback so existing setups
# keep working.
csv_path = os.getenv(
    "RESTAURANT_MENUS_CSV",
    r"C:\Users\AKSHAT SHAW\OneDrive - iitr.ac.in\Desktop\Side-Projects\Zomato_RAG\data\restaurant_menus.csv",
)
df = pd.read_csv(csv_path)


def _menu_row_to_text(r):
    """Render one menu row as the labelled text block that gets embedded.

    A missing description (NaN) is replaced by the literal 'No description';
    all other columns are interpolated as-is.
    """
    description = r['description'] if pd.notna(r['description']) else 'No description'
    return (
        f"Restaurant: {r['restaurant_name']}\n"
        f"Rating:     {r['rating']}\n"
        f"Price one:  {r['price_for_one']}\n"
        f"Cuisines:   {r['cuisine']}\n"
        f"Item:       {r['item_name']}\n"
        f"Description:{description}\n"
        f"Price:      {r['price']}"
    )


# One Document per CSV row (i.e. per menu item).
docs = [Document(page_content=_menu_row_to_text(r)) for _, r in df.iterrows()]

In [None]:
from sentence_transformers import SentenceTransformer
# Load the embedding model (https://huggingface.co/nomic-ai/nomic-embed-text-v1)
# NOTE(review): trust_remote_code=True presumably lets code shipped in the model
# repository run locally — confirm the repository is trusted before running.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
def get_embedding(data):
    """Encode ``data`` with the module-level ``model`` and return it as plain Python lists.

    The encoder output is converted with ``.tolist()`` so the result can be
    stored directly in a document or passed as a query vector.
    """
    return model.encode(data).tolist()

In [None]:
from pymongo import MongoClient

# Connect to your Atlas cluster
uri = os.getenv("MONGODB_URI")
if not uri:
    # Without this check an unset variable passes None to MongoClient, which
    # does not fail here (it presumably falls back to a default local host), so
    # the mistake would only surface later. Fail early with a clear message.
    raise RuntimeError(
        "MONGODB_URI is not set; add it to your environment or .env file."
    )
client = MongoClient(uri)
# Database "rag_db", collection "test" hold the embedded menu documents.
collection = client["rag_db"]["test"]


In [None]:
# Embed all document texts in a single call so the encoder can batch them,
# instead of invoking the model once per document.
texts = [doc.page_content for doc in docs]
# get_embedding() on a list returns one vector per input text; skip the model
# call entirely when there is nothing to embed.
embeddings = get_embedding(texts) if texts else []

# One record per document: the raw text plus its embedding vector.
docs_to_insert = [
    {"text": text, "embedding": embedding}
    for text, embedding in zip(texts, embeddings)
]

In [None]:
# Insert documents into the collection
# NOTE(review): this cell does not check for previously inserted documents, so
# re-running the notebook presumably stores the same menu items again — confirm
# whether the collection should be cleared first.
result = collection.insert_many(docs_to_insert)

In [None]:
from pymongo.operations import SearchIndexModel
import time

# Create index model, then create the search index
index_name = "vector_index"
# Upper bound on how long to wait for the index to become queryable.
INDEX_READY_TIMEOUT_S = 300
search_index_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                # Must equal the length of the vectors stored under "embedding".
                "numDimensions": 768,
                "path": "embedding",
                "similarity": "cosine",
            }
        ]
    },
    name=index_name,
    type="vectorSearch",
)

# Only create the index when no index with this name exists yet, so the cell
# can be re-run without attempting to create it a second time.
if not list(collection.list_search_indexes(index_name)):
    collection.create_search_index(model=search_index_model)

# Wait for initial sync to complete
print("Polling to check if the index is ready. This may take up to a minute.")
predicate = lambda index: index.get("queryable") is True
deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while True:
    indices = list(collection.list_search_indexes(index_name))
    if len(indices) and predicate(indices[0]):
        break
    if time.monotonic() >= deadline:
        # Previously this loop had no exit other than success and could hang forever.
        raise TimeoutError(
            f"{index_name} was not queryable after {INDEX_READY_TIMEOUT_S} seconds."
        )
    time.sleep(5)
print(index_name + " is ready for querying.")

In [None]:
# Define a function to run vector search queries
def get_query_results(query, limit=10, index="vector_index"):
    """Run a vector search for ``query`` and return the matching documents.

    Args:
        query: Text to embed with ``get_embedding`` and search for.
        limit: Maximum number of documents to return (default 10).
        index: Name of the vector search index to query
            (default "vector_index").

    Returns:
        A list of dicts, each containing only the ``text`` field of a
        matched document (``_id`` is projected away).
    """
    query_embedding = get_embedding(query)
    pipeline = [
        {
            "$vectorSearch": {
                "index": index,
                "queryVector": query_embedding,
                "path": "embedding",
                # Exact search is requested, so no candidate count is supplied.
                "exact": True,
                "limit": limit,
            }
        },
        {"$project": {"_id": 0, "text": 1}},
    ]

    # Materialise the cursor so callers get a plain, reusable list.
    return list(collection.aggregate(pipeline))

# Test the function with a sample query
# import pprint
# pprint.pprint(get_query_results("Chicken Biryani"))

In [None]:
# Confirm the Hugging Face token is configured without echoing the secret into
# the cell output (a bare `os.environ[...]` expression as the last line of a
# cell would display the token). A missing token still raises KeyError.
if 'HUGGINGFACEHUB_API_TOKEN' not in os.environ:
    raise KeyError('HUGGINGFACEHUB_API_TOKEN')

# Testing responses

In [None]:
from langchain_community.llms import HuggingFaceHub

# LLM used to answer questions over the retrieved context.
# Alternative model that was considered: meta-llama/Llama-3.1-8B-Instruct
hf = HuggingFaceHub(
    repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
    model_kwargs=dict(temperature=0.3),
)

In [None]:
def build_prompt(question, context):
    """Assemble the full LLM prompt from fixed instructions, a question and context.

    The result consists of four sections separated by blank lines: the
    assistant instructions, the question, the retrieved context, and a
    trailing "Answer:" cue for the model to continue from.
    """
    instructions = "\n".join(
        [
            "You are a helpful assistant that answers user questions about restaurants using only the provided restaurant data.",
            "If the answer cannot be determined from the context, reply with 'I'm not sure based on the available information.'",
            "Always be concise, factual, and context-aware.",
        ]
    )

    sections = [
        instructions,
        f"Question: {question}",
        f"Context:\n{context}",
        "Answer:",
    ]
    return "\n\n".join(sections)


In [None]:
question = "Where can i find best fish?"


def format_docs(context_docs):
    """Join the 'text' field of each retrieved document with a visible separator."""
    return "\n\n---\n\n".join(text['text'] for text in context_docs)


# Retrieve context from the MongoDB vector index and build the prompt.
context_docs = get_query_results(question)
context = format_docs(context_docs)
input_prompt = build_prompt(question, context)

#   Invoke LLM
print("\n\n-----------------------------------------------------")
print("🧠 AI Response:\n")
response = hf.invoke(input_prompt)

# The prompt ends with an "Answer" cue. If the response contains it, everything
# after the first occurrence is the answer and everything before it is the
# echoed prompt. If it does not (the model returned only its completion), treat
# the whole response as the answer instead of failing with an IndexError.
echoed_prompt, marker, answer = response.partition("Answer")
if not marker:
    echoed_prompt, answer = "", response

print("AI:", answer)
print("\n\n-----------------------------------------------------")
print(echoed_prompt)
