In [None]:
# Install the notebook's dependencies; -qqq silences pip's output
!pip install -qqq datasets pandas pymongo sentence_transformers accelerate gradio
# -U upgrades transformers to the newest available release
!pip install -qqq -U transformers

In [None]:
# Load Dataset
import pandas as pd

In [None]:
# Dataset: RecipeNLG (https://huggingface.co/datasets/recipe_nlg)
# Download the archive from https://recipenlg.cs.put.poznan.pl/ and upload it to Google Drive,
# then extract it into /content.
# NOTE(review): no drive.mount(...) cell is visible here — confirm Drive is mounted at /content/drive first
!unzip /content/drive/MyDrive/data/recipe-rag/dataset.zip -d /content

In [None]:
# Print the first three lines of the extracted CSV to check its layout
!head -n3 /content/dataset/full_dataset.csv

In [None]:
from sentence_transformers import SentenceTransformer

# https://huggingface.co/thenlper/gte-large
# Sentence-embedding model shared by recipe ingestion and query search (both go through get_embedding)
embedding_model = SentenceTransformer("thenlper/gte-large")

In [None]:
def get_embedding(text: str) -> list[float]:
    """Embed ``text`` with the module-level ``embedding_model``.

    Args:
        text: The string to embed.

    Returns:
        The embedding converted to a plain Python list, or an empty list
        (after printing a notice) when ``text`` is empty or only whitespace.
    """
    if text.strip():
        # Convert the encoder output to a plain list before returning it
        return embedding_model.encode(text).tolist()

    print("Attempted to get embedding for empty text.")
    return []

In [None]:
import pymongo
from google.colab import userdata


def get_mongo_client(mongo_uri):
    """Create a MongoDB client and verify that the server is reachable.

    ``pymongo.MongoClient`` connects lazily, so building the client does not
    prove that the deployment can be reached. A ``ping`` command is issued to
    force a round trip before success is reported.

    Args:
        mongo_uri: MongoDB connection string.

    Returns:
        A ``pymongo.MongoClient`` that answered the ping, or ``None`` when a
        ``ConnectionFailure`` occurred (the failure is printed). Other pymongo
        errors are not caught here and propagate to the caller.
    """
    client = None
    try:
        client = pymongo.MongoClient(mongo_uri)
        # Force server selection now; without it unreachable hosts go unnoticed
        client.admin.command("ping")
    except pymongo.errors.ConnectionFailure as e:
        print(f"Connection failed: {e}")
        if client is not None:
            # Release the client's background resources on the failure path
            client.close()
        return None

    print("Connection to MongoDB successful")
    return client


# The connection string is read from the Colab secret named MONGO_URI
mongo_uri = userdata.get("MONGO_URI")
if not mongo_uri:
    # Fail fast: continuing without a URI only produces a confusing error later
    raise ValueError("MONGO_URI not set in Colab secrets")

mongo_client = get_mongo_client(mongo_uri)
if mongo_client is None:
    # get_mongo_client returns None on connection failure; indexing None would raise TypeError
    raise ConnectionError("Could not connect to MongoDB; check MONGO_URI and network access")

# Database and collection handles used by the ingestion and search cells
db = mongo_client["recipe"]
collection = db["recipe_collection"]

In [None]:
def unlistify_string(input):
  """Flatten the text of a stringified list into plain text.

  Steps, in order: every occurrence of `" , "` becomes a single space, all
  `[`, `]` and `"` characters are deleted, and every `.,` is shortened to `.`.

  Args:
    input: The string to clean up.

  Returns:
    The cleaned string.
  """
  bracket_and_quote_remover = str.maketrans("", "", '[]"')
  joined = input.replace('" , "', " ")
  bare = joined.translate(bracket_and_quote_remover)
  return bare.replace(".,", ".")

In [None]:
def populate_dataset(path="/content/dataset/full_dataset.csv", n=1000, clear_existing=True):
    """Read recipes from a CSV file, embed their directions and store them in ``collection``.

    The first ``n`` rows are read, the ``link``, ``source`` and ``NER`` columns
    are dropped, and rows with any missing value are discarded. The
    ``ingredients`` and ``directions`` columns are cleaned with
    ``unlistify_string``; an ``embedding`` column is computed from the cleaned
    directions with ``get_embedding``.

    Args:
        path: Location of the CSV file.
        n: Number of rows to read from the top of the file.
        clear_existing: When true (the default, matching the previous
            behaviour) every document already in ``collection`` is deleted
            before the new ones are inserted.

    Returns:
        None. If no usable rows remain, a notice is printed and the collection
        is left untouched.
    """
    dataset_df = (
        pd.read_csv(path, nrows=n, index_col=0)
        .drop(columns=["link", "source", "NER"])
        .dropna()
    )

    if dataset_df.empty:
        # insert_many rejects an empty list, and wiping the collection for nothing would lose data
        print("No valid rows found in dataset; collection left unchanged.")
        return

    dataset_df["ingredients"] = dataset_df["ingredients"].apply(unlistify_string)
    dataset_df["directions"] = dataset_df["directions"].apply(unlistify_string)
    dataset_df["embedding"] = dataset_df["directions"].apply(get_embedding)
    documents = dataset_df.to_dict("records")

    if clear_existing:
        # Start from an empty collection so repeated runs do not duplicate recipes
        collection.delete_many({})
    collection.insert_many(documents)

In [None]:
# Ingest the first 1000 CSV rows; by default this replaces whatever the collection already holds
populate_dataset(n=1000)

In [None]:
def vector_search(user_query, collection, num_candidates=150, limit=4):
    """Run a ``$vectorSearch`` aggregation for ``user_query`` against ``collection``.

    Args:
        user_query: Text to embed and search for.
        collection: Object exposing ``aggregate(pipeline)``; the search targets
            the ``embedding`` field through the index named ``vector_index``.
        num_candidates: Value passed as ``numCandidates`` (candidate matches to consider).
        limit: Value passed as ``limit`` (number of matches returned).

    Returns:
        A list of result documents carrying ``title``, ``ingredients``,
        ``directions`` and ``score``; or the string
        ``"Invalid query or embedding generation failed."`` when no embedding
        could be produced for the query.
    """
    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    # get_embedding signals failure with an empty list rather than None, so
    # test truthiness; an `is None` check would let the empty vector through
    if not query_embedding:
        return "Invalid query or embedding generation failed."

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": num_candidates,  # Number of candidate matches to consider
                "limit": limit,  # Number of top matches to return
            }
        },
        {
            "$project": {
                "_id": 0,
                "title": 1,
                "ingredients": 1,
                "directions": 1,
                "score": {"$meta": "vectorSearchScore"},  # Include the search score
            }
        },
    ]

    # Execute the search and materialize the cursor
    return list(collection.aggregate(pipeline))

In [None]:
def get_search_result(query, collection):
    """Format the vector-search hits for ``query`` as one text block.

    Args:
        query: Text passed on to ``vector_search``.
        collection: Collection passed on to ``vector_search``.

    Returns:
        One line per hit in the form
        ``Recipe Name: ..., Ingredients: ..., Directions: ...`` (missing fields
        are shown as ``N/A``). An empty string when there are no hits. If
        ``vector_search`` returns its failure message instead of a list, that
        message is returned unchanged.
    """
    get_knowledge = vector_search(query, collection)

    # vector_search reports a failed query embedding as a plain message string
    if isinstance(get_knowledge, str):
        return get_knowledge

    # join copes with zero hits, where indexing the first hit would raise IndexError
    return "".join(
        f"Recipe Name: {result.get('title', 'N/A')}, Ingredients: {result.get('ingredients', 'N/A')}, Directions: {result.get('directions', 'N/A')}\n"
        for result in get_knowledge
    )

In [None]:
# Conduct query with retrieval of sources
query = "What is the best recipe for making a dish using milk?"

# Retrieve the matching recipes as text and append them to the query to form the LLM prompt
source_information = get_search_result(query, collection)
combined_information = f"Query: {query}\nContinue to answer the query by using the Search Results:\n{source_information}."

# Show the assembled prompt for inspection
print(combined_information)

In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub using the HF_TOKEN Colab secret
# (presumably required to download the Gemma weights — confirm)
login(token=userdata.get("HF_TOKEN"))

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and causal language model for google/gemma-2b-it;
# device_map="auto" leaves device placement of the weights to the library
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", device_map="auto")

In [None]:
# Move the tokenized prompt to the device the model was placed on. The model is
# loaded with device_map="auto", so hard-coding "cuda" fails on a runtime without a GPU.
input_ids = tokenizer(combined_information, return_tensors="pt").to(model.device)
# Generate up to 500 new tokens and print the decoded sequence
response = model.generate(**input_ids, max_new_tokens=500)
print(tokenizer.decode(response[0]))