# Simple LangChain Index of Recipes to a Vector DB

Recipe data from: https://www.kaggle.com/datasets/pes12017000148/food-ingredients-and-recipe-dataset-with-images?rvi=1

## Import Environment Variables from File
A .env file containing OPENAI_API_KEY and LANGSMITH_API_KEY is required.

In [None]:
from environs import Env
import os
import bs4
from langchain import hub
import csv
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate

# Create the environs reader and load variables from the project's .env file.
# NOTE: the path is absolute and machine-specific; adjust it when running elsewhere.
env = Env()
env.read_env("/Users/geoffreysmalling/development/langchain/.env")

## connect to OpenAI

In [None]:
# Chat model client; the OpenAI key comes from the values loaded into `env`.
llm = ChatOpenAI(api_key=env.str("OPENAI_API_KEY"), model="gpt-3.5-turbo-0125")
# Publish the LangSmith settings as process environment variables
# (presumably what switches on LangSmith tracing for the chains below -- confirm).
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = env.str("LANGSMITH_API_KEY")



## Test llm connection and get answer

In [None]:
# Smoke test: send one question straight to the chat model (no retrieval involved).
# NOTE(review): "slamon" in the question looks like a typo for "salmon".
llm.invoke("How do you prepare slamon sushi?")

# Load, chunk, and index the contents of the source
Use the native CSV parser to parse the file.

In [None]:

# Build one LangChain Document per recipe row of the CSV file.
#
# Column layout:
#   - the recipe id is always read from column 0 (its header cell may be blank)
#   - "Title", "Instructions" and "Cleaned_Ingredients" are located by header
#     name, so the text that gets embedded and the metadata always read the
#     SAME column for a given field
#   - if a name is missing from the header, the positions this notebook
#     originally documented are used instead (title 1, instructions 4,
#     cleaned ingredients 5)
#
# For every data row the title, instructions and cleaned ingredients are
# concatenated into the document's page_content; the individual fields plus a
# human-readable source string are stored as metadata.
from langchain_core.documents import Document

RECIPES_CSV_PATH = '/Users/geoffreysmalling/development/langchain/data/epicurious_recipe.csv'


def _find_column(header, name, fallback):
    """Return the index of column `name` in `header`, or `fallback` if absent.

    The comparison ignores case and surrounding whitespace.
    """
    wanted = name.casefold()
    for position, label in enumerate(header):
        if label.strip().casefold() == wanted:
            return position
    return fallback


recipes_docs = list()
line_count = 0
# newline='' lets the csv module itself handle line breaks inside quoted fields.
with open(RECIPES_CSV_PATH, newline='', encoding='utf-8') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')

    # The first row is the header: report it and use it to locate the columns.
    header = next(csv_reader, None)
    if header is not None:
        print(f'Column names are {", ".join(header)}')
        line_count += 1
    header = header or []
    title_col = _find_column(header, "Title", 1)
    instructions_col = _find_column(header, "Instructions", 4)
    ingredients_col = _find_column(header, "Cleaned_Ingredients", 5)
    required_len = max(title_col, instructions_col, ingredients_col) + 1

    for row in csv_reader:
        line_count += 1
        if len(row) < required_len:
            # Blank or truncated row: nothing usable to index.
            continue
        title = row[title_col]
        instructions = row[instructions_col]
        cleaned_ingredients = row[ingredients_col]
        recipe_text = title + "\n\n " + instructions + "\n\n" + cleaned_ingredients
        source = "epicurious - recipe title " + title
        recipe = {
            "id": row[0],
            "title": title,
            "instructions": instructions,
            "cleaned_ingredients": cleaned_ingredients,
            "recipe_text": recipe_text,
            "source": source,
        }
        # build a langchain document for each recipe
        recipes_docs.append(Document(page_content=recipe_text, metadata=recipe))

print(f'Processed {line_count} lines.')
if recipes_docs:
    # Show one example so the document structure can be inspected.
    print("type of docs object: " + str(type(recipes_docs[0])))
    print("first doc: " + str(recipes_docs[0]))
print("number of docs: " + str(len(recipes_docs)))






## split the docs into chunks


In [None]:
# Cut every recipe document into overlapping chunks (chunk_size=1000,
# chunk_overlap=200) and report a few facts about the result.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(recipes_docs)
print(f"type of splits object: {type(splits)}")
print(f"number of splits: {len(splits)}")
# Index 10 is an arbitrary sample; it requires at least 11 chunks to exist.
print(f"metadata example: {splits[10].metadata}")

## store in a vector store using openAI Embeddings model

In [None]:
# Embed every chunk with OpenAIEmbeddings and index the results in a Chroma store.
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

In [None]:
# Sanity check: show which class the vector store object is.
print(f"type of vectorstore object: {type(vectorstore)}")

## create a retriever and pull a prompt from the langsmith hub

In [None]:
# Similarity-search retriever over the vector store, configured with k=30.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 30})
# Try the retriever on a sample question and inspect what comes back.
retrieved_docs = retriever.invoke("what recipes contain garlic, rosemary, pepper and chicken")
print(len(retrieved_docs))
# NOTE(review): the content shown is from result 2 but the source is from
# result 3 -- confirm whether the differing indices are intentional.
print(retrieved_docs[2].page_content)
print(retrieved_docs[3].metadata['source'])




## download a prompt for rag from the hub

In [None]:

# The hub prompt is disabled; a locally defined template is used instead.
#prompt = hub.pull("rlm/rag-prompt")
from langchain_core.prompts import ChatPromptTemplate

# Prompt template with two placeholders: {context} and {question}.
# NOTE(review): the template text contains typos ("contrain", "preperation",
# "instrucitons"); they are part of the text sent to the model, so they are
# left unchanged here -- consider correcting them deliberately.
template = """You are a cooking assistant. You are to help people find recipes and prepare them. Use the following context to contrain your recipe knowledge for the query
{context}

Question: {question}

Take some time to think about the question and the context.

If the user is asking for a recipe, give them instructions for the recipe in the following format:
Ingredients: List the ingredients needed for this recipe

If there are any preperation steps required, list that first as prep instrucitons.

Prep Instructions: 
  estimate the prep time
  list the steps to prepare the recipe.

Instructions: 
    estimate the cook time
    List the steps to cook this recipe. 

Cite the recipe you based your answer off of from the context.

If the user is looking for general knowledge on a technique of cooking, explain the technique and list them sample recipes to try.

Also, suggest some similar recipes to the user, in addition to the one you picked

"""
# Wrap the raw text in a ChatPromptTemplate so it can be piped into a chain.
prompt = ChatPromptTemplate.from_template(template)


### create a method to break each chunk of docs into a new paragraph for the prompt context

In [None]:
def format_docs(docs):
    """Return the page_content of every doc in `docs`, separated by blank lines."""
    paragraphs = [document.page_content for document in docs]
    return "\n\n".join(paragraphs)
    

## Build the LangChain
- Use the retriever to get context docs and then join them with format_docs
- Get the question to the llm via RunnablePassthrough
- pass context and question to the prompt
- pass prompt to llm
- parse results to a string, vs a message

In [None]:
# RAG pipeline: the input question is passed to the retriever (whose results
# are joined into one text by format_docs) and, unchanged, to the "question"
# slot; both fill the prompt, which is then sent to the chat model.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    # String parsing is disabled, so the chain ends at the llm step and its
    # output is whatever the llm returns rather than a plain string.
    #| StrOutputParser()
)

In [None]:
# Run the RAG chain on a sample question and keep the result for inspection.
response = rag_chain.invoke("What are ways to bbq or grill chicken?")

In [None]:
# Display the chain's result.
print(response)

## adding sources


In [None]:
from langchain_core.runnables import RunnableParallel

# Answer-producing sub-chain: replaces the "context" entry of its input with
# the text produced by format_docs, fills the prompt, calls the chat model and
# parses the output with StrOutputParser.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

# Outer chain: runs the retriever and passes the question through side by
# side, then adds an "answer" entry computed by the sub-chain above, so the
# retrieved documents stay available next to the answer.
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

# NOTE(review): "spicey" in the question looks like a typo for "spicy".
response = rag_chain_with_source.invoke("My kids do not like spicey food. My kids like simple food.  What can I make my kids with pasta and chicken?")

print(type(response))

In [None]:
### we can get the context and source from the metadata

In [None]:
# Show the 'source' metadata of the second retrieved context document (index 1).
print(response['context'][1].metadata['source'])

In [None]:
# Show the generated answer stored under the 'answer' key.
print(response['answer'])