## Dependencies

In [None]:
%pip install langchain faiss-cpu dotenv openai beautifulsoup4 lark

## Vector Datastore - Basic example of text splitting & Document creation

In [None]:
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

# Load the speech from disk, split it into chunks (chunk_size=1000, no overlap),
# then embed the chunks with OpenAI and index them in a FAISS store.
loader = TextLoader('./state_of_the_union.txt')
raw_documents = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)

embedding_model = OpenAIEmbeddings()
db = FAISS.from_documents(documents, embedding_model)

In [None]:
# Run a similarity search against the speech index and print the text of the
# first chunk it returns.
query = "What did the president say about Ketanji Brown Jackson"
docs = db.similarity_search(query)
print(docs[0].page_content)

## Vector Store - Data Prep (Tagging)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv
import os

# Load .env file
load_dotenv('./.env')

# Read OPENAI_API_KEY from the environment (the only key this cell reads)
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# initialize LLM (we use ChatOpenAI because we'll later define a `chat` agent)
# temperature=0 is requested so tagging output varies as little as possible.
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
    model_name='gpt-3.5-turbo'
)


# Tagging prompt: `{recipe}` is replaced with the text of one document.
# The example line at the end shows the model the exact output format; the
# stray closing `]` that used to end that line has been removed so the example
# is well-formed.
prompt = PromptTemplate.from_template(
"""You are a content tagging bot for a food and drink blog, and your role is to identify and tag recipes. Specify whether the item is a food or drink and then focus on the type (e.g., vegan, gluten-free), unique ingredients (e.g., dried fruit, super seeds), cooking or preparation techniques (e.g., grilling, soaking, mixing), dietary restrictions, cultural origins, meal or occasion types (e.g., breakfast, lunch, dinner, cocktail party), and special flavors or features (e.g., sweet, savory, spicy) that stand out in the given recipe.

Recipe: {recipe}

Feel free to add any tags that may provide insightful information about the dish or drink. There's no maximum number of tags, so be thorough and descriptive in your tagging, as it helps readers find recipes that match their preferences.
YOU MUST return the tags in the following format:

Category: Drink, Type: Vegan, Unique Ingredients: Mint leaves, Preparation Techniques: Mixing, Dietary Restrictions: Gluten-free, Cultural Origins: Cuban, Occasion: Cocktail party, Special Features: Refreshing

"""
)

# Chain that feeds the filled-in tagging prompt to the chat model.
chain = LLMChain(llm=llm, prompt=prompt)

async def generateTagsFromContent(content):
    """Ask the tagging chain for tags describing one piece of content.

    Args:
        content: Text substituted into the prompt's ``{recipe}`` placeholder.

    Returns:
        The chain's output mapping; callers read the generated tags from its
        ``'text'`` entry.
    """
    # Await the asynchronous entry point. The synchronous `invoke` blocks the
    # event loop, which made `asyncio.gather` run the requests one at a time.
    return await chain.ainvoke({"recipe": content})

In [None]:
## Load the JSON API export, clean the HTML, and create a Document object for each page

from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from bs4 import BeautifulSoup
import asyncio
import json

# Open the file for reading. JSON is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding.
with open('./ingest/composetheweb.json', 'r', encoding='utf-8') as file:
    jsonapi_export = json.load(file)

def clean_html(html_content):
    """Parse ``html_content`` with the 'html.parser' backend and return its text."""
    soup = BeautifulSoup(html_content, 'html.parser')
    return soup.get_text()


def create_document(doc):
    """Turn one entry of the JSON export into a ``Document``.

    The metadata always carries the entry's title and its path alias (as
    ``source``). Entries of type ``node--recipe`` get a page content built
    from title, difficulty, ingredients, instructions and summary; every
    other entry gets its title followed by its cleaned body.

    Args:
        doc: Mapping with a ``type`` key and an ``attributes`` mapping.

    Returns:
        A ``Document`` with the assembled ``page_content`` and ``metadata``.
    """
    attributes = doc['attributes']
    title = attributes['title']
    metadata = {'title': title, 'source': attributes['path']['alias']}

    # Non-recipe entries only have a body to index.
    if doc['type'] != 'node--recipe':
        body = clean_html(str(attributes['body']['value']))
        return Document(metadata=metadata, page_content=f"Title: {title} - {body}")

    # Recipe entries: HTML-bearing fields are stripped down to plain text.
    difficulty = str(attributes['field_difficulty'])
    ingredients = str(attributes['field_ingredients'])
    recipe = clean_html(str(attributes['field_recipe_instruction']['value']))
    summary = clean_html(str(attributes['field_summary']['value']))

    page_content = f"Title: {title},\nDifficulty: {difficulty},\n\nIngredients: {ingredients},\n\nRecipe: {recipe}, \n\nSummary: {summary}"
    return Document(metadata=metadata, page_content=page_content)


docs = [create_document(doc) for doc in jsonapi_export['data']]


generateTagPromises = [generateTagsFromContent(doc.page_content) for doc in docs]
tags = await asyncio.gather(*generateTagPromises)

In [None]:
# Print the length of the docs and the tags (the two counts should match)
print(len(docs))
print(len(tags))


# Spot-check one document next to the tags generated for it
sample_index = 8
print(docs[sample_index].page_content)
print('------------------')
print(tags[sample_index]['text'])

In [None]:
# Rebuild every document with its generated tags merged into the metadata
# under the 'tags' key; the page content is carried over unchanged.
updatedDocs = [
    Document(
        metadata={**original.metadata, 'tags': tags[position]['text']},
        page_content=original.page_content,
    )
    for position, original in enumerate(docs)
]

print(updatedDocs[8])

In [None]:
print("Creating vector store...")

# Embed the tagged documents, index them in FAISS, and save the index
# under the local "data" path.
tagged_embeddings = OpenAIEmbeddings()
vectorstore: FAISS = FAISS.from_documents(updatedDocs, tagged_embeddings)
vectorstore.save_local("data")

In [None]:
print(query)

# `k` is the parameter that limits the number of hits. The previous `n_docs`
# keyword is not part of the method's signature, so the default limit applied.
found_docs = vectorstore.similarity_search_with_relevance_scores(query, k=5, score_threshold=0.7)

# Each hit is a (Document, relevance_score) pair. The score threshold can
# filter out every hit, so guard before indexing into the result.
if found_docs:
    best_doc, best_score = found_docs[0]
    print(best_doc.metadata['title'])
    print(best_doc.metadata)
else:
    print("No documents scored above the relevance threshold.")

## Document Retrievers

### Similarity Search with relevance scores

In [None]:
def retriever_with_source(query, n_docs):
    """Return up to ``n_docs`` hits for ``query`` together with their scores.

    Args:
        query: Search string to embed and look up in ``vectorstore``.
        n_docs: Maximum number of hits to return.

    Returns:
        A list of ``(Document, relevance_score)`` pairs, limited to hits
        with a relevance score of at least 0.7. The source of a hit is
        available as ``document.metadata["source"]``.
    """
    # The keyword was previously misspelled (`score_threshhold`), so the
    # threshold never took effect. The old loop also read `.metadata` on the
    # (Document, score) tuples and only assigned "source" to itself, so it was
    # dropped.
    return vectorstore.similarity_search_with_relevance_scores(
        query, k=n_docs, score_threshold=0.7
    )

In [None]:
from langchain.chains import RetrievalQAWithSourcesChain

# Build a question-answering chain over the vector store's retriever using
# the sources-aware chain class.
qa_chain = RetrievalQAWithSourcesChain.from_chain_type(llm, retriever=vectorstore.as_retriever())

# This chain takes its input under the "question" key.
result = qa_chain({"question": query})
# Bare expression: shown as the cell output in a notebook.
result

### RetrievalQA Chain w/ context

In [None]:
## RetrievalQA chain w/ context

from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.prompts import PromptTemplate

# Answer prompt: `{context}` receives the retrieved documents and
# `{question}` the user's query.
template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Answer with single sentence description of the dish. Never provide commentary on the context. DO NOT INCLUDE THE RECIPES IN YOUR ANSWER. Finish the answer with a question if the user would like to see the full recipe.
{context}

Question:
{question}

"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Retrieval QA chain that uses the custom prompt and also returns the
# documents it retrieved.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
    retriever=vectorstore.as_retriever(),
)

# This chain takes its input under the "query" key.
result = qa_chain({"query": query })

# Show the answer first, then the source of the first retrieved document.
print(result['result'])
first_source_document = result["source_documents"][0]
print(first_source_document.metadata["source"])

In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.chains import create_qa_with_sources_chain
from langchain.memory import ChatMessageHistory
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chains import LLMChain


# Answer prompt for the conversational chain: `{context}` receives the
# retrieved documents, `{chat_history}` the conversation so far and
# `{question}` the current question.
template = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know, don't try to make up an answer. 
Answer with single sentence description of the dish. Never provide commentary on the context. DO NOT INCLUDE THE RECIPES IN YOUR ANSWER. Finish the answer with a question if the user would like to see the full recipe.
{context}

History:
{chat_history}

Question:
{question}

"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)


# Memory stores the running conversation under 'chat_history'. `output_key`
# tells it which chain output to record, which is needed because the chain
# also returns source documents.
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True, output_key='answer')
retriever = vectorstore.as_retriever()

# Create the multipurpose chain.
# The answer prompt belongs to the combine-documents step. It was previously
# passed as `condense_question_prompt`, but that prompt only rewrites a
# follow-up into a standalone question and is never given `{context}`, so
# follow-up turns could not be formatted.
qachat = ConversationalRetrievalChain.from_llm(
    llm,
    memory=memory,
    retriever=retriever,
    return_source_documents=True,
    combine_docs_chain_kwargs={"prompt": QA_CHAIN_PROMPT},
)

question = "Do we have any recipes for Ukrainian food?"
qachat(question)


In [None]:
from langchain.prompts import PromptTemplate

# Answer prompt with three placeholders: `{context}`, `{chat_history}` and
# `{question}`.
template = """Answer the question in your own words as truthfully as possible from the context given to you.
If you do not know the answer to the question, simply respond with "I don't know. Can you ask another question".
If questions are asked where there is no relevant context available, simply respond with "I don't know. Please ask a question relevant to the documents"
Context: {context}


{chat_history}
Human: {question}
Assistant:"""

# NOTE(review): this rebinds the global `prompt` that was first assigned to the
# recipe-tagging prompt earlier in the notebook.
prompt = PromptTemplate(
    input_variables=["context", "chat_history", "question"], template=template
)

def get_chat_history(inputs) -> str:
    """Render a conversation history as ``Human:``/``AI:`` lines.

    Args:
        inputs: Iterable of history entries. Each entry is either a
            ``(human, ai)`` pair or a message object exposing ``type`` and
            ``content`` attributes, which is what a memory created with
            ``return_messages=True`` supplies.

    Returns:
        The entries joined with newlines; an empty string for no history.
    """
    lines = []
    for entry in inputs:
        if isinstance(entry, (tuple, list)):
            human, ai = entry
            lines.append(f"Human:{human}\nAI:{ai}")
        else:
            # Message object: label it by its role instead of unpacking it.
            speaker = "Human" if getattr(entry, "type", None) == "human" else "AI"
            lines.append(f"{speaker}:{entry.content}")
    return "\n".join(lines)

# Conversational chain that uses the custom answer prompt and the custom
# history formatter, and also returns the retrieved documents.
# NOTE(review): this rebinds the global `chain`, which `generateTagsFromContent`
# looks up at call time. Re-running the tagging cells after this one would hit
# this chain instead of the tagging chain.
chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    memory=memory,
    get_chat_history=get_chat_history,
    return_source_documents=True,
    combine_docs_chain_kwargs={'prompt': prompt},
)


In [None]:
from langchain.chains import ConversationalRetrievalChain

# Memory-less conversational chain: the history is passed in explicitly on
# every call instead of being tracked by the chain.
qa = ConversationalRetrievalChain.from_llm(
    llm,
    vectorstore.as_retriever(),
    return_source_documents=True,
)

# Start with an empty history for the first turn.
chat_history = []
query = "What is the full recipe for the borst?"
result = qa({"question": query, "chat_history": chat_history})

# Show the answer first, then the source of the first retrieved document.
print(result['answer'])
first_source_document = result['source_documents'][0]
print(first_source_document.metadata["source"])

In [None]:



# Record the previous exchange first: the follow-up below only makes sense
# relative to the earlier question and its answer.
chat_history.append((query, result["answer"]))

query = "Can you provide me with the full recipe?"

# The history is supplied per call under the "chat_history" input key;
# `from_llm` has no `chat_history` argument, so the existing `qa` chain is
# reused and actually asked the follow-up question.
result = qa({"question": query, "chat_history": chat_history})
print(result["answer"])