# LangChain: Evaluation and Debug

 Original Source: [LangChain for LLM Application Development](https://learn.deeplearning.ai/langchain/lesson/6/evaluation) 

In [2]:
# !pip install --upgrade langchain openai docarray

In [34]:
import pandas as pd
import langchain
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.evaluation.qa import QAGenerateChain, QAEvalChain

In [41]:
# OpenAI API KEY
api_key = '?'

### Document data source

In [9]:
# File references
src_file = 'dataset/wine_100.csv'
df = pd.read_csv(src_file)
df.to_csv(src_file, index=False)
df.head()

Unnamed: 0,country,title,description,variety,winery
0,Italy,Nicosia 2013 Vulkà Bianco (Etna),"Aromas include tropical fruit, broom, brimston...",White Blend,Nicosia
1,Portugal,Quinta dos Avidagos 2011 Avidagos Red (Douro),"This is ripe and fruity, a wine that is smooth...",Portuguese Red,Quinta dos Avidagos
2,US,Rainstorm 2013 Pinot Gris (Willamette Valley),"Tart and snappy, the flavors of lime flesh and...",Pinot Gris,Rainstorm
3,US,St. Julian 2013 Reserve Late Harvest Riesling ...,"Pineapple rind, lemon pith and orange blossom ...",Riesling,St. Julian
4,US,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,"Much like the regular bottling from 2012, this...",Pinot Noir,Sweet Cheeks


### Initialize Retrieval QA Chain

In [10]:
# initialize LLM
llm = ChatOpenAI(temperature=0, openai_api_key=api_key)

# Initialize documents
loader = CSVLoader(file_path=src_file)
docs = loader.load()

# initialize embeddings
embedding = OpenAIEmbeddings(openai_api_key=api_key)

# initialize db 
db = DocArrayInMemorySearch.from_documents(
    docs,
    embedding
)

# initialize retriever
retriever = db.as_retriever()

# initialize chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # map_reduce, refine, map_rerank
    retriever=retriever,
    verbose=True
)

### Coming up with test datapoints

In [14]:
docs[1]

Document(page_content="country: Portugal\ntitle: Quinta dos Avidagos 2011 Avidagos Red (Douro)\ndescription: This is ripe and fruity, a wine that is smooth while still structured. Firm tannins are filled out with juicy red berry fruits and freshened with acidity. It's  already drinkable, although it will certainly be better from 2016.\nvariety: Portuguese Red\nwinery: Quinta dos Avidagos", metadata={'source': 'dataset/wine_100.csv', 'row': 1})

In [15]:
docs[2]

Document(page_content='country: US\ntitle: Rainstorm 2013 Pinot Gris (Willamette Valley)\ndescription: Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless-steel fermented.\nvariety: Pinot Gris\nwinery: Rainstorm', metadata={'source': 'dataset/wine_100.csv', 'row': 2})

In [16]:
examples = [
    {
        "query": "Is the wine 'Quinta dos Avidagos 2011 Avidagos Red (Douro)' made in Portugal?",
        "answer": "Yes"
    },
    {
        "query": "Which predominant flavours are perceived in the wine 'Rainstorm 2013 Pinot Gris (Willamette Valley)'?",
        "answer": "Lime flesh and rind."
    }
]

### LLM-Generated examples

In [20]:
example_gen_chain = QAGenerateChain.from_llm(ChatOpenAI(openai_api_key=api_key))

In [21]:
new_examples = example_gen_chain.apply_and_parse(
    [{"doc": t} for t in docs[:5]]
)



In [22]:
new_examples[0]

{'qa_pairs': {'query': 'What are some of the aromas found in the Nicosia 2013 Vulkà Bianco wine?',
  'answer': 'Some of the aromas found in the Nicosia 2013 Vulkà Bianco wine include tropical fruit, broom, brimstone, and dried herb.'}}

In [23]:
new_examples[1]

{'qa_pairs': {'query': 'What is the description of the Quinta dos Avidagos 2011 Avidagos Red wine?',
  'answer': 'The description of the Quinta dos Avidagos 2011 Avidagos Red wine is that it is ripe and fruity, smooth while still structured. It has firm tannins filled out with juicy red berry fruits and freshened with acidity. The wine is already drinkable, although it will certainly be better from 2016.'}}

### Combine examples

In [25]:
combined_examples = examples + new_examples

In [26]:
qa_chain.run(combined_examples[0]["query"])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


"Yes, the wine 'Quinta dos Avidagos 2011 Avidagos Red (Douro)' is made in Portugal."

### Manual evaluation

In [29]:
langchain.debug = True

In [30]:
qa_chain.run(examples[1]["query"])

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Which predominant flavours are perceived in the wine 'Rainstorm 2013 Pinot Gris (Willamette Valley)'?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Which predominant flavours are perceived in the wine 'Rainstorm 2013 Pinot Gris (Willamette Valley)'?",
  "context": "country: US\ntitle: Rainstorm 2013 Pinot Gris (Willamette Valley)\ndescription: Tart and snappy, the flavors of lime flesh and rind dominate. Some green pineapple pokes through, with crisp acidity underscoring the flavors. The wine was all stainless-steel fermented.\nvariety: Pinot Gris\nwinery: Rainstorm\n\ncountry: US\ntitle: Folie à Deux 2015 Pinot Gris (Sonoma Coast)\ndescrip

"The predominant flavors perceived in the wine 'Rainstorm 2013 Pinot Gris (Willamette Valley)' are lime flesh and rind, with some green pineapple poking through."

In [31]:
# Turn off the debug mode
langchain.debug = False

## LLM assisted evaluation

In [33]:
predictions = qa_chain.apply(examples)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


In [35]:
eval_chain = QAEvalChain.from_llm(llm)

In [36]:
graded_outputs = eval_chain.evaluate(examples, predictions)

In [40]:
for i, eg in enumerate(examples):
    print(f"Example {i}:")
    print("Question: " + predictions[i]['query'])
    print("Real Answer: " + predictions[i]['answer'])
    print("Predicted Answer: " + predictions[i]['result'])
    print()

Example 0:
Question: Is the wine 'Quinta dos Avidagos 2011 Avidagos Red (Douro)' made in Portugal?
Real Answer: Yes
Predicted Answer: Yes, the wine 'Quinta dos Avidagos 2011 Avidagos Red (Douro)' is made in Portugal.

Example 1:
Question: Which predominant flavours are perceived in the wine 'Rainstorm 2013 Pinot Gris (Willamette Valley)'?
Real Answer: Lime flesh and rind.
Predicted Answer: The predominant flavors perceived in the wine 'Rainstorm 2013 Pinot Gris (Willamette Valley)' are lime flesh and rind, with some green pineapple poking through.
