# LangChain: Evaluation

## Outline:

* Example generation
* Manual evaluation (and debuging)
* LLM-assisted evaluation
* LangChain evaluation platform

In [1]:
import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

Note: LLM's do not always produce the same results. When executing the code in your notebook, you may get slightly different answers that those in the video.

In [2]:
# account for deprecation of LLM model
import datetime
# Get the current date
current_date = datetime.datetime.now().date()

# Define the date after which the model should be set to "gpt-3.5-turbo"
target_date = datetime.date(2024, 6, 12)

# Set the model variable based on the current date
if current_date > target_date:
    llm_model = "gpt-3.5-turbo"
else:
    llm_model = "gpt-3.5-turbo-0301"

## Create our QandA application

In [3]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.document_loaders import CSVLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch

In [4]:
file = 'datasets/ts_discography.csv'
loader = CSVLoader(file_path=file)
data = loader.load()

In [5]:
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch
).from_loaders([loader])

  warn_deprecated(


In [6]:
llm = ChatOpenAI(temperature = 0.0, model=llm_model)
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=index.vectorstore.as_retriever(), 
    verbose=True,
    chain_type_kwargs = {
        "document_separator": "<<<<>>>>>"
    }
)

### Coming up with test datapoints

In [17]:
dict(data[105])['page_content'][:350]

"album_title: Red (Deluxe Version)\nalbum_url: https://genius.com/albums/Taylor-Swift/Red-Deluxe-Version\ncategory: Red\nalbum_track_number: 10\nsong_title: The Last Time (Ft. Gary Lightbody)\nsong_url: https://genius.com/Taylor-swift-the-last-time-lyrics\nsong_artists: ['Taylor Swift', 'Gary Lightbody']\nsong_release_date: 2012-10-22\nsong_page_views: 2206"

In [18]:
dict(data[104])['page_content'][:350]

"album_title: Red (Deluxe Version)\nalbum_url: https://genius.com/albums/Taylor-Swift/Red-Deluxe-Version\ncategory: Red\nalbum_track_number: 9\nsong_title: Stay Stay Stay\nsong_url: https://genius.com/Taylor-swift-stay-stay-stay-lyrics\nsong_artists: ['Taylor Swift']\nsong_release_date: 2012-10-22\nsong_page_views: 139300\nsong_lyrics: I'm pretty sure we alm"

### Hard-coded examples

In [19]:
examples = [
    {
        "query": "What is the first song in the album Red (Deluxe Version)?",
        "answer": "State of Grace"
    },
    {
        "query": "Who is the most recent song released?",
        "answer": "October 27th, 2023"
    }
]

### LLM-Generated examples

In [20]:
from langchain.evaluation.qa import QAGenerateChain

In [21]:
example_gen_chain = QAGenerateChain.from_llm(ChatOpenAI(model=llm_model))

In [22]:
# the warning below can be safely ignored

In [23]:
new_examples = example_gen_chain.apply_and_parse(
    [{"doc": t} for t in data[5:10]]
)



In [24]:
query_answer_list = [{'query': item['qa_pairs']['query'], 'answer': item['qa_pairs']['answer']} for item in new_examples]

In [28]:
dict(data[0])['page_content'][:300]

"album_title: Taylor Swift\nalbum_url: https://genius.com/albums/Taylor-Swift/Taylor-Swift\ncategory: Taylor Swift\nalbum_track_number: 1\nsong_title: Tim McGraw\nsong_url: https://genius.com/Taylor-swift-tim-mcgraw-lyrics\nsong_artists: ['Taylor Swift']\nsong_release_date: 2006-06-19\nsong_page_views: 24140"

### Combine examples

In [30]:
examples += query_answer_list

In [37]:
qa.invoke(examples[3]["query"])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': 'Who wrote and produced the song "Tied Together with a Smile" by Taylor Swift?',
 'result': 'The song "Tied Together with a Smile" by Taylor Swift was written by Angelo Petraglia, Robert Ellis Orrall, and Taylor Swift, and produced by Angelo Petraglia and Robert Ellis Orrall.'}

## Manual Evaluation

In [38]:
import langchain
langchain.debug = True

In [39]:
qa.run(examples[3]["query"])

  warn_deprecated(


[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Who wrote and produced the song \"Tied Together with a Smile\" by Taylor Swift?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Who wrote and produced the song \"Tied Together with a Smile\" by Taylor Swift?",
  "context": "album_title: Taylor Swift\nalbum_url: https://genius.com/albums/Taylor-Swift/Taylor-Swift\ncategory: Taylor Swift\nalbum_track_number: 7\nsong_title: Tied Together with a Smile\nsong_url: https://genius.com/Taylor-swift-tied-together-with-a-smile-lyrics\nsong_artists: ['Taylor Swift']\nsong_release_date: 2006-10-24\nsong_page_views: 106500<<<<>>>>>song_writers: ['Angelo Petraglia', 'Robert Ellis Orrall', 'Taylor Swift']\ns

'The song "Tied Together with a Smile" by Taylor Swift was written by Angelo Petraglia, Robert Ellis Orrall, and Taylor Swift, and produced by Angelo Petraglia and Robert Ellis Orrall.'

In [40]:
# Turn off the debug mode
langchain.debug = False

## LLM assisted evaluation

In [41]:
predictions = qa.batch(inputs=examples)



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


In [42]:
from langchain.evaluation.qa import QAEvalChain

In [43]:
llm = ChatOpenAI(temperature=0, model=llm_model)
eval_chain = QAEvalChain.from_llm(llm)

In [44]:
graded_outputs = eval_chain.evaluate(examples, predictions)

In [45]:
graded_outputs

[{'results': 'INCORRECT'},
 {'results': 'INCORRECT'},
 {'results': 'CORRECT'},
 {'results': 'INCORRECT'},
 {'results': 'INCORRECT'},
 {'results': 'CORRECT'},
 {'results': 'CORRECT'}]

In [46]:
for i, eg in enumerate(examples):
    print(f"Example {i}:")
    print("Question: " + predictions[i]['query'])
    print("Real Answer: " + predictions[i]['answer'])
    print("Predicted Answer: " + predictions[i]['result'])
    print("Predicted Grade: " + graded_outputs[i]['results'])
    print()

Example 0:
Question: What is the first song in the album Red (Deluxe Version)?
Real Answer: State of Grace
Predicted Answer: The first song in the album Red (Deluxe Version) is not provided in the given context. The album track number for the song is also not in order. Therefore, I don't have the answer to your question.
Predicted Grade: INCORRECT

Example 1:
Question: Who is the most recent song released?
Real Answer: October 27th, 2023
Predicted Answer: The most recent song released is "Only The Young" by Taylor Swift, released on January 31, 2020.
Predicted Grade: INCORRECT

Example 2:
Question: What is the release date of the Taylor Swift song "The Outside"?
Real Answer: The release date of the Taylor Swift song "The Outside" is October 24, 2006.
Predicted Answer: The release date of the Taylor Swift song "The Outside" is 2006-10-24.
Predicted Grade: CORRECT

Example 3:
Question: Who wrote and produced the song "Tied Together with a Smile" by Taylor Swift?
Real Answer: The song "Ti