# LangChain: Evaluation

## Outline:

* Example generation
* Manual evaluation (and debuging)
* LLM-assisted evaluation

In [1]:
import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

In [2]:
# Set OpenAI API key
os.environ["OPENAI_API_TYPE"] = os.getenv("api_type")
os.environ["OPENAI_API_BASE"] = os.getenv("api_base")
os.environ["OPENAI_API_VERSION"] = os.getenv("api_version")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

## Create our QandA application

In [8]:
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
from langchain.document_loaders import CSVLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.embeddings import OpenAIEmbeddings

In [15]:
file = 'OutdoorClothingCatalog_1000.csv'
loader = CSVLoader(file_path=file, encoding='utf-8')
data = loader.load()

In [16]:
embeddings = OpenAIEmbeddings(model = "text-embedding-ada-002",chunk_size=1)

In [17]:
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding = embeddings
).from_loaders([loader])

In [18]:
llm = AzureChatOpenAI(deployment_name="chatgpt-gpt35-turbo",model_name="gpt-35-turbo",temperature=0.0)
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=index.vectorstore.as_retriever(), 
    verbose=True,
    chain_type_kwargs = {
        "document_separator": "<<<<>>>>>"
    }
)

### Coming up with test datapoints

In [20]:
data[2]

Document(lc_kwargs={'page_content': ": 2\nname: Infant and Toddler Girls' Coastal Chill Swimsuit, Two-Piece\ndescription: She'll love the bright colors, ruffles and exclusive whimsical prints of this toddler's two-piece swimsuit! Our four-way-stretch and chlorine-resistant fabric keeps its shape and resists snags. The UPF 50+ rated fabric provides the highest rated sun protection possible, blocking 98% of the sun's harmful rays. The crossover no-slip straps and fully lined bottom ensure a secure fit and maximum coverage. Machine wash and line dry for best results. Imported.", 'metadata': {'source': 'OutdoorClothingCatalog_1000.csv', 'row': 2}}, page_content=": 2\nname: Infant and Toddler Girls' Coastal Chill Swimsuit, Two-Piece\ndescription: She'll love the bright colors, ruffles and exclusive whimsical prints of this toddler's two-piece swimsuit! Our four-way-stretch and chlorine-resistant fabric keeps its shape and resists snags. The UPF 50+ rated fabric provides the highest rated su

In [21]:
data[3]

Document(lc_kwargs={'page_content': ": 3\nname: Refresh Swimwear, V-Neck Tankini Contrasts\ndescription: Whether you're going for a swim or heading out on an SUP, this watersport-ready tankini top is designed to move with you and stay comfortable. All while looking great in an eye-catching colorblock style. \n\nSize & Fit\nFitted: Sits close to the body.\n\nWhy We Love It\nNot only does this swimtop feel good to wear, its fabric is good for the earth too. In recycled nylon, with Lycra® spandex for the perfect amount of stretch. \n\nFabric & Care\nThe premium Italian-blend is breathable, quick drying and abrasion resistant. \nBody in 82% recycled nylon with 18% Lycra® spandex. \nLined in 90% recycled nylon with 10% Lycra® spandex. \nUPF 50+ rated – the highest rated sun protection possible. \nHandwash, line dry.\n\nAdditional Features\nLightweight racerback straps are easy to get on and off, and won't get in your way. \nFlattering V-neck silhouette. \nImported.\n\nSun Protection That Wo

### Hard-coded examples

In [22]:
examples = [
    {
        "query": "Do the Cozy Comfort Pullover Set\
        have side pockets?",
        "answer": "Yes"
    },
    {
        "query": "What collection is the Ultra-Lofty \
        850 Stretch Down Hooded Jacket from?",
        "answer": "The DownTek collection"
    }
]

### LLM-Generated examples

In [23]:
from langchain.evaluation.qa import QAGenerateChain


In [24]:
example_gen_chain = QAGenerateChain.from_llm(llm)

In [25]:
new_examples = example_gen_chain.apply_and_parse(
    [{"doc": t} for t in data[:5]]
)

In [26]:
new_examples[0]

{'query': "What is the weight of the Women's Campside Oxfords per pair?",
 'answer': "The Women's Campside Oxfords weigh approximately 1 lb. 1 oz. per pair."}

In [27]:
data[0]

Document(lc_kwargs={'page_content': ": 0\nname: Women's Campside Oxfords\ndescription: This ultracomfortable lace-to-toe Oxford boasts a super-soft canvas, thick cushioning, and quality construction for a broken-in feel from the first time you put them on. \n\nSize & Fit: Order regular shoe size. For half sizes not offered, order up to next whole size. \n\nSpecs: Approx. weight: 1 lb.1 oz. per pair. \n\nConstruction: Soft canvas material for a broken-in feel and look. Comfortable EVA innersole with Cleansport NXT® antimicrobial odor control. Vintage hunt, fish and camping motif on innersole. Moderate arch contour of innersole. EVA foam midsole for cushioning and support. Chain-tread-inspired molded rubber outsole with modified chain-tread pattern. Imported. \n\nQuestions? Please contact us for any inquiries.", 'metadata': {'source': 'OutdoorClothingCatalog_1000.csv', 'row': 0}}, page_content=": 0\nname: Women's Campside Oxfords\ndescription: This ultracomfortable lace-to-toe Oxford boa

### Combine examples

In [28]:
examples += new_examples

In [29]:
qa.run(examples[0]["query"])

Error in on_chain_start callback: 'name'



[1m> Finished chain.[0m


"I'm sorry, but I don't have any information about the Cozy Comfort Pullover Set. The context provided does not mention this product."

## Manual Evaluation

In [30]:
import langchain
langchain.debug = True

In [31]:
qa.run(examples[0]["query"])

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Do the Cozy Comfort Pullover Set        have side pockets?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:chain:StuffDocumentsChain > 3:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Do the Cozy Comfort Pullover Set        have side pockets?",
  "context": "Additional Features: Three-layer shell delivers waterproof protection. Brand new TEK O2 technology provides enhanced breathability. Interior gaiters keep out rain and snow. Full side zips for easy on/off over boots. Two zippered hand pockets. Thigh pocket. Imported.\n\n – Official Supplier to the U.S. Ski Team\nTHEIR WILL<<<<>>>>>: 5\nname: Smooth Comfort Check Shirt, Slightly Fitted\ndescription: Our men's slightly fitted check shirt is the perfect choice for you

"I'm sorry, but I don't have any information about the Cozy Comfort Pullover Set. The context provided does not mention this product."

In [32]:
# Turn off the debug mode
langchain.debug = False

## LLM assisted evaluation

In [33]:
predictions = qa.apply(examples)

Error in on_chain_start callback: 'name'
Error in on_chain_start callback: 'name'



[1m> Finished chain.[0m


Error in on_chain_start callback: 'name'



[1m> Finished chain.[0m


Error in on_chain_start callback: 'name'



[1m> Finished chain.[0m


Error in on_chain_start callback: 'name'



[1m> Finished chain.[0m


Error in on_chain_start callback: 'name'



[1m> Finished chain.[0m


Error in on_chain_start callback: 'name'



[1m> Finished chain.[0m

[1m> Finished chain.[0m


In [34]:
from langchain.evaluation.qa import QAEvalChain

In [35]:
eval_chain = QAEvalChain.from_llm(llm)

In [36]:
graded_outputs = eval_chain.evaluate(examples, predictions)

In [37]:
for i, eg in enumerate(examples):
    print(f"Example {i}:")
    print("Question: " + predictions[i]['query'])
    print("Real Answer: " + predictions[i]['answer'])
    print("Predicted Answer: " + predictions[i]['result'])
    print("Predicted Grade: " + graded_outputs[i]['text'])
    print()

Example 0:
Question: Do the Cozy Comfort Pullover Set        have side pockets?
Real Answer: Yes
Predicted Answer: I'm sorry, but I don't have any information about the Cozy Comfort Pullover Set. The context provided does not mention this product.
Predicted Grade: INCORRECT

Example 1:
Question: What collection is the Ultra-Lofty         850 Stretch Down Hooded Jacket from?
Real Answer: The DownTek collection
Predicted Answer: There is no information provided about the Ultra-Lofty 850 Stretch Down Hooded Jacket's collection.
Predicted Grade: INCORRECT

Example 2:
Question: What is the weight of the Women's Campside Oxfords per pair?
Real Answer: The Women's Campside Oxfords weigh approximately 1 lb. 1 oz. per pair.
Predicted Answer: The Women's Campside Oxfords weigh approximately 1 lb. 1 oz. per pair.
Predicted Grade: CORRECT

Example 3:
Question: What are the dimensions of the small and medium Recycled Waterhog Dog Mat?
Real Answer: The small Recycled Waterhog Dog Mat has dimensions 