In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [9]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI

In [16]:
file_path = "./PMSpeech.pdf"
loader = PyPDFLoader(file_path)
documents = loader.load()
documents[0]

Document(metadata={'producer': 'PDF Printer / www.bullzip.com / CP / Freeware Edition', 'creator': 'PyPDF', 'creationdate': '2017-01-04T21:40:05+05:30', 'moddate': '2017-01-04T21:40:05+05:30', 'title': "Microsoft Word - 104 ISC - PM's Speech.doc", 'author': 'Barkha', 'source': './PMSpeech.pdf', 'total_pages': 6, 'page': 0, 'page_label': '1'}, page_content="PM's Address at the Inauguration of the 104th Session of the Indian Science Congress, \nTirupati \nPMO India  Narendra Modi  Dr. Harsh Vardhan  Y.S. Chowdary  Ashutosh Sharma Iitk  \nDepartment of Science and Technology, Government of India  \nGovernor of Andhra Pradesh, Shri E. S. L. Narasimhan \nChief Minister of Andhra Pradesh, Shri N. Chandrababu Naidu \nUnion Minister for Science & Technology, and Earth Sciences, Dr. Harsh Vardhan \nUnion Minister of State for Science & Technology, and Earth Sciences, Shri Y. S. Chowdary \nGeneral President of the Indian Science Congress Association, Professor D. Narayana Rao \nVice Chancellor o

In [17]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

In [18]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()


In [19]:
from langchain_community.vectorstores import FAISS
vector = FAISS.from_documents(chunks, embeddings)

In [20]:
retriever = vector.as_retriever(search_kwargs={"k": 1})

In [27]:
llm = ChatOpenAI()

In [22]:
template = """Youe are an assistant for question-answering tasks.
Use the following pices of context to answer the question.
If you don't know the answer, Just say that you don't know.
Use two sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""

In [24]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

In [25]:
prompt = ChatPromptTemplate.from_template(template)

In [28]:
# Pipeline

chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser() 
)

In [29]:
chain.invoke("Who is the Chief Minister of Andra Pradesh?")

'The Chief Minister of Andhra Pradesh is N. Chandrababu Naidu.'

In [32]:
chain.invoke("Summary of the speech.")

"The speech addressed key challenges and opportunities in sectors such as clean water, energy, food, environment, climate, security, and healthcare, emphasizing the need to leverage disruptive technologies for growth. The speaker also highlighted the importance of assessing and preparing for these challenges and opportunities for the nation's progress."

In [31]:
chain.invoke("Venue of the place this speech arranged?")

'The speech was arranged at the panoramic campus of Sri Venkateswara University in the holy city of Tirupati.'

In [33]:
question = [
    "Who is the Chief Minister of Andra Pradesh?",
    "Summary of the speech.",
    "Venue of the place this speech arranged?"
]

In [41]:
ground_truth = [
    "Chief Minister of Andhra Pradesh, Shri N. Chandrababu Naidu",
    """In the last two sessions of the science congress, I presented before you several key challenges
and opportunities for the nation.
Some of these important challenges are in the key sectors of clean water & energy, food,
environment, climate, security, and healthcare.
We equally need to keep an eye on the rise of disruptive technologies and be prepared to
leverage them for growth. We need to clearly assess the challenges and opportunities for our
technology readiness and competitiveness.
I have been told that the Technology Vision 2035 document released in last year’s Science
Congress, is now developing into a detailed roadmap for twelve key technology sectors.
Further, NITI Aayog is evolving a holistic science and technology vision for the country.
One important area that needs to be addressed is the rapid global rise of Cyber-Physical
Systems. This has the potential to pose unprecedented challenges and stresses to our
demographic dividend. But we can turn it into a huge opportunity by research, training and
skilling in robotics, artificial intelligence, digital manufacturing, big data analysis, deep
learning, quantum communication and Internet-of-Things.
There is a need to develop and exploit these technologies in services and manufacturing
sectors; in agriculture, water, energy & traffic management; health, environment,
infrastructure and Geo Information Systems; security; financial systems and in combating
crime.
We need to develop an Inter-Ministerial National Mission in the Cyber-Physical Systems to
secure our future by creation of basic R&D infrastructure, manpower and skills. 

Our best science and technology institutions should further strengthen their basic research in
line with leading global standards. Translating this basic knowledge into innovations, startups and industry will help us achieve inclusive and sustainable growth.
SCOPUS database indicates that India now ranks sixth in the world with respect to scientific
publications, growing at a rate of about fourteen percent as against the world average growth
rate of about four percent. I am sure that our scientists will further meet the challenges of
enhanced quality of basic research, its technology translation and its societal connect.
By 2030 India will be among the top three countries in science and technology and will be
among the most attractive destinations for the best talent in the world. The wheels we set in
motion today will achieve this goal. """,
"PM's Address at the Inauguration of the 104th Session of the Indian Science Congress, I am happy to inaugurate this 104th session of The Indian Science Congress in the panoramic campus of Sri Venkateswara University. "
]

In [35]:
answer = []
content = []

In [36]:
for query in question:
    answer.append(chain.invoke(query))
    content.append([docs.page_content for docs in retriever.invoke(query)])

In [42]:
data = {
    "question": question,
    "ground_truth": ground_truth,
    "answer": answer,
    "contexts": content    
}

In [46]:
from datasets import Dataset
dataset = Dataset.from_dict(data)

In [48]:
dataset

Dataset({
    features: ['question', 'ground_truth', 'answer', 'contexts'],
    num_rows: 3
})

In [50]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision
)

In [52]:
result = evaluate( dataset=dataset,
        metrics = [
            context_precision,
            context_recall,
            faithfulness,
            answer_relevancy
        ],
        llm = llm,
        embeddings = embeddings
        )

Evaluating: 100%|██████████| 12/12 [00:10<00:00,  1.15it/s]


In [53]:
result

{'context_precision': 1.0000, 'context_recall': 0.9167, 'faithfulness': 0.8333, 'answer_relevancy': 0.9345}