In [1]:
pip install python-dotenv

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:

#make your own .env file and then use the following

import os
from dotenv import load_dotenv

load_dotenv(dotenv_path='my.env')


assert os.getenv("OPENAI_API_KEY"), "Set OPENAI_API_KEY in my.env file"

# Your code using the environment variables
langchain_tracing_v2 = os.getenv("LANGCHAIN_TRACING_V2")
langchain_endpoint = os.getenv("LANGCHAIN_ENDPOINT")
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")



**Collect and Load dataset**

In [3]:

import requests
from bs4 import BeautifulSoup

# URL of the webpage
#url = "https://myscale.com/blog/mastering-rag-predictive-analytics-step-by-step-guide/"
url = "https://www.newyorker.com/magazine/2006/08/28/manifold-destiny"
# Fetch the webpage content
response = requests.get(url)
web_content = response.content

# Parse the content with BeautifulSoup
soup = BeautifulSoup(web_content, 'html.parser')

# Extract the main text from the webpage
# This example assumes that the main content is within <article> tags, adjust as necessary
article = soup.find('article')
if article:
    text_content = article.get_text(separator='\n', strip=True)
else:
    text_content = "Article content not found."

# Save the extracted text to a document
with open("document.txt", "w", encoding='utf-8') as file:
    file.write(text_content)

print("Text content has been saved to document.txt")


Text content has been saved to document.txt


In [4]:
from langchain.document_loaders import TextLoader


loader = TextLoader("./document.txt")    
documents = loader.load()




Notice that seperator is important to chunk the string 

In [5]:
from langchain.text_splitter import CharacterTextSplitter

# Sample text
text = "This is a long text that we want to split into smaller chunks for processing. Each chunk will have a maximum number of characters."

# Initialize the CharacterTextSplitter
splitter = CharacterTextSplitter(
    chunk_size=50,  # Maximum number of characters in each chunk
    chunk_overlap=10,  # Number of overlapping characters between chunks
    separator=" "  # Split by spaces
)

# Split the text
chunks = splitter.split_text(text)

# Output the chunks
for chunk in chunks:
    print(chunk)
    print("=" * 50)


This is a long text that we want to split into
split into smaller chunks for processing. Each
Each chunk will have a maximum number of
number of characters.


In [6]:
 from langchain.text_splitter import CharacterTextSplitter
# text_splitter = CharacterTextSplitter(chunk_size = 500, chunk_overlap = 50)
# chunks = text_splitter.split_documents(documents[0].page_content)
# print(len(chunks))
# chunks
# Initialize the CharacterTextSplitter with desired chunk size and overlap
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50, separator="\n")

# Split the document into chunks
chunks = text_splitter.split_documents(documents)

# Output the number of chunks and the chunks themselves
#print("Number of chunks:", len(chunks))


Created a chunk of size 525, which is longer than the specified 500
Created a chunk of size 933, which is longer than the specified 500
Created a chunk of size 1018, which is longer than the specified 500
Created a chunk of size 1185, which is longer than the specified 500
Created a chunk of size 679, which is longer than the specified 500
Created a chunk of size 878, which is longer than the specified 500
Created a chunk of size 773, which is longer than the specified 500
Created a chunk of size 850, which is longer than the specified 500
Created a chunk of size 1019, which is longer than the specified 500
Created a chunk of size 507, which is longer than the specified 500
Created a chunk of size 1103, which is longer than the specified 500
Created a chunk of size 706, which is longer than the specified 500
Created a chunk of size 664, which is longer than the specified 500
Created a chunk of size 1157, which is longer than the specified 500
Created a chunk of size 905, which is longe

In [7]:
chunks[5]

Document(page_content='Grigory Perelman is indeed reclusive. He left his job as a researcher at the Steklov Institute of Mathematics, in St. Petersburg, last December; he has few friends; and he lives with his mother in an apartment on the outskirts of the city. Although he had never granted an interview before, he was cordial and frank when we visited him, in late June, shortly after Yau’s conference in Beijing, taking us on a long walking tour of the city. “I’m looking for some friends, and they don’t have to be mathematicians,” he said. The week before the conference, Perelman had spent hours discussing the Poincaré conjecture with Sir John M. Ball, the fifty-eight-year-old president of the International Mathematical Union, the discipline’s influential professional association. The meeting, which took place at a conference center in a stately mansion overlooking the Neva River, was highly unusual. At the end of May, a committee of nine prominent mathematicians had voted to award Per

In [8]:
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain

[0m

In [9]:
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


# Embed
vectorstore = Chroma.from_documents(documents=chunks, 
                                    embedding=OpenAIEmbeddings())

retriever = vectorstore.as_retriever(search_kwargs={"k": 5})


In [10]:
from langchain import PromptTemplate

template = """You are an assistant for question-answering tasks.
            Use the following pieces of retrieved context to answer the question.
            If you don't know the answer, just say that you don't know. Use three sentence 
            maximum and keep the answer concise. 
Question: {question}
Context: {context}"\n
"""

prompt = PromptTemplate(
     template = template,
     input_variables = ["context","question"]
)
print(prompt)

input_variables=['context', 'question'] template='You are an assistant for question-answering tasks.\n            Use the following pieces of retrieved context to answer the question.\n            If you don\'t know the answer, just say that you don\'t know. Use three sentence \n            maximum and keep the answer concise. \nQuestion: {question}\nContext: {context}"\n\n'


**Generate**

In [11]:
from langchain_core.output_parsers import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

In [12]:
model = ChatOpenAI()
output_parser = StrOutputParser()

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
     |prompt 
     | model 
     | output_parser
)
#chain.invoke({"topic": "Melbourne"})

In [13]:
pip install datasets


[0mNote: you may need to restart the kernel to use updated packages.


In [14]:

#from datasets import Dataset
from datasets import Dataset, Features, Sequence, Value


questions = ["What university did Yau work at?", "Who is Perelman?", "What is Poincare conjecture?"]
ground_truths = ['Yau is a Professor of mathematics at Harvard.','Perelman is a mathematician', "century-old conundrum about the characteristics of three-dimensional spheres,"]

answers = []
contexts= []

#Inference
for query in questions:
    answers.append(rag_chain.invoke(query))
    # for docs in retriever.get_relevant_documents(query):
    #     contexts.append([docs.page_content])
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(query)])
# To dict
data = {
    "question": questions,
    "answer": answers,
    "contexts":contexts,
    "ground_truth": ground_truths
    
}

#Convert dict to dataset 
#dataset 



  from .autonotebook import tqdm as notebook_tqdm
  warn_deprecated(


In [15]:
data

{'question': ['What university did Yau work at?',
  'Who is Perelman?',
  'What is Poincare conjecture?'],
 'answer': ['Yau worked as a professor of mathematics at Harvard University.',
  'Perelman is a mathematician known for his work on the Poincaré conjecture. He is reclusive, left his job at the Steklov Institute of Mathematics, and lives in St. Petersburg with his mother. Perelman became widely recognized for his work on geometric spaces and his approach to problem-solving in mathematics.',
  'The Poincaré conjecture was proposed by Henri Poincaré over a hundred years ago and remained unsolved for many years. It was finally proven in all dimensions except the third by 1982. The Clay Mathematics Institute named it one of the seven most important outstanding problems in mathematics in 2000.'],
 'contexts': [['Yau studied math at the Chinese University of Hong Kong, where he attracted the attention of Shiing-Shen Chern, the preëminent Chinese mathematician, who helped him win a schol

In [16]:
dataset = Dataset.from_dict(data)

In [17]:
data

{'question': ['What university did Yau work at?',
  'Who is Perelman?',
  'What is Poincare conjecture?'],
 'answer': ['Yau worked as a professor of mathematics at Harvard University.',
  'Perelman is a mathematician known for his work on the Poincaré conjecture. He is reclusive, left his job at the Steklov Institute of Mathematics, and lives in St. Petersburg with his mother. Perelman became widely recognized for his work on geometric spaces and his approach to problem-solving in mathematics.',
  'The Poincaré conjecture was proposed by Henri Poincaré over a hundred years ago and remained unsolved for many years. It was finally proven in all dimensions except the third by 1982. The Clay Mathematics Institute named it one of the seven most important outstanding problems in mathematics in 2000.'],
 'contexts': [['Yau studied math at the Chinese University of Hong Kong, where he attracted the attention of Shiing-Shen Chern, the preëminent Chinese mathematician, who helped him win a schol

In [18]:
dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truth'],
    num_rows: 3
})

In [19]:
pip install ragas

[0mNote: you may need to restart the kernel to use updated packages.


In [20]:
from ragas import evaluate 
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision
)

results = evaluate(
     dataset = dataset,
    metrics = [
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy
    ]
    #llm = 
)

Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:11<00:00,  1.03it/s]


In [21]:
results

{'context_precision': 0.5278, 'context_recall': 1.0000, 'faithfulness': 1.0000, 'answer_relevancy': 0.9107}

In [22]:
results.to_pandas()

Unnamed: 0,question,answer,contexts,ground_truth,context_precision,context_recall,faithfulness,answer_relevancy
0,What university did Yau work at?,Yau worked as a professor of mathematics at Ha...,[Yau studied math at the Chinese University of...,Yau is a Professor of mathematics at Harvard.,0.25,1.0,1.0,0.921329
1,Who is Perelman?,Perelman is a mathematician known for his work...,"[By the end of his first year at Berkeley, Per...",Perelman is a mathematician,1.0,1.0,1.0,0.88504
2,What is Poincare conjecture?,The Poincaré conjecture was proposed by Henri ...,[Proofs of the Poincaré have been announced ne...,century-old conundrum about the characteristic...,0.333333,1.0,1.0,0.925673
