In [109]:
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_openai import OpenAI
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
import json
import os
import os
import json
import json
from pathlib import Path
from pprint import pprint



In [110]:
from langchain_community.document_loaders import JSONLoader

os.chdir(r"E:\ML and Data Science work\Challenge\datawars-llm-challenges")
print("Current Working Directory:", os.getcwd())
file_path = os.path.join("chunks_output", "chunks.json")

loader = JSONLoader(
    file_path=file_path,
    jq_schema='.[]',
    text_content=False)

docs = loader.load()

Current Working Directory: E:\ML and Data Science work\Challenge\datawars-llm-challenges


In [111]:

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

embedding = OpenAIEmbeddings()

In [112]:
vectorestore = Chroma.from_documents(docs, embedding)

# Step 1: Retrieve Relevant Documents

In [113]:
base_retriever = vectorestore.as_retriever(search_kwargs={"k": 10})

In [114]:
response = base_retriever.invoke("How to group a datafram in panda?")

print("Retrieve Relevant Documents")
for result in response:
   
    print("Content:", result.page_content)
    print()


Retrieve Relevant Documents
Content: {"title": "Python Pandas Tutorial (Part 8): Grouping and Aggregating - Analyzing and Exploring Your Data", "text": ". So let's see what this would look like. So I can say and I'll just call this data frame Python df. And now I'm gonna create a data frame where we concat those 2 series into 1. So I can say pd.concat, and now I'm gonna pass in a list of the series that we want to concatenate. So I want this to be our country respondents and I also want to add in this country uses Python series. And now, we also want to set axis equal to columns because by default, it's going to try to concatenate these on row, but we wanna match up the indexes here so that it concats it, that way instead. So we wanna say axis is equal to columns. And then finally, I'm also gonna put sort is equal to false"}

Content: {"title": "Python Pandas Tutorial (Part 8): Grouping and Aggregating - Analyzing and Exploring Your Data", "text": ". So let's see what this would look l

# Step 2: Build the QA Chain

In [115]:
from operator import itemgetter
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

primary_qa_llm = ChatOpenAI(model_name="gpt-4", temperature=0)

template = """Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':

### CONTEXT
{context}

### QUESTION
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

retrieval_augmented_qa_chain = (
    {"context": itemgetter("question") | base_retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | primary_qa_llm, "context": itemgetter("context")}
)

question = "How to group a dataframe in pandas?"

result = retrieval_augmented_qa_chain.invoke({"question": question})

print("Generated Response:", result)


Generated Response: {'response': AIMessage(content='To group a dataframe in pandas, you can use the groupby function. You can pass in a list of columns that you want to group on to the groupby function. For example, if you want to group all of the results by country, you can simply say, df.groupby, and then pass in a single column for country. This will return a data frame group by object.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 78, 'prompt_tokens': 2657, 'total_tokens': 2735}, 'model_name': 'gpt-4-0613', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-1b1d32e3-9f58-458a-bd4d-c05e661fb7e7-0', usage_metadata={'input_tokens': 2657, 'output_tokens': 78, 'total_tokens': 2735}), 'context': [Document(metadata={'seq_num': 297, 'source': 'E:\\ML and Data Science work\\Challenge\\datawars-llm-challenges\\chunks_output\\chunks.json'}, page_content='{"title": "Python Pandas Tutorial (Part 8): Grouping and Aggre

# Step 3: Generate Questions


In [116]:
from tqdm import tqdm
from langchain.output_parsers import ResponseSchema, StructuredOutputParser

question_schema = ResponseSchema(
    name="question",
    description="A question about the context."
)

question_response_schemas = [question_schema]
question_output_parser = StructuredOutputParser.from_response_schemas(question_response_schemas)
format_instructions = question_output_parser.get_format_instructions()

qa_template = """
You are a University Professor creating a test from pandas and matplotlip tutorials for advanced students. For each context, create a question that is specific to the context. Avoid creating generic or general questions.

Format the output as JSON with the following key:
question

Context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)
bare_template = ChatPromptTemplate.from_template(template="{content}")
question_generation_llm = ChatOpenAI(model="gpt-4")

question_generation_chain = bare_template | question_generation_llm


qac_triples = []

for text in tqdm(response):
    messages = prompt_template.format_messages(
        context=text.page_content,
        format_instructions=format_instructions
    )
    response = question_generation_chain.invoke({"content": messages})
    try:
        output_dict = question_output_parser.parse(response.content)
        print("Generated Question:", output_dict)
    except Exception as e:
        print(f"Error generating question: {e}")
        continue
    output_dict["context"] = text
    qac_triples.append(output_dict)
    print(messages)


 10%|█         | 1/10 [00:03<00:32,  3.58s/it]

Generated Question: {'question': "In the context of Python Pandas, describe how to concatenate two series into one data frame and explain the role of setting the axis equal to columns. Also, clarify the usage of 'sort is equal to false' in the process."}
[HumanMessage(content='\nYou are a University Professor creating a test from pandas and matplotlip tutorials for advanced students. For each context, create a question that is specific to the context. Avoid creating generic or general questions.\n\nFormat the output as JSON with the following key:\nquestion\n\nContext: {"title": "Python Pandas Tutorial (Part 8): Grouping and Aggregating - Analyzing and Exploring Your Data", "text": ". So let\'s see what this would look like. So I can say and I\'ll just call this data frame Python df. And now I\'m gonna create a data frame where we concat those 2 series into 1. So I can say pd.concat, and now I\'m gonna pass in a list of the series that we want to concatenate. So I want this to be our c

 20%|██        | 2/10 [00:06<00:27,  3.41s/it]

Generated Question: {'question': 'Given the tutorial on grouping and aggregating in Python Pandas, how would you create a DataFrame where two series are concatenated into one? What parameters would you pass in the pd.concat function to ensure that the series are concatenated by columns and not by rows, and to prevent sorting?'}
[HumanMessage(content='\nYou are a University Professor creating a test from pandas and matplotlip tutorials for advanced students. For each context, create a question that is specific to the context. Avoid creating generic or general questions.\n\nFormat the output as JSON with the following key:\nquestion\n\nContext: {"title": "Python Pandas Tutorial (Part 8): Grouping and Aggregating - Analyzing and Exploring Your Data", "text": ". So let\'s see what this would look like. So I can say and I\'ll just call this data frame Python df. And now I\'m gonna create a data frame where we concat those 2 series into 1. So I can say pd.concat, and now I\'m gonna pass in a

 30%|███       | 3/10 [00:10<00:24,  3.45s/it]

Generated Question: {'question': "In the context of the tutorial, you are given two series: 'country respondents' and 'country uses Python'. How would you concatenate these two series into one data frame using pandas, considering that you want to match up the indexes and not sort the result?"}
[HumanMessage(content='\nYou are a University Professor creating a test from pandas and matplotlip tutorials for advanced students. For each context, create a question that is specific to the context. Avoid creating generic or general questions.\n\nFormat the output as JSON with the following key:\nquestion\n\nContext: {"title": "Python Pandas Tutorial (Part 8): Grouping and Aggregating - Analyzing and Exploring Your Data", "text": ". So let\'s see what this would look like. So I can say and I\'ll just call this data frame Python df. And now I\'m gonna create a data frame where we concat those 2 series into 1. So I can say pd.concat, and now I\'m gonna pass in a list of the series that we want to

 40%|████      | 4/10 [00:14<00:23,  3.89s/it]

Generated Question: {'question': "In the context of Python Pandas, as discussed in the tutorial 'Part 8: Grouping and Aggregating - Analyzing and Exploring Your Data', explain the function of 'pd.concat' when used with the parameters 'axis = columns' and 'sort = false'. What would be the expected output if you were to concatenate two series, 'country respondents' and 'country uses Python series', using these parameters?"}
[HumanMessage(content='\nYou are a University Professor creating a test from pandas and matplotlip tutorials for advanced students. For each context, create a question that is specific to the context. Avoid creating generic or general questions.\n\nFormat the output as JSON with the following key:\nquestion\n\nContext: {"title": "Python Pandas Tutorial (Part 8): Grouping and Aggregating - Analyzing and Exploring Your Data", "text": ". So let\'s see what this would look like. So I can say and I\'ll just call this data frame Python df. And now I\'m gonna create a data f

 50%|█████     | 5/10 [00:21<00:25,  5.02s/it]

Generated Question: {'question': "In the context of the tutorial 'Python Pandas Tutorial (Part 8): Grouping and Aggregating - Analyzing and Exploring Your Data', what is the purpose of the df.groupby function and how is it used when analyzing data by country? Provide a detailed explanation with a code example."}
[HumanMessage(content='\nYou are a University Professor creating a test from pandas and matplotlip tutorials for advanced students. For each context, create a question that is specific to the context. Avoid creating generic or general questions.\n\nFormat the output as JSON with the following key:\nquestion\n\nContext: {"title": "Python Pandas Tutorial (Part 8): Grouping and Aggregating - Analyzing and Exploring Your Data", "text": ". So first, let\'s look at splitting the object. Now, in this case, we want to group all of the results by country. So to do this, we can simply say, df.groupby, and then we will pass in, this is going to be a list of columns that we want to group o

 60%|██████    | 6/10 [00:25<00:17,  4.36s/it]

Generated Question: {'question': "Explain the steps and process to use the 'groupby' function in pandas to group data by the 'country' column. What is the 'data frame group by' object and how can it be utilized?"}
[HumanMessage(content='\nYou are a University Professor creating a test from pandas and matplotlip tutorials for advanced students. For each context, create a question that is specific to the context. Avoid creating generic or general questions.\n\nFormat the output as JSON with the following key:\nquestion\n\nContext: {"title": "Python Pandas Tutorial (Part 8): Grouping and Aggregating - Analyzing and Exploring Your Data", "text": ". Okay. So now let\'s look at how to use the group by function on our country column. So first we\'re going to split the object, and then we\'re going to apply a function, and then it will combine those results. So first, let\'s look at splitting the object. Now, in this case, we want to group all of the results by country. So to do this, we can s

 70%|███████   | 7/10 [00:28<00:12,  4.11s/it]

Generated Question: {'question': "Explain the process of using the groupby function on a DataFrame in Python Pandas. How does it work when applied on a single column such as 'country'? What is a DataFrame GroupBy object and what operations can be performed on it?"}
[HumanMessage(content='\nYou are a University Professor creating a test from pandas and matplotlip tutorials for advanced students. For each context, create a question that is specific to the context. Avoid creating generic or general questions.\n\nFormat the output as JSON with the following key:\nquestion\n\nContext: {"title": "Python Pandas Tutorial (Part 8): Grouping and Aggregating - Analyzing and Exploring Your Data", "text": ". Okay. So now let\'s look at how to use the group by function on our country column. So first we\'re going to split the object, and then we\'re going to apply a function, and then it will combine those results. So first, let\'s look at splitting the object. Now, in this case, we want to group al

 80%|████████  | 8/10 [00:32<00:08,  4.12s/it]

Generated Question: {'question': "What is the process and syntax for grouping data based on the 'country' column in a DataFrame using Python Pandas, as explained in the tutorial Part 8: Grouping and Aggregating - Analyzing and Exploring Your Data?"}
[HumanMessage(content='\nYou are a University Professor creating a test from pandas and matplotlip tutorials for advanced students. For each context, create a question that is specific to the context. Avoid creating generic or general questions.\n\nFormat the output as JSON with the following key:\nquestion\n\nContext: {"title": "Python Pandas Tutorial (Part 8): Grouping and Aggregating - Analyzing and Exploring Your Data", "text": ". Okay. So now let\'s look at how to use the group by function on our country column. So first we\'re going to split the object, and then we\'re going to apply a function, and then it will combine those results. So first, let\'s look at splitting the object. Now, in this case, we want to group all of the results

 90%|█████████ | 9/10 [00:35<00:03,  3.76s/it]

Generated Question: {'question': "In the given Python Pandas tutorial, the group by function is used on the 'country' column. Can you explain the sequence of actions performed after applying the group by function, and also describe the result obtained after its execution?"}
[HumanMessage(content='\nYou are a University Professor creating a test from pandas and matplotlip tutorials for advanced students. For each context, create a question that is specific to the context. Avoid creating generic or general questions.\n\nFormat the output as JSON with the following key:\nquestion\n\nContext: {"title": "Python Pandas Tutorial (Part 8): Grouping and Aggregating - Analyzing and Exploring Your Data", "text": ". Okay. So now let\'s look at how to use the group by function on our country column. So first we\'re going to split the object, and then we\'re going to apply a function, and then it will combine those results. So first, let\'s look at splitting the object. Now, in this case, we want to

100%|██████████| 10/10 [00:40<00:00,  4.01s/it]

Generated Question: {'question': 'In the context of the Python Pandas tutorial part 8, how would you use the groupby function to group all the survey results by country name, specifically for India? Also, explain how this process is similar to running a filter on the original dataframe?'}
[HumanMessage(content='\nYou are a University Professor creating a test from pandas and matplotlip tutorials for advanced students. For each context, create a question that is specific to the context. Avoid creating generic or general questions.\n\nFormat the output as JSON with the following key:\nquestion\n\nContext: {"title": "Python Pandas Tutorial (Part 8): Grouping and Aggregating - Analyzing and Exploring Your Data", "text": ". And if I look at the group for India, so if I instead change United States to India here and grab that group, if we look at the country here, then these are all the survey results for people who said that they were from India. So that\'s what our data frame group by obje




# Step 4: Generate Answers

In [117]:
answer_schema = ResponseSchema(
    name="answer",
    description="An answer to the question."
)

answer_response_schemas = [answer_schema]
answer_output_parser = StructuredOutputParser.from_response_schemas(answer_response_schemas)
format_instructions = answer_output_parser.get_format_instructions()

qa_template = """\
You are a University Professor creating a test for advanced students. For each question and context, create an answer.

Format the output as JSON with the following key:
answer

Question: {question}
Context: {context}
"""

prompt_template = ChatPromptTemplate.from_template(template=qa_template)

answer_generation_llm = ChatOpenAI(model="gpt-4", temperature=0)

answer_generation_chain = bare_template | answer_generation_llm

for triple in tqdm(qac_triples):
    messages = prompt_template.format_messages(
        context=triple["context"].page_content,
        question=triple["question"],
        format_instructions=format_instructions
    )
    response = answer_generation_chain.invoke({"content": messages})
    try:
        output_dict = answer_output_parser.parse(response.content)
        triple["answer"] = output_dict["answer"]
    except Exception as e:
        print(f"Error generating answer: {e}")


100%|██████████| 10/10 [01:44<00:00, 10.43s/it]


# Step 5: Preparing Ground Truth Dataset


In [118]:
import pandas as pd

ground_truth_qac_set = pd.DataFrame(qac_triples)
ground_truth_qac_set["context"] = ground_truth_qac_set["context"].map(lambda x: str(x.page_content))
ground_truth_qac_set = ground_truth_qac_set.rename(columns={"answer": "ground_truth"})

print(ground_truth_qac_set.head())

ground_truth_qac_set.to_csv("groundtruth_eval_dataset.csv")


                                            question  \
0  In the context of Python Pandas, describe how ...   
1  Given the tutorial on grouping and aggregating...   
2  In the context of the tutorial, you are given ...   
3  In the context of Python Pandas, as discussed ...   
4  In the context of the tutorial 'Python Pandas ...   

                                             context  \
0  {"title": "Python Pandas Tutorial (Part 8): Gr...   
1  {"title": "Python Pandas Tutorial (Part 8): Gr...   
2  {"title": "Python Pandas Tutorial (Part 8): Gr...   
3  {"title": "Python Pandas Tutorial (Part 8): Gr...   
4  {"title": "Python Pandas Tutorial (Part 8): Gr...   

                                        ground_truth  
0  In Python Pandas, concatenating two series int...  
1  To create a DataFrame where two series are con...  
2  To concatenate the two series into one data fr...  
3  In the context of Python Pandas, the 'pd.conca...  
4  The df.groupby function in pandas is used to s..

# Step 6: Create RAG Dataset for Evaluation

In [119]:
from tqdm import tqdm
from datasets import Dataset

def create_ragas_dataset(rag_pipeline, eval_dataset):
    rag_dataset = []
    for _, row in tqdm(eval_dataset.iterrows(), total=len(eval_dataset)):
        answer = rag_pipeline.invoke({"question": row["question"]})
        rag_dataset.append(
            {
                "question": row["question"],
                "answer": answer["response"].content,
                "contexts": [context.page_content for context in answer["context"]],
                "ground_truth": str(row["ground_truth"]),
            }
        )
    rag_df = pd.DataFrame(rag_dataset)
    rag_eval_dataset = Dataset.from_pandas(rag_df)
    return rag_eval_dataset

basic_qa_ragas_dataset = create_ragas_dataset(retrieval_augmented_qa_chain, ground_truth_qac_set)
basic_qa_ragas_dataset.to_csv("basic_qa_ragas_dataset.csv")


100%|██████████| 10/10 [01:45<00:00, 10.53s/it]


Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

111247

# Step 7: Evaluate RAG Dataset

In [120]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_correctness,
    answer_similarity
)
from ragas import evaluate

def evaluate_ragas_dataset(ragas_dataset):
    result = evaluate(
        ragas_dataset,
        metrics=[
            context_precision,
            faithfulness,
            answer_relevancy,
            context_recall,
            answer_correctness,
            answer_similarity,
        ],
    )
    return result

basic_qa_result = evaluate_ragas_dataset(basic_qa_ragas_dataset)
print("Evaluation Result:", basic_qa_result)


Evaluating:   0%|          | 0/60 [00:00<?, ?it/s]

Evaluation Result: {'context_precision': 0.9977, 'faithfulness': 0.9449, 'answer_relevancy': 0.8983, 'context_recall': 0.7999, 'answer_correctness': 0.7227, 'answer_similarity': 0.9715}


•	**Context Precision:** Measures how much of the retrieved information is actually relevant to the question.

•	**Context Recall:** Measures how much of the relevant information is retrieved.

•	**Faithfulness:** Measures how factually accurate the generated answer is based on the retrieved information.

•	**Answer Relevancy:** This represents how relevant the answers are in relation to the questions.

•   **Answer Similarity:** This measures how similar the generated answers are to the expected or ground truth answers.

•   **Answer Correctness:** This indicates the correctness of the generated answers.