# Loading the Data

In [14]:
import pandas as pd
from openai import OpenAI
import json
from tqdm.auto import tqdm

In [2]:
# Load the per-model result files; each CSV becomes its own DataFrame.
df_gpt35, df_gpt4o, df_gpt4o_mini = map(
    pd.read_csv,
    [
        'data/results-gpt35.csv',
        'data/results-gpt4o.csv',
        'data/results-gpt4o-mini.csv',
    ],
)

In [3]:
# Convert every frame into a list of row dictionaries (one dict per CSV row).
results_gpt35, results_gpt4o, results_gpt4o_mini = (
    frame.to_dict(orient='records')
    for frame in (df_gpt35, df_gpt4o, df_gpt4o_mini)
)

# Functions

In [10]:
# Shared API client used by `llm` below. No key is passed explicitly, so the
# SDK is expected to find credentials in the environment (e.g. OPENAI_API_KEY)
# -- TODO confirm for your setup.
client = OpenAI()

def llm(prompt, model='gpt-4o'):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the `content` of the one user message.
        model: Model name forwarded to the chat completions call.

    Returns:
        The `message.content` of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Prompts

In [4]:
# Judge prompt 1: rates the generated answer against the original (reference)
# answer. Placeholders filled via str.format: {answer_orig}, {question},
# {answer_llm}. The doubled braces {{ }} come out as literal braces, so the
# JSON skeleton survives formatting.
prompt1_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: {answer_orig}
Generated Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the original
answer and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

# Judge prompt 2: rates the generated answer against the question only (no
# reference answer). Placeholders filled via str.format: {question},
# {answer_llm}. The doubled braces {{ }} come out as literal braces.
prompt2_template = """
You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [5]:
# Evaluate on a 150-row subset of the gpt-4o-mini results; the fixed
# random_state makes the same rows come back on every run.
df_sample = df_gpt4o_mini.sample(n=150, random_state=1)

In [6]:
# One dict per sampled row, so a row can be unpacked into a prompt template
# with `.format(**row)`.
samples = df_sample.to_dict(orient='records')

In [7]:
# Take the first sampled row as a worked example and display it.
record = samples[0]
record

{'answer_llm': 'The syntax for using `precision_recall_fscore_support` in Python is as follows:\r\n\r\n```python\r\nfrom sklearn.metrics import precision_recall_fscore_support\r\nprecision, recall, fscore, support = precision_recall_fscore_support(y_val, y_val_pred, zero_division=0)\r\n```',
 'answer_orig': 'Scikit-learn offers another way: precision_recall_fscore_support\r\nExample:\r\nfrom sklearn.metrics import precision_recall_fscore_support\r\nprecision, recall, fscore, support = precision_recall_fscore_support(y_val, y_val_pred, zero_division=0)\r\n(Gopakumar Gopinathan)',
 'document': '403bbdd8',
 'question': 'What is the syntax for using precision_recall_fscore_support in Python?',
 'course': 'machine-learning-zoomcamp'}

In [8]:
# Fill prompt 1 with the example record and print the resulting text.
prompt = prompt1_template.format(**record)
print(prompt)

You are an expert evaluator for a Retrieval-Augmented Generation (RAG) system.
Your task is to analyze the relevance of the generated answer compared to the original answer provided.
Based on the relevance and similarity of the generated answer to the original answer, you will classify
it as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Original Answer: Scikit-learn offers another way: precision_recall_fscore_support
Example:
from sklearn.metrics import precision_recall_fscore_support
precision, recall, fscore, support = precision_recall_fscore_support(y_val, y_val_pred, zero_division=0)
(Gopakumar Gopinathan)
Generated Question: What is the syntax for using precision_recall_fscore_support in Python?
Generated Answer: The syntax for using `precision_recall_fscore_support` in Python is as follows:

```python
from sklearn.metrics import precision_recall_fscore_support
precision, recall, fscore, support = precision_recall_fscore_support(y_val, y_val_

In [11]:

# Single-record trial run of the judge before looping over all samples.
answer = llm(prompt, model='gpt-4o-mini')

In [13]:
# The prompt asks for plain JSON, so the reply is parsed as-is.
json.loads(answer)

{'Relevance': 'RELEVANT',
 'Explanation': 'The generated answer directly addresses the syntax for using precision_recall_fscore_support in Python, which matches the content of the original answer. Both answers include the necessary code snippet and context, making them highly aligned.'}

# Prompt 1

In [15]:
# Judge every sampled record with prompt 1 (generated vs. original answer).
# Replies are kept as raw strings here and parsed in a later cell.
evaluations = []

# The loop uses its own names (`sample`, `judge_prompt`) so it does not
# overwrite the notebook-level `record` / `prompt` set by the single-record
# example cells. Results are appended one at a time so that a failed request
# still leaves the replies collected so far in `evaluations`.
for sample in tqdm(samples):
    judge_prompt = prompt1_template.format(**sample)
    evaluations.append(llm(judge_prompt, model='gpt-4o-mini'))

  0%|          | 0/150 [00:00<?, ?it/s]

In [16]:
# Parse each raw judge reply (prompt 1) into a dict.
json_evaluations = []

for i, str_eval in enumerate(evaluations):
    try:
        json_eval = json.loads(str_eval)
    except json.JSONDecodeError as err:
        # Same exception type as before, but the message now names the index
        # of the offending reply so it can be inspected via `evaluations[i]`.
        raise json.JSONDecodeError(
            f'evaluation {i} is not valid JSON ({err.msg})', err.doc, err.pos
        ) from err
    json_evaluations.append(json_eval)

In [17]:
# Tabulate the parsed prompt-1 verdicts (one row per judged sample).
df_evaluations = pd.DataFrame(json_evaluations)

In [18]:
# How many samples received each relevance label under prompt 1.
df_evaluations.Relevance.value_counts()

Relevance
RELEVANT           126
PARTLY_RELEVANT     15
NON_RELEVANT         9
Name: count, dtype: int64

In [22]:
# Show the rows judged NON_RELEVANT as dicts so the explanations are not
# shortened by the DataFrame display.
df_evaluations[df_evaluations.Relevance == 'NON_RELEVANT'].to_dict(orient='records')

[{'Relevance': 'NON_RELEVANT',
  'Explanation': 'The generated answer addresses a different topic related to Scikit-Learn version conflicts, which is not mentioned in the original answer that focuses on a specific error related to Python version in a Docker build process. Thus, the two answers are not relevant to each other.'},
 {'Relevance': 'NON_RELEVANT',
  'Explanation': "The generated answer addresses a completely unrelated issue concerning the protobuf package, which does not correlate with the original error message regarding the use of 'eb local' with Docker platforms. The original answer provides specific steps to resolve the NotSupportedError related to Docker configurations, while the generated answer is focused on a different context altogether."},
 {'Relevance': 'NON_RELEVANT',
  'Explanation': 'The generated answer incorrectly states that there is no alternative command mentioned in the original answer. The original answer explicitly provides the alternative command to us

In [21]:
# Inspect one raw sample record by position (index 4).
samples[4]

 'answer_orig': 'When running docker build -t dino-dragon-model it returns the above error\r\nThe most common source of this error in this week is because Alex video shows a version of the wheel with python 8, we need to find a wheel with the version that we are working on. In this case python 9. Another common error is to copy the link, this will also produce the same error, we need to download the raw format:\r\nhttps://github.com/alexeygrigorev/tflite-aws-lambda/raw/main/tflite/tflite_runtime-2.7.0-cp39-cp39-linux_x86_64.whl\r\nPastor Soto',
 'document': '42c09143',
 'question': "What might be the cause of the pip version error in this week's serverless deep learning section?",
 'course': 'machine-learning-zoomcamp'}

# Prompt 2

In [23]:
# Judge every sampled record with prompt 2 (generated answer vs. question).
# Replies are kept as raw strings here and parsed in a later cell.
evaluations_2 = []

# The loop uses its own names (`sample`, `judge_prompt`) so it does not
# overwrite the notebook-level `record` / `prompt` set by the single-record
# example cells. Results are appended one at a time so that a failed request
# still leaves the replies collected so far in `evaluations_2`.
for sample in tqdm(samples):
    judge_prompt = prompt2_template.format(**sample)
    evaluations_2.append(llm(judge_prompt, model='gpt-4o-mini'))

  0%|          | 0/150 [00:00<?, ?it/s]

In [24]:
# Parse each raw judge reply (prompt 2) into a dict.
json_evaluations_2 = []

for i, str_eval in enumerate(evaluations_2):
    try:
        json_eval = json.loads(str_eval)
    except json.JSONDecodeError as err:
        # Same exception type as before, but the message now names the index
        # of the offending reply so it can be inspected via `evaluations_2[i]`.
        raise json.JSONDecodeError(
            f'evaluation {i} is not valid JSON ({err.msg})', err.doc, err.pos
        ) from err
    json_evaluations_2.append(json_eval)

In [25]:
# Tabulate the parsed prompt-2 verdicts (one row per judged sample).
df_evaluations_2 = pd.DataFrame(json_evaluations_2)

In [26]:
# Rows judged NON_RELEVANT under prompt 2; the row labels are the positions
# of the corresponding records in `samples`.
df_evaluations_2[df_evaluations_2.Relevance == 'NON_RELEVANT']

Unnamed: 0,Relevance,Explanation
45,NON_RELEVANT,The generated answer does not address the ques...
102,NON_RELEVANT,The generated answer does not directly address...
139,NON_RELEVANT,The generated answer does not address the ques...


In [27]:
# Look at the raw record behind row 45, one of the NON_RELEVANT verdicts above.
samples[45]

{'answer_llm': 'The provided context does not include specific commands to start the Docker daemon on Linux. Therefore, I cannot provide an answer based solely on the facts from the context.',
 'answer_orig': 'Working on getting Docker installed - when I try running hello-world I am getting the error.\r\nDocker: Cannot connect to the docker daemon at unix:///var/run/docker.sock. Is the Docker daemon running ?\r\nSolution description\r\nIf you’re getting this error on WSL, re-install your docker: remove the docker installation from WSL and install Docker Desktop on your host machine (Windows).\r\nOn Linux, start the docker daemon with either of these commands:\r\nsudo dockerd\r\nsudo service docker start\r\nAdded by Ugochukwu Onyebuchi',
 'document': '4b2a3181',
 'question': 'What commands should I use to start the docker daemon on Linux?',
 'course': 'machine-learning-zoomcamp'}

# Saving the Data

In [28]:
# Persist both sets of verdicts without the index column.
# Prompt-1 verdicts (generated answer judged against the original answer):
df_evaluations.to_csv('data/evaluations-aqa.csv', index=False)
# Prompt-2 verdicts (generated answer judged against the question only):
df_evaluations_2.to_csv('data/evaluations-qa.csv', index=False)