# Configuration and Imports

NOTE: Execute the JavaScript code to check for errors.

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import glob
import json
import datasets
from langchain.chat_models.base import BaseChatModel

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate
)
import random
from _voyager.control_primitives_context import load_control_primitives_context
from _voyager.prompts import load_prompt
from langchain.schema import SystemMessage
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from transformers import AutoTokenizer
from langchain_core.documents import Document
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
import os
import re
from langchain_community.chat_models.openai import ChatOpenAI
from shared import file_utils as U
import plotly.express as px
pd.set_option("display.max_colwidth", None)



# Read the dataset 

In [6]:
def unfold_camel(camel_str):
    """Turn a camelCase identifier into lowercase, space-separated words.

    A space is inserted in front of every ASCII uppercase letter, except one
    sitting at the very start of the string; the result is then lowercased.
    """
    pieces = []
    for position, char in enumerate(camel_str):
        # The first character never gets a leading space.
        if position > 0 and "A" <= char <= "Z":
            pieces.append(" ")
        pieces.append(char)
    return "".join(pieces).lower()


def get_question(task):
    """Build the natural-language evaluation question for a skill name.

    Underscores become spaces, the substrings ' ores' / ' ore' and any periods
    are removed, surrounding whitespace is stripped, and the remainder is
    unfolded from camelCase with ``unfold_camel``.

    Args:
        task: name of the program/skill.

    Returns:
        A string of the form "How to <task words> in Minecraft?".
    """
    cleaned = task.replace('_', ' ')
    # Strip the plural first: removing ' ore' before ' ores' leaves a stray 's'
    # behind ("iron ores" -> "irons") and makes the plural rule unreachable.
    cleaned = cleaned.replace(' ores', '').replace(' ore', '')
    cleaned = cleaned.replace('.', '').strip()
    return f"How to {unfold_camel(cleaned)} in Minecraft?"

def remove_duplicates(dataset):
    """Return the items of ``dataset`` with a distinct 'question', keeping the first of each."""
    first_by_question = {}
    for entry in dataset:
        # setdefault only stores the entry when the question is new, and dicts
        # keep insertion order, so the first occurrence wins.
        first_by_question.setdefault(entry['question'], entry)
    return list(first_by_question.values())

def read_json_files_recursively(root_dir):
    """Load every ``*.json`` file found under ``root_dir``.

    Args:
        root_dir: directory that is walked recursively.

    Returns:
        A list of ``(filepath, parsed_json)`` tuples. Directories and file
        names are visited in sorted order, so the result is deterministic.
    """
    all_data = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # os.walk yields entries in arbitrary filesystem order. Sorting the
        # directory list in place (honoured by the top-down walk) and the file
        # names makes the traversal reproducible across machines.
        dirnames.sort()
        for filename in sorted(filenames):
            if not filename.endswith('.json'):
                continue
            filepath = os.path.join(dirpath, filename)
            # Read as UTF-8 explicitly instead of relying on the platform default.
            with open(filepath, 'r', encoding='utf-8') as file:
                all_data.append((filepath, json.load(file)))
    return all_data

json_files = read_json_files_recursively("questllama/skill_library")

# One evaluation sample per stored program: the question is derived from the
# program name, the reference answer is the program's code.
eval_dataset = []
for skill_file_path, programs in json_files:
    # Source path is built as "<directory of the JSON file>/code/<program>.js".
    code_dir = "/".join(skill_file_path.split("/")[:-1]) + "/code"
    for program_name, info in programs.items():  # info contains the code and description of a program
        eval_dataset.append(
            {
                'question': get_question(program_name),
                'answer': info['code'],
                'description': info['description'],
                'source_doc': f'{code_dir}/{program_name}.js',
            }
        )


## Read the dataset
# Each entry returned by the helper is indexed below as (source, text).
skill_entries = U.read_skill_library("skill_library", full_path=True)
ds = []
for entry in skill_entries:
    ds.append({'text': entry[1], 'source': entry[0]})

# Wrap every skill in a Document, keeping its origin in the metadata.
RAW_KNOWLEDGE_BASE = []
for record in tqdm(ds):
    RAW_KNOWLEDGE_BASE.append(
        Document(page_content=record["text"], metadata={"source": record["source"]})
    )

print(f"RAW_KNOWLEDGE_DATABASE: {len(RAW_KNOWLEDGE_BASE)}")
print(f"Evaluation dataset prior to removing duplicates: {len(eval_dataset)}")
# Questions are derived from program names, so several programs can map to the
# same question; only the first sample per question is kept.
eval_dataset = remove_duplicates(eval_dataset)
print(f"Evaluation dataset after to removing duplicates: {len(eval_dataset)}")

  0%|          | 0/311 [00:00<?, ?it/s]

RAW_KNOWLEDGE_DATABASE: 311
Evaluation dataset prior to removing duplicates: 286
Evaluation dataset after to removing duplicates: 189


# Build a RAG System

## Embeddings

In [7]:
def _split_documents(
    chunk_size: int,
    knowledge_base: List[Document],
    tokenizer_name: Optional[str],
) -> List[Document]:
    """
    Chunk every document of `knowledge_base` and return the distinct chunks.

    The splitter measures length with the Hugging Face tokenizer named
    `tokenizer_name`, targets `chunk_size` tokens per chunk and overlaps
    neighbouring chunks by a tenth of `chunk_size` (rounded down). A chunk
    whose text was already produced is dropped, so each distinct chunk text
    appears once, in first-seen order.
    """
    print(f'Tokenizer: {tokenizer_name}, Chunk Size: {chunk_size}')

    # Split points handed to the recursive splitter: JavaScript statement
    # keywords first, then blank lines, newlines, spaces and single characters.
    js_separators = [
        "\nfunction ",
        "\nconst ",
        "\nlet ",
        "\nvar ",
        "\nclass ",
        "\nif ",
        "\nfor ",
        "\nwhile ",
        "\nswitch ",
        "\ncase ",
        "\ndefault ",
        "\n\n",
        "\n",
        " ",
        "",
    ]

    # FIXME: Is it necessary to pass the separators here? Try without them.
    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=js_separators,
    )

    # Split document by document and filter repeated chunk texts on the fly.
    seen_texts = set()
    unique_chunks = []
    for source_doc in knowledge_base:
        for chunk in splitter.split_documents([source_doc]):
            if chunk.page_content in seen_texts:
                continue
            seen_texts.add(chunk.page_content)
            unique_chunks.append(chunk)

    return unique_chunks



def load_embeddings(
    langchain_docs: List[Document],
    chunk_size: int,
    embedding_model_name: Optional[str]
) -> FAISS:
    """
    Return a FAISS index for the documents, building and saving it on first use.

    The index is stored under ``./data/indexes/`` in a folder named after the
    chunk size and the embedding model. If that folder exists it is loaded
    as-is; otherwise the documents are split, embedded, indexed and saved.

    Args:
        langchain_docs: list of documents
        chunk_size: size of the chunks to split the documents into
        embedding_model_name: name of the embedding model to use; the same name
            is passed on as the tokenizer name when splitting

    Returns:
        FAISS index
    """
    embedding_model = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        multi_process=True,
        model_kwargs={"device": "cuda"},
        encode_kwargs={"normalize_embeddings": True},  # set True to compute cosine similarity
    )

    # The folder name depends only on the chunk size and the model name, not on
    # the documents, so an existing index is reused even if the docs changed.
    safe_model_name = embedding_model_name.replace('/', '~')
    index_name = f"index_chunk:{chunk_size}_embeddings:{safe_model_name}"
    index_folder_path = f"./data/indexes/{index_name}/"

    if not os.path.isdir(index_folder_path):
        print("Index not found, generating it...")

        chunks = _split_documents(
            chunk_size,
            langchain_docs,
            embedding_model_name,
        )
        knowledge_index = FAISS.from_documents(
            chunks, embedding_model, distance_strategy=DistanceStrategy.COSINE
        )
        knowledge_index.save_local(index_folder_path)
        return knowledge_index

    # NOTE(review): allow_dangerous_deserialization opts into loading pickled
    # data; only point this at index folders written by this notebook.
    return FAISS.load_local(
        index_folder_path,
        embedding_model,
        distance_strategy=DistanceStrategy.COSINE,
        allow_dangerous_deserialization=True
    )

## Inference

### Prepare system and user messages

In [8]:
def read_user_messages():
    """Return the distinct user messages of action events from the recorded requests.

    Reads ``community/voyager5/openai_requests.json`` and keeps the message of
    the second prompt of every record when it starts with
    ``'Code from the last round:'``.

    Returns:
        The distinct messages as a list, in order of first appearance.
    """
    # Load the 'database' file; the context manager closes the handle.
    with open("community/voyager5/openai_requests.json", "r") as requests_file:
        answers = json.load(requests_file)

    # Get only user messages contained in action events
    candidates = (record['prompts'][1]['message'] for record in answers)
    user_messages = [msg for msg in candidates if msg.startswith('Code from the last round:')]

    # Use regular expression to replace the task description regardless of what it is
    # user_messages = [ re.sub(r"(Task:).*", "Task: {question}", msg) for msg in user_messages ]

    # dict.fromkeys de-duplicates while keeping first-seen order. A set would
    # hand the messages back in an order that can change between interpreter
    # runs, which defeats seeding the random module afterwards.
    return list(dict.fromkeys(user_messages))

def render_system_message(skills=[]):
    """Render the system message of the RAG action prompt.

    The "action_template_rag" prompt is filled with the control-primitive
    programs (plus any extra `skills`, joined by blank lines), the
    "action_response_format_rag" response format and an empty context.

    Args:
        skills: extra program texts appended after the control primitives.
            The default list is only read, never mutated.

    Returns:
        The formatted system message.
    """
    system_template = load_prompt("action_template_rag")

    # FIXME: Hardcoded control_primitives
    # TODO Razvan: "useChest" and "mineflayer" are always imported since
    # questllama's model context size is enough
    base_skills = [
        "exploreUntil",
        "mineBlock",
        "craftItem",
        "placeItem",
        "smeltItem",
        "killMob",
        "useChest",
        "mineflayer",
    ]
    primitives = load_control_primitives_context(base_skills)
    programs = "\n\n".join(primitives + skills)
    response_format = load_prompt("action_response_format_rag")

    return SystemMessagePromptTemplate.from_template(system_template).format(
        programs=programs, response_format=response_format, context=""
    )

In [14]:
# Render the system message once; its content becomes the system part of the
# prompt template built below.
system_message = render_system_message()

# Here are sample user messages, containing the state of the Minecraft world.
# NOTE(review): `user_messages` is not referenced again in this cell.
user_messages = read_user_messages()
random.seed(422)

# Below are some further examples (w/o context) to augment the size of the training set.
# `default_config` is used as the human-message template of the prompt.
task_type = 'action'
default_config = U.debug_load_prompt(f"/debugging/{task_type}/user_rag.txt")

# TODO Razvan add new test samples: user_messages = [default_config] + user_messages

# NOTE(review): re-seeds the generator; no call to `random` is visible in this
# cell between seed(422) above and this line - confirm the first seed is needed.
random.seed(42)
# This is the template: system text plus the human template. It is formatted
# with the `context` and `question` variables below.
# NOTE(review): the already rendered system text is parsed as a template again
# here - confirm that any braces left in it are meant to be placeholders.
RAG_PROMPT_TEMPLATE = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(system_message.content),
        HumanMessagePromptTemplate.from_template(default_config)
    ]
)

# Smoke test of the template; as the last expression its value is the cell output.
RAG_PROMPT_TEMPLATE.format(context="test", question="Mine 1 wood log")
# For debugging purposes.
# TEMP = RAG_PROMPT_TEMPLATE.format(context="here should files", question='Mine 100 wood log')
# outputfile = "logs/rag_evaluation/test.txt"
# with open(outputfile, "w") as f:
#    f.write(TEMP)

# print(TEMP)


'System: System Definition:\nYou are a helpful assistant that writes Mineflayer javascript code to complete any Minecraft task specified by me.\nGenerate code strictly according to the given specifications. The code should be fully functional and ready to execute as provided, requiring no further modifications.\n\n\n---\nGuidelines:\nPlease pay close attention to the following points to avoid repeating past mistakes. The goal is to offer you information based on past mistakes, helping you avoid making them again in the future. Try to generalise whenever possible the associated example.\n\n1) Mistake: Not identifying specific items within a broad category to complete a task.\n   Concept To complete a task defined by a broad category, first understand the category and then choose any specific item within it to fulfill the requirement. \n   Example: in a task like "Mine 1 wood log," the term "wood log" is a broad category that includes specifics such as oak, birch, or spruce logs. Simply 

### RAG Methods

In [6]:
from ragatouille import RAGPretrainedModel
from langchain_core.vectorstores import VectorStore
from langchain_core.language_models.llms import LLM


def answer_with_rag(
    question: str,
    llm: LLM,
    knowledge_index: VectorStore,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[str, List[Document]]:
    """Answer a question using RAG with the given knowledge index.

    Retrieves `num_retrieved_docs` chunks for the question, optionally reranks
    them, keeps the first `num_docs_final`, injects them into the module-level
    `RAG_PROMPT_TEMPLATE` and invokes `llm` on the resulting prompt.

    Returns:
        A pair ``(answer, relevant_docs)``: whatever `llm.invoke` returned and
        the list of chunk texts (plain strings) placed in the prompt.
    """
    # Gather documents with the retriever and keep only their text.
    retrieved = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [hit.page_content for hit in retrieved]

    # Optionally rerank results
    if reranker:
        reranked = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [hit["content"] for hit in reranked]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt: each chunk is prefixed by a numbered header and
    # the sections are concatenated without any extra separator.
    sections = []
    for position, text in enumerate(relevant_docs):
        sections.append(f"Document {str(position)}:::\n" + text)
    context = "\nExtracted documents:\n" + "".join(sections)

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Redact an answer
    return llm.invoke(final_prompt), relevant_docs

def run_rag_tests(
    eval_dataset: datasets.Dataset,
    llm: BaseChatModel,
    knowledge_index: VectorStore,
    output_file: str,
    reranker: Optional[RAGPretrainedModel] = None,
    verbose: Optional[bool] = True,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
    test_settings: Optional[str] = None,  # To document the test settings used
):
    """Runs RAG tests on the given dataset and saves the results to the given output file.

    Results already present in `output_file` are kept and their questions are
    skipped, so an interrupted run can be resumed. The file is rewritten after
    every newly answered example.

    Args:
        eval_dataset: iterable of samples with "question", "answer",
            "source_doc" and "description" keys.
        llm: chat model used to generate the answers.
        knowledge_index: vector store queried for relevant chunks.
        output_file: path of the JSON checkpoint/result file.
        reranker: optional reranker passed on to `answer_with_rag`.
        verbose: print the question, generated answer and reference answer.
        num_retrieved_docs: number of chunks retrieved before reranking.
        num_docs_final: number of chunks placed in the prompt.
        test_settings: free-text label stored with every result.
    """
    try:  # load previous generations if they exist
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except (OSError, ValueError):
        # Missing, unreadable or malformed checkpoint: start from scratch.
        # Unlike a bare `except`, this lets KeyboardInterrupt propagate.
        outputs = []

    # Set of answered questions, maintained incrementally instead of rebuilding
    # a list of all previous questions for every example.
    answered_questions = {output["question"] for output in outputs}

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in answered_questions:
            continue

        answer, relevant_docs = answer_with_rag(
            question,
            llm,
            knowledge_index,
            reranker=reranker,
            num_docs_final=num_docs_final,
            num_retrieved_docs=num_retrieved_docs,
        )
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer.content}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer.content,
            "retrieved_docs": list(relevant_docs),
            "description": example["description"]
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)
        answered_questions.add(question)

        # Checkpoint after every example so progress survives an interruption.
        with open(output_file, "w") as f:
            json.dump(outputs, f, indent=4)

## Evaluation

### Evaluator model

In [7]:
# EVALUATION_PROMPT = """###Task Description:
# An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
# 1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
# 2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
# 3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
# 4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

# ###The instruction to evaluate:
# {instruction}

# ###Response to evaluate:
# {response}

# ###Reference Answer (Score 5):
# {reference_answer}

# ###Score Rubrics:
# [Is the response correct, accurate, and factual based on the reference answer?]
# Score 1: The response is completely incorrect, inaccurate, and/or not factual.
# Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
# Score 3: The response is somewhat correct, accurate, and/or factual.
# Score 4: The response is mostly correct, accurate, and factual.
# Score 5: The response is completely correct, accurate, and factual.

# ###Feedback:"""

# SYNTAX_ACCURACY_PROMPT = """###Task Description:
# Will provide JavaScript code response to evaluate based on syntax accuracy. Include a reference answer that scores a 5. Focus solely on the correctness of the syntax, including variable declarations, use of operators, function definitions, and control structures. This format helps ensure that the syntax is assessed without requiring code execution.
# 1. Write detailed feedback assessing the quality of the response, focusing specifically on syntax accuracy.
# 2. Assign a score from 1 to 5 based on the rubrics after providing feedback.
# 3. Format your output as follows: "Feedback: [write detailed feedback on syntax issues] [RESULT] [score from 1 to 5]"
# 4. Exclude any additional commentary beyond the feedback and score. Ensure the inclusion of [RESULT] in your output.

# ###Response to evaluate:
# {response}

# ###Reference Answer (Score 5):
# {reference_answer}

# ###Score Rubrics:
# [Syntax Accuracy]
# Score 1: Contains multiple syntax errors, preventing execution.
# Score 2: Some syntax errors present, affects overall functionality.
# Score 3: Minor syntax errors, do not significantly impact functionality.
# Score 4: Very minor syntax inaccuracies, nearly perfect.
# Score 5: Perfectly accurate syntax, completely error-free.

# ###Feedback:"""


# """###Predefined Functions and Practices:
# The model has access to several predefined helper functions that are designed to simplify routine operations and enhance code efficiency. While these functions are available for use, their application should be contextually appropriate to the specific task being performed:
# - `mineBlock(bot, name, count)`: Collects specified blocks, suitable for tasks involving resource collection.
# - `craftItem(bot, name, count)`: Crafts items, applicable for tasks requiring item assembly.
# - `smeltItem(bot, name, count)`: Manages item smelting, relevant for tasks involving material processing.
# - `placeItem(bot, name, position)`: Places items, useful for building and construction tasks.
# - `killMob(bot, name, timeout)`: Handles mob engagement and elimination, necessary for combat-related tasks.

# It is not expected that all these functions will be used in every solution; rather, their use should be dictated by the requirements of the specific task at hand. The absence of a function in a solution where it is not relevant should not negatively impact the evaluation. The emphasis of the evaluation should be on the effective use of these functions when they are applicable to the task, ensuring they are not marked as undefined if used and not penalizing their non-use when they are not relevant to the task objectives."""

# TODO Razvan simplify the prompt
# ###Guidelines for Code Writing:
# 1. The main function must be an async function that takes `bot` as its sole argument.
# 2. Internal variables and setup must be confined within the function to ensure reusability and independence from external state changes.
# 3. Progress should be communicated through `bot.chat` to indicate milestones or status updates within the task.
# 4. Exploration and item collection must adapt to changing conditions, employing `exploreUntil` with random directions to avoid predictable patterns.
# 5. Maintain a strict operational boundary with a maximum distance of 32 blocks for finding blocks or entities.
# 6. Avoid the use of infinite loops, recursive functions, and event listeners to prevent performance degradation and unpredictable behavior.
# 7. The function's name should reflect its purpose clearly and be indicative of the task it performs.


# EVALUATION_PROMPT = """###Task Description:
# An instruction including a JavaScript task, a response to evaluate, a reference answer that scores a 5, and scoring rubrics focused on syntax accuracy and adherence to the task description. This format helps ensure the generated code aligns closely with the intended task without requiring code execution for assessment.
# 1. Write detailed feedback assessing the quality of the response, focusing on syntax accuracy and how well the response adheres to the task requirements.
# 2. Assign a score from 1 to 5 based on the rubrics after providing feedback.
# 3. Format your output as follows: "Feedback: {{write feedback for each criterion}} [RESULT] {{score from 1 to 5}}"
# 4. Exclude any additional commentary beyond the feedback and score. Ensure the inclusion of [RESULT] in your output.

# ###The instruction to evaluate:
# {instruction}

# ###Response to evaluate:
# {response}

# ###Reference Answer (Score 5):
# {reference_answer}

# ###Score Rubrics:
# [Syntax Accuracy]
# Score 1: Contains multiple syntax errors, preventing execution.
# Score 2: Some syntax errors present, affects overall functionality.
# Score 3: Minor syntax errors, do not significantly impact functionality.
# Score 4: Very minor syntax inaccuracies, nearly perfect.
# Score 5: Perfectly accurate syntax, completely error-free.

# [Task Adherence]
# Score 1: Does not address the task requirements.
# Score 2: Partially addresses the task but misses key aspects.
# Score 3: Addresses the task adequately, though some aspects could be better aligned.
# Score 4: Very closely adheres to the task with minor deviations.
# Score 5: Perfectly aligns with the task requirements, fully accomplishing the specified objectives.

# ###Feedback:"""

# Prompt given to the evaluator model. It has three placeholders - {instruction},
# {response} and {reference_answer} - and asks for a reply of the form
# "Feedback: ... [RESULT] <score 1-5>"; the text before and after the
# "[RESULT]" marker is what the evaluation code stores as feedback and score.
EVALUATION_PROMPT = """
###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)\"
4. Please do not generate any other opening, closing, and explanations.

###The Instruction to evaluate:
{instruction}

###Response to evalute:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
The model has access to the following predefined helper functions: `mineBlock`, `craftItem`, `smeltItem`, `placeItem` and `killMob` which may be used if needed.
Is the response correct, accurate, and factual based on the reference answer?
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:
"""
# Chat template: a fixed system message followed by the prompt above as the
# human message.
evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)

### Evaluator Methods

In [8]:
def evaluate_answers(
    answer_path: str,
    eval_chat_model: BaseChatModel,
    evaluator_name: str,
    evaluation_prompt_template: ChatPromptTemplate
) -> None:
    """Evaluates generated answers. Modifies the given answer file in place for better checkpointing.

    Every entry of the JSON file that has no score from `evaluator_name` yet is
    sent to the evaluator. The reply is split on its last "[RESULT]" marker:
    the text before it is stored as feedback, the text after it as score.
    Replies without the marker are reported and left unscored, so they are
    retried on the next call instead of aborting the whole run.

    Args:
        answer_path: path of the JSON file written by the RAG test run.
        eval_chat_model: chat model acting as the judge.
        evaluator_name: label used in the "eval_score_*" / "eval_feedback_*" keys.
        evaluation_prompt_template: template taking `instruction`, `response`
            and `reference_answer`.
    """
    answers = []
    if os.path.isfile(answer_path):  # load previous generations if they exist
        with open(answer_path, "r") as f:
            answers = json.load(f)

    score_key = f"eval_score_{evaluator_name}"
    feedback_key = f"eval_feedback_{evaluator_name}"

    for experiment in tqdm(answers):

        if score_key in experiment:
            continue

        print("\n===========================================")

        eval_prompt = evaluation_prompt_template.format_messages(
            instruction=experiment["question"],
            response=experiment["generated_answer"],
            reference_answer=experiment["true_answer"],
        )
        eval_result = eval_chat_model.invoke(eval_prompt)

        # rpartition never raises: a two-way unpacking of split("[RESULT]")
        # fails when the marker is missing or appears more than once.
        feedback, marker, score = eval_result.content.rpartition("[RESULT]")
        if not marker:
            print(
                f"\nNo [RESULT] marker in the evaluator output for "
                f"{experiment['question']!r}; leaving it unscored."
            )
            continue

        experiment[score_key] = score.strip()
        experiment[feedback_key] = feedback.strip()

        # Checkpoint after every scored entry.
        with open(answer_path, "w") as f:
            json.dump(answers, f, indent=4)

# Run Tests

Parameters to fine-tune:
Reader models:
- models used: deepseek-coder:6.7b-instruct-q5_K_M, deepseek-coder:33b-instruct-q5_K_M
- temperature used
- model_kwargs={ "top_k":30, "repetition_penalty": 1.03 }

Splitter chunk size:
- 128 is the maximum for st-codesearch-distilroberta-base, see: SentenceTransformer('thenlper/gte-small').max_seq_length



In [1]:
# exist_ok replaces the separate existence check.
os.makedirs("./output", exist_ok=True)


evaluator_name = "gattipg/prometheus:13b-v1.0-Q5_K_M"
eval_chat_model = ChatOpenAI(
    base_url = 'http://localhost:11434/v1',
    api_key='ollama', # required, but unused
    temperature=0,
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
    model=evaluator_name)

# The knowledge index depends only on (chunk size, embedding model) and the
# reranker on nothing in the grid, so both are loaded once and reused instead
# of being reloaded for every combination of the inner loops.
knowledge_index_cache = {}
colbert_reranker = None

for reader_model in ["llama3:70b-instruct", "deepseek-coder:33b-instruct-q5_K_M"]: # deepseek-coder:33b-instruct-q5_K_M
    READER_LLM = ChatOpenAI(
        base_url = 'http://localhost:11434/v1',
        api_key='ollama', # required, but unused
        temperature=0.0,
        streaming=True,
        callbacks=[StreamingStdOutCallbackHandler()],
        model=reader_model
    )
    # TODO Razvan Make sure that the prompt generated here and the prompt generated in the simulation are the same
    # TODO Razvan Test codebert retriever as well
    for num_docs_final in [3, 5]:
        for num_retrived_docs in [30]:
            for chunk_size in [128]:
                for embeddings in ["flax-sentence-embeddings/st-codesearch-distilroberta-base", "microsoft/codebert-base"]: # "thenlper/gte-small",
                    for rerank in [True, False]:
                        # The settings string doubles as the result file name
                        # and as the label stored with every result.
                        settings_name = f"chunk:{chunk_size}_embeddings:{embeddings.replace('/', '~')}_rerank:{rerank}_numDocsFinal:{num_docs_final}_numRetrivedDocs:{num_retrived_docs}_reader-model:{reader_model.replace('/', '~').replace('_', '~').replace(':', '~')}_evaluator:{evaluator_name.replace('/', '~').replace('_', '~').replace(':', '~')}"
                        output_file_name = f"./output/rag_{settings_name}.json"

                        print(f"\nRunning evaluation for {settings_name}:")

                        print("Loading knowledge base embeddings...")
                        index_key = (chunk_size, embeddings)
                        if index_key not in knowledge_index_cache:
                            knowledge_index_cache[index_key] = load_embeddings(
                                RAW_KNOWLEDGE_BASE,
                                chunk_size=chunk_size,
                                embedding_model_name=embeddings,
                            )
                        knowledge_index = knowledge_index_cache[index_key]

                        print("Running RAG...")
                        if rerank and colbert_reranker is None:
                            colbert_reranker = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
                        reranker = colbert_reranker if rerank else None
                        # NOTE(review): only the first three samples are run.
                        run_rag_tests(
                            eval_dataset=eval_dataset[:3],
                            llm=READER_LLM,
                            knowledge_index=knowledge_index,
                            output_file=output_file_name,
                            reranker=reranker,
                            verbose=True,
                            test_settings=settings_name,
                            num_retrieved_docs=num_retrived_docs,
                            num_docs_final=num_docs_final
                        )

                        print("\nRunning evaluation...")
                        evaluate_answers(
                            output_file_name,
                            eval_chat_model,
                            evaluator_name,
                            evaluation_prompt_template
                        )

KeyboardInterrupt: 

# Display results

In [26]:
# # TODO to remove
# settings_name = "chunk:128_embeddings:flax-sentence-embeddings~st-codesearch-distilroberta-base_rerank:True_numDocsFinal:3_numRetrivedDocs:30_reader-model:deepseek-coder~33b-instruct-q5~K~M_evaluator:gattipg~prometheus~13b-v1.0-Q5~K~M"
# answers = []
# answer_path = f"./output/rag_{settings_name}.json"
# if os.path.isfile(answer_path):  # load previous generations if they exist
#     answers = json.load(open(answer_path, "r"))

# for element in answers:
#     # Use the pop method to remove the field
#     element.pop('eval_score_gattipg/prometheus:13b-v1.0-Q5_K_M', None) 
#     element.pop('eval_feedback_gattipg/prometheus:13b-v1.0-Q5_K_M', None) 

# with open(answer_path, 'w') as file:
#     json.dump(answers, file, indent=4)

In [None]:
"""[The model has access to the following predefined helper functions: `mineBlock`, `craftItem`, `smeltItem`, `placeItem` and `killMob` which may be used if needed.
Is the response correct, accurate, and factual based on the reference answer?]

Score 1: The code is largely non-functional or irrelevant to the task.
Score 2: The code attempts to address the task but is incomplete or incorrect in its approach.
Score 3: The code is functional and makes a reasonable attempt at the task.
Score 4: The code meets the main objective. Minor errors don’t significantly impact the overall functionality.
Score 5: The code achieves the task efficiently and handles potential errors well.
"""

# NOTE(review): this re-declares EVALUATION_PROMPT and
# evaluation_prompt_template, shadowing the definitions of the evaluator cell
# once this cell has run. This copy is the one used by the debugging loop
# below - confirm the two copies are meant to stay in sync.
EVALUATION_PROMPT = """
###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)\"
4. Please do not generate any other opening, closing, and explanations.

###The Instruction to evaluate:
{instruction}

###Response to evalute:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
The model has access to the following predefined helper functions: `mineBlock`, `craftItem`, `smeltItem`, `placeItem` and `killMob` which may be used if needed.
Is the response correct, accurate, and factual based on the reference answer?
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:
"""
# Chat template: a fixed system message followed by the prompt above as the
# human message.
evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)
# Debugging aid: run the evaluator on the first stored answer of one fixed
# configuration, print the prompt it receives and dump its verdict to a file.
settings_name = "chunk:128_embeddings:flax-sentence-embeddings~st-codesearch-distilroberta-base_rerank:False_numDocsFinal:3_numRetrivedDocs:30_reader-model:deepseek-coder~33b-instruct-q5~K~M_evaluator:gattipg~prometheus~13b-v1.0-Q5~K~M"
answers = []
answer_path = f"./output/rag_{settings_name}.json"
if os.path.isfile(answer_path):  # load previous generations if they exist
    with open(answer_path, "r") as answers_file:
        answers = json.load(answers_file)

# Make sure the log directory exists before the loop tries to write into it.
debug_log_path = 'logs/rag_evaluation/evaluator_debug.txt'
os.makedirs(os.path.dirname(debug_log_path), exist_ok=True)

print(len(answers))
for experiment in tqdm(answers[:1]):

    print("\n===========================================")

    eval_prompt = evaluation_prompt_template.format_messages(
        instruction=experiment["question"],
        response=experiment["generated_answer"],
        reference_answer=experiment["true_answer"],
    )
    print(eval_prompt[1].content)
    eval_result = eval_chat_model.invoke(eval_prompt)

    # Split on the last "[RESULT]" marker. A reply without the marker is kept
    # whole as feedback (with an empty score) so it can still be inspected,
    # instead of failing on the two-way unpacking.
    raw_output = eval_result.content
    if "[RESULT]" in raw_output:
        feedback, score = [item.strip() for item in raw_output.rsplit("[RESULT]", 1)]
    else:
        print("\nEvaluator output contains no [RESULT] marker.")
        feedback, score = raw_output.strip(), ""
    experiment[f"eval_score_{evaluator_name}"] = score
    experiment[f"eval_feedback_{evaluator_name}"] = feedback

    with open(debug_log_path, 'w') as file:
        # Write the feedback followed by the score on its own line
        file.write(experiment[f"eval_feedback_{evaluator_name}"])
        file.write(f"\n{experiment[f'eval_score_{evaluator_name}']}")

48


  0%|          | 0/1 [00:00<?, ?it/s]



###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: "Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)"
4. Please do not generate any other opening, closing, and explanations.

###The Instruction to evaluate:
How to mine wood log in Minecraft?

###Response to evalute:
Explain: The code does not complete the task because it doesn't define a function to mine wood logs. It only provides helper functions for other tasks such as mining blocks, crafting items, smelting items, and placing items. 

Plan:
1) Defi

## Prepare the output

In [10]:
# Load every result file into one frame, tagging each row with its file path
# (the path encodes the configuration). Sorting the paths keeps the row order
# independent of the filesystem listing order.
outputs = []
for file in sorted(glob.glob("./output/*.json")):
    with open(file, "r") as results_file:
        output = pd.DataFrame(json.load(results_file))
    output["settings"] = file
    outputs.append(output)
result = pd.concat(outputs)

score_column = f"eval_score_{evaluator_name}"

# Scores are stored as strings; anything that is not a string (for example a
# missing score) counts as the lowest score, 1.
result[score_column] = result[score_column].apply(lambda x: int(x) if isinstance(x, str) else 1)
# Map the 1..5 scale onto 0..1.
result[score_column] = (result[score_column] - 1) / 4


average_scores = result.groupby("settings")[score_column].mean()
# sort_values returns a new Series; without the assignment the sort was lost.
average_scores = average_scores.sort_values()
scaled_values = pd.Series(average_scores * 100)
scaled_values


settings
./output/rag_chunk:128_embeddings:flax-sentence-embeddings~st-codesearch-distilroberta-base_rerank:False_numDocsFinal:3_numRetrivedDocs:30_reader-model:deepseek-coder~33b-instruct-q5~K~M_evaluator:gattipg~prometheus~13b-v1.0-Q5~K~M.json    22.395833
./output/rag_chunk:128_embeddings:flax-sentence-embeddings~st-codesearch-distilroberta-base_rerank:True_numDocsFinal:3_numRetrivedDocs:30_reader-model:deepseek-coder~33b-instruct-q5~K~M_evaluator:gattipg~prometheus~13b-v1.0-Q5~K~M.json     25.000000
Name: eval_score_gattipg/prometheus:13b-v1.0-Q5_K_M, dtype: float64

## Show bar charts

In [11]:
# Display the bar chart: one bar per configuration, coloured by its score.
axis_labels = {
    "value": "Accuracy",
    "settings": "Configuration",
}
fig = px.bar(
    scaled_values,
    color=scaled_values,
    labels=axis_labels,
    color_continuous_scale="bluered",
)

layout_options = dict(
    width=1000,
    height=600,
    barmode="group",
    yaxis_range=[0, 100],
    title="<b>Accuracy of different RAG configurations</b>",
    xaxis_title="RAG settings",
    font=dict(size=15),
)
fig.update_layout(**layout_options)

# Percent suffix on the y axis, no colour bar, value printed above each bar.
fig.layout.yaxis.ticksuffix = "%"
fig.update_coloraxes(showscale=False)
fig.update_traces(texttemplate="%{y:.1f}", textposition="outside")
fig.show()
