# Configuration and Imports

NOTE: Execute the javascript code to check for errors.

In [17]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import glob
import json
import datasets
from langchain.chat_models.base import BaseChatModel

from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate
)
import random
from _voyager.control_primitives_context import load_control_primitives_context
from _voyager.prompts import load_prompt
from langchain.schema import SystemMessage
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from transformers import AutoTokenizer
from langchain_core.documents import Document
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
import os
import re
from langchain_community.chat_models.openai import ChatOpenAI
from questllama.core.utils import file_utils as U
import plotly.express as px
pd.set_option("display.max_colwidth", None)

# core_retriever.py

CONFIG = {
    'embeddings': ["flax-sentence-embeddings/st-codesearch-distilroberta-base"],
    'chunk_size': 128, # larger chunk_size is not supported by this tokenizer
    'inference_client': { 
        'name': 'deepseek-coder:6.7b-instruct-q5_K_M',
        'temperature': 0.0
    },
    'judge': {
        'name':"gattipg/prometheus:13b-v1.0-Q5_K_M",
        'temperature': 0.0
    },
    'task_type': "action"
}


# Read the dataset 

In [18]:
def unfold_camel(camel_str):
    # Insert a space before each uppercase letter and convert the entire string to lowercase
    return re.sub(r'(?<!^)(?=[A-Z])', ' ', camel_str).lower()


def get_question(task):
    question = (
        f"How to {unfold_camel(task.replace('_', ' ').replace(' ore', '').replace(' ores', '').replace('.', '').strip())}"
        f" in Minecraft?"
    )
    return question

def read_json_files_recursively(root_dir):
    all_data = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename.endswith('.json'):
                filepath = os.path.join(dirpath, filename)
                with open(filepath, 'r') as file:
                    data = json.load(file)
                    all_data.append((filepath, data))
    return all_data

json_files = read_json_files_recursively("questllama/skill_library")
eval_dataset = []
for i, trial in enumerate(json_files):
    for program_name, info in trial[1].items(): # info contains code and scription of a program
        question = get_question(program_name)
        eval_dataset.append({'question': question, 'answer': info['code'], 'description': info['description'], 'source_doc': f'{"/".join(trial[0].split("/")[:-1])}/code/{program_name}.js'})


## Read the dataset
ds = U.read_skill_library("skill_library", full_path=True)
ds = [ {'text': elem[1], 'source': elem[0] } for elem in ds]
RAW_KNOWLEDGE_BASE = [Document(page_content=doc["text"], metadata={"source": doc["source"]}) for doc in tqdm(ds)]



  0%|          | 0/311 [00:00<?, ?it/s]

In [19]:

questions = set([ item['question'] for item in eval_dataset ])

result = []
seen_ids = set()

for item in eval_dataset:
    if item['question'] not in seen_ids:
        result.append(item)
        seen_ids.add(item['question'])

eval_dataset = result

# Build a RAG System

## Embeddings

In [20]:
def _split_documents(
    chunk_size: int,
    knowledge_base: List[Document],
    tokenizer_name: Optional[str],
) -> List[Document]:
    """
    Split documents into chunks of maximum size `chunk_size` tokens and return a list of documents.
    """
    JAVASCRIPT_SEPARATORS = [
        "\nfunction ",
        "\nconst ",
        "\nlet ",
        "\nvar ",
        "\nclass ",
        "\nif ",
        "\nfor ",
        "\nwhile ",
        "\nswitch ",
        "\ncase ",
        "\ndefault ",
        "\n\n",
        "\n",
        " ",
        "",
    ]
    print(f'Tokenizer: {tokenizer_name}, Chunk Size: {chunk_size}')
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # FIXME: Is it necessary to pass the separators here? Try without them.
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer,
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=JAVASCRIPT_SEPARATORS,
    )

    docs_processed = []
    for doc in knowledge_base:
        docs_processed += text_splitter.split_documents([doc])

    # Remove duplicates
    unique_texts = {}
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in unique_texts:
            unique_texts[doc.page_content] = True
            docs_processed_unique.append(doc)

    return docs_processed_unique



def load_embeddings(
    langchain_docs: List[Document],
    chunk_size: int,
    embedding_model_name: Optional[str] = "thenlper/gte-small",
) -> FAISS:
    """
    Creates a FAISS index from the given embedding model and documents. Loads the index directly if it already exists.

    Args:
        langchain_docs: list of documents
        chunk_size: size of the chunks to split the documents into
        embedding_model_name: name of the embedding model to use

    Returns:
        FAISS index
    """
    # load embedding_model
    embedding_model = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        multi_process=True,
        model_kwargs={"device": "cuda"},
        encode_kwargs={"normalize_embeddings": True},  # set True to compute cosine similarity
    )

    # Check if embeddings already exist on disk
    index_name = f"index_chunk:{chunk_size}_embeddings:{embedding_model_name.replace('/', '~')}"
    index_folder_path = f"./data/indexes/{index_name}/"
    if os.path.isdir(index_folder_path):
        return FAISS.load_local(
            index_folder_path,
            embedding_model,
            distance_strategy=DistanceStrategy.COSINE,
            allow_dangerous_deserialization=True
        )
    else:
        print("Index not found, generating it...")

        docs_processed = _split_documents(
            chunk_size,
            langchain_docs,
            embedding_model_name,
        )
        knowledge_index = FAISS.from_documents(
            docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
        )
        knowledge_index.save_local(index_folder_path)
        return knowledge_index

## Inference
### Model definition

In [21]:
READER_LLM = ChatOpenAI(
    base_url = 'http://localhost:11434/v1',
    api_key='ollama', # required, but unused
    temperature=CONFIG['inference_client']['temperature'],
    streaming=False, 
    callbacks=[StreamingStdOutCallbackHandler()],
    model=CONFIG['inference_client']['name'],
    # model_kwargs={
    #     "max_new_tokens": 512,
    #     "top_k": 30,
    #     "repetition_penalty": 1.03,
    # },
)

### Prepare system and user messages

In [29]:
def read_user_messages():
    # Load the 'database' file
    answers = json.load(open("community/voyager5/openai_requests.json", "r"))

    # Get only user messages contained in action events
    user_messages = [msg['prompts'][1]['message'] for msg in answers if msg['prompts'][1]['message'].startswith('Code from the last round:')]

    # Use regular expression to replace the task description regardless of what it is
    # user_messages = [ re.sub(r"(Task:).*", "Task: {question}", msg) for msg in user_messages ]
    unique = set(user_messages)
    return list(unique)

def render_system_message(skills=[]):
    system_template = load_prompt("action_template_rag")
    # FIXME: Hardcoded control_primitives
    base_skills = [
        "exploreUntil",
        "mineBlock",
        "craftItem",
        "placeItem",
        "smeltItem",
        "killMob",
    ]
    if True: # NOTE: always import these files since questllama's model context size is enough
        base_skills += [
            "useChest",
            "mineflayer",
        ]
    programs = "\n\n".join(load_control_primitives_context(base_skills) + skills)
    response_format = load_prompt("action_response_format_rag")
    system_message_prompt = SystemMessagePromptTemplate.from_template(
        system_template
    )
    system_message = system_message_prompt.format(
        programs=programs, response_format=response_format, context=""
    )

    return system_message

In [52]:
# Read the system message, this one is unique
system_message = render_system_message()

# Here are sample user messages, containing the state of the Minecraft world.
user_messages = read_user_messages()
random.seed(422)

# Below are some further examples (w/o context) to augment the size of the training set.
default_config = U.debug_load_prompt("/debugging/" + CONFIG['task_type'] + "/user_rag.txt")

# NOTE add new test samples: user_messages = [default_config] + user_messages

random.seed(42)
# This is the template
RAG_PROMPT_TEMPLATE = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(system_message.content),
        HumanMessagePromptTemplate.from_template(default_config)
    ]
)

# For debugging purposes.
# RAG_PROMPT_TEMPLATE = RAG_PROMPT_TEMPLATE.format(context="here should files", question='Mine 100 wood log')
# outputfile = "logs/rag_evaluation/test.txt"
# with open(outputfile, "w") as f:
#    f.write(RAG_PROMPT_TEMPLATE)


### RAG Methods

In [53]:
from ragatouille import RAGPretrainedModel
from langchain_core.vectorstores import VectorStore
from langchain_core.language_models.llms import LLM


def answer_with_rag(
    question: str,
    llm: LLM,
    knowledge_index: VectorStore,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[str, List[Document]]:
    """Answer a question using RAG with the given knowledge index."""
    # Gather documents with retriever
    relevant_docs = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in relevant_docs]  # keep only the text

    # Optionally rerank results
    if reranker:
        relevant_docs = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in relevant_docs]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Redact an answer
    answer = llm.invoke(final_prompt)

    return answer, relevant_docs

def run_rag_tests(
    eval_dataset: datasets.Dataset,
    llm: BaseChatModel,
    knowledge_index: VectorStore,
    output_file: str,
    reranker: Optional[RAGPretrainedModel] = None,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
):
    """Runs RAG tests on the given dataset and saves the results to the given output file."""
    try:  # load previous generations if they exist
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except:
        outputs = []

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in [output["question"] for output in outputs]:
            continue

        answer, relevant_docs = answer_with_rag(question, llm, knowledge_index, reranker=reranker)
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer.content}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer.content,
            "retrieved_docs": [doc for doc in relevant_docs],
            "description": example["description"]
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)

   
        with open(output_file, "w") as f:
            json.dump(outputs, f, indent=4)

## Evaluation

### Evaluator model

In [54]:
# EVALUATION_PROMPT = """###Task Description:
# An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
# 1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
# 2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
# 3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
# 4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

# ###The instruction to evaluate:
# {instruction}

# ###Response to evaluate:
# {response}

# ###Reference Answer (Score 5):
# {reference_answer}

# ###Score Rubrics:
# [Is the response correct, accurate, and factual based on the reference answer?]
# Score 1: The response is completely incorrect, inaccurate, and/or not factual.
# Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
# Score 3: The response is somewhat correct, accurate, and/or factual.
# Score 4: The response is mostly correct, accurate, and factual.
# Score 5: The response is completely correct, accurate, and factual.

# ###Feedback:"""

EVALUATION_PROMPT = """###Task Description:
Provide an instruction including a JavaScript task, a response to evaluate, a reference answer that scores a 5, and scoring rubrics focused on syntax accuracy and adherence to the task description. This format helps ensure the generated code aligns closely with the intended task without requiring code execution for assessment.
1. Write detailed feedback assessing the quality of the response, focusing on syntax accuracy and how well the response adheres to the task requirements.
2. Assign a score from 1 to 5 based on the rubrics after providing feedback.
3. Format your output as follows: "Feedback: {{write feedback for each criterion}} [RESULT] {{score from 1 to 5}}"
4. Exclude any additional commentary beyond the feedback and score. Ensure the inclusion of [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Syntax Accuracy]
Score 1: Contains multiple syntax errors, preventing execution.
Score 2: Some syntax errors present, affects overall functionality.
Score 3: Minor syntax errors, do not significantly impact functionality.
Score 4: Very minor syntax inaccuracies, nearly perfect.
Score 5: Perfectly accurate syntax, completely error-free.

[Task Adherence]
Score 1: Does not address the task requirements.
Score 2: Partially addresses the task but misses key aspects.
Score 3: Addresses the task adequately, though some aspects could be better aligned.
Score 4: Very closely adheres to the task with minor deviations.
Score 5: Perfectly aligns with the task requirements, fully accomplishing the specified objectives.

###Feedback:"""

evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(content="You are a fair evaluator language model."),
        HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT),
    ]
)

evaluator_name = CONFIG['judge']['name']
eval_chat_model = ChatOpenAI(
    base_url = 'http://localhost:11434/v1',
    api_key='ollama', # required, but unused
    temperature=CONFIG['judge']['temperature'],
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
    model=evaluator_name)

### Evaluator Methods

In [55]:
def evaluate_answers(
    answer_path: str,
    eval_chat_model: BaseChatModel,
    evaluator_name: str,
    evaluation_prompt_template: ChatPromptTemplate,
) -> None:
    """Evaluates generated answers. Modifies the given answer file in place for better checkpointing."""
    answers = []
    if os.path.isfile(answer_path):  # load previous generations if they exist
        answers = json.load(open(answer_path, "r"))

    for experiment in tqdm(answers):
        if f"eval_score_{evaluator_name}" in experiment:
            continue

        eval_prompt = evaluation_prompt_template.format_messages(
            instruction=experiment["question"],
            response=experiment["generated_answer"],
            reference_answer=experiment["true_answer"],
        )
        eval_result = eval_chat_model.invoke(eval_prompt)
        feedback, score = [item.strip() for item in eval_result.content.split("[RESULT]")]
        experiment[f"eval_score_{evaluator_name}"] = score
        experiment[f"eval_feedback_{evaluator_name}"] = feedback

        with open(answer_path, "w") as f:
            json.dump(answers, f, indent=4)

# Run Tests

In [66]:
if not os.path.exists("./output"):
    os.mkdir("./output")

for chunk_size in [128]:  # Add other chunk sizes (in tokens) as needed
    for embeddings in ["flax-sentence-embeddings/st-codesearch-distilroberta-base"]:  # Add other embeddings as needed
        for rerank in [True, False]:
            settings_name = f"chunk:{chunk_size}_embeddings:{embeddings.replace('/', '~')}_rerank:{rerank}_reader-model:{CONFIG['inference_client']['name']}"
            output_file_name = f"./output/rag_{settings_name}.json"

            print(f"\nRunning evaluation for {settings_name}:")

            print("Loading knowledge base embeddings...")
            knowledge_index = load_embeddings(
                RAW_KNOWLEDGE_BASE,
                chunk_size=chunk_size,
                embedding_model_name=embeddings,
            )

            print("Running RAG...")
            reranker = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0") if rerank else None
            run_rag_tests(
                eval_dataset=eval_dataset[:2],
                llm=READER_LLM,
                knowledge_index=knowledge_index,
                output_file=output_file_name,
                reranker=reranker,
                verbose=True,
                test_settings=settings_name,
            )

            print("\nRunning evaluation...")
            evaluate_answers(
                output_file_name,
                eval_chat_model,
                evaluator_name,
                evaluation_prompt_template,
            )


Running evaluation for chunk:128_embeddings:flax-sentence-embeddings~st-codesearch-distilroberta-base_rerank:True_reader-model:deepseek-coder:6.7b-instruct-q5_K_M:
Loading knowledge base embeddings...
Running RAG...


  0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00, 80.09it/s]


System: System Definition:
You are a helpful assistant that writes Mineflayer javascript code to complete any Minecraft task specified by me.
Generate code strictly according to the given specifications. The code should be fully functional and ready to execute as provided, requiring no further modifications.


---
Guidelines:
Please pay close attention to the following points to avoid repeating past mistakes. The goal is to offer you information based on past mistakes, helping you avoid making them again in the future. Try to generalise whenever possible the associated example.

1) Mistake: Not identifying specific items within a broad category to complete a task.
   Concept To complete a task defined by a broad category, first understand the category and then choose any specific item within it to fulfill the requirement. 
   Example: in a task like "Mine 1 wood log," the term "wood log" is a broad category that includes specifics such as oak, birch, or spruce logs. Simply mine any one

100%|██████████| 1/1 [00:00<00:00, 99.69it/s]


System: System Definition:
You are a helpful assistant that writes Mineflayer javascript code to complete any Minecraft task specified by me.
Generate code strictly according to the given specifications. The code should be fully functional and ready to execute as provided, requiring no further modifications.


---
Guidelines:
Please pay close attention to the following points to avoid repeating past mistakes. The goal is to offer you information based on past mistakes, helping you avoid making them again in the future. Try to generalise whenever possible the associated example.

1) Mistake: Not identifying specific items within a broad category to complete a task.
   Concept To complete a task defined by a broad category, first understand the category and then choose any specific item within it to fulfill the requirement. 
   Example: in a task like "Mine 1 wood log," the term "wood log" is a broad category that includes specifics such as oak, birch, or spruce logs. Simply mine any one

  0%|          | 0/2 [00:00<?, ?it/s]


The response is well-structured and provides a clear explanation of how to mine wood logs in Minecraft using JavaScript. The plan outlines the steps needed to complete the task, which demonstrates a good understanding of the task requirements. However, there are minor syntax errors present in the code that do not significantly impact functionality. For example, the `bot.findBlocks` function should be called with 'oak_log' instead of `${woodType}_log`. This is a minor error and does not prevent the code from running correctly. The response also includes an explanation of how to check if we have enough sticks in our inventory before continuing to mine more logs, which aligns well with the task requirements. So the overall score is 4. [RESULT] 4
The response does not meet the task requirements as it is a JavaScript code for Minecraft, while the instruction was about crafting a table in Minecraft. The syntax of the code is also incorrect and contains multiple errors that prevent execution

  0%|          | 0/2 [00:00<?, ?it/s]

System: System Definition:
You are a helpful assistant that writes Mineflayer javascript code to complete any Minecraft task specified by me.
Generate code strictly according to the given specifications. The code should be fully functional and ready to execute as provided, requiring no further modifications.


---
Guidelines:
Please pay close attention to the following points to avoid repeating past mistakes. The goal is to offer you information based on past mistakes, helping you avoid making them again in the future. Try to generalise whenever possible the associated example.

1) Mistake: Not identifying specific items within a broad category to complete a task.
   Concept To complete a task defined by a broad category, first understand the category and then choose any specific item within it to fulfill the requirement. 
   Example: in a task like "Mine 1 wood log," the term "wood log" is a broad category that includes specifics such as oak, birch, or spruce logs. Simply mine any one

  0%|          | 0/2 [00:00<?, ?it/s]


The response is well-structured and provides a clear explanation of how to mine wood logs in Minecraft using JavaScript. The code provided is accurate and adheres closely to the task requirements. However, there are minor syntax errors that could be improved for better readability. For example, the `bot.findBlocks` function should have been defined as `bot.findBlock({matching: logType, maxDistance: 32})`, and the `mineBlock` function should have been defined as `bot.mineBlock(logType)`. These minor inaccuracies do not significantly impact the functionality of the code but could be improved for better readability. The response also provides a clear plan of action to mine wood logs, which aligns well with the task requirements. So the overall score is 4. [RESULT] 4
The response does not meet the task requirements as it is a JavaScript code for Minecraft bot, while the instruction was about crafting a table in Minecraft game. The syntax of the code is also incorrect and contains multiple

# Display results

## Prepare the output

In [64]:
def name(idx: str):
    if 'rerank:True' in idx and CONFIG['inference_client']['name'] in idx:
        return 'Deepseek Rerank'
    elif 'rerank:False' in idx and CONFIG['inference_client']['name'] in idx:
        return 'Deepseek Basic'
    else:
        raise Exception("")

outputs = []
for file in glob.glob("./output/*.json"):
    output = pd.DataFrame(json.load(open(file, "r")))
    output["settings"] = name(file)
    outputs.append(output)
result = pd.concat(outputs)


result[f"eval_score_{evaluator_name}"] = result[f"eval_score_{evaluator_name}"].apply(lambda x: int(x) if isinstance(x, str) else 1)
result[f"eval_score_{evaluator_name}"] = (result[f"eval_score_{evaluator_name}"] - 1) / 4


average_scores = result.groupby("settings")[f"eval_score_{evaluator_name}"].mean()
average_scores.sort_values()
scaled_values = pd.Series(average_scores * 100)
scaled_values


settings
Deepseek Basic     38.888889
Deepseek Rerank    37.500000
Name: eval_score_gattipg/prometheus:13b-v1.0-Q5_K_M, dtype: float64

## Show bar charts

In [65]:


# Display the bar chart
fig = px.bar(
    scaled_values,
    color=scaled_values,
    labels={
        "value": "Accuracy",
        "settings": "Configuration",
    },
    color_continuous_scale="bluered",
)
fig.update_layout(
    width=1000,
    height=600,
    barmode="group",
    yaxis_range=[0, 100],
    title="<b>Accuracy of different RAG configurations</b>",
    xaxis_title="RAG settings",
    font=dict(size=15),
)
fig.layout.yaxis.ticksuffix = "%"
fig.update_coloraxes(showscale=False)
fig.update_traces(texttemplate="%{y:.1f}", textposition="outside")
fig.show()
