In [1]:
from datasets import Dataset
import os
from ragas import evaluate
from ragas.metrics import faithfulness,context_precision,answer_relevancy,context_recall,context_utilization, answer_correctness
from langchain_community.chat_models import ChatAnyscale
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from utils.prompt_template_utils import get_prompt_template
import numpy as np


ANYSCALE_API_KEY = ""
os.environ["ANYSCALE_API_BASE"] = "https://api.endpoints.anyscale.com/v1"
os.environ["ANYSCALE_API_KEY"] = ANYSCALE_API_KEY 
OPENAI_API_KEY = ""
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

  from .autonotebook import tqdm as notebook_tqdm


Create LLama2 chatbot for evaluation, GPT model for ground_truth generation

In [2]:
final_embeddings = np.load(f"instructor_embeddings_large.npy", allow_pickle=True)
vectorstore = final_embeddings.item()

template = """You are a software engineer answering to a senior software engineer who is testing your understaing of the code provided, you will use the provided knowledge to answer questions about the code. Think step by step and respond appropriately. If you can not answer a question based on 
the provided context, inform the user. Do not use any other prior information for answering. Give fully explained answers using the terms used in the code itself. A bulleted format is preferred but not necessary. Do not narrate the conversation.
"""


# ANYSCALE_MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
ANYSCALE_MODEL_NAME = "codellama/CodeLlama-13b-Instruct-hf"
LLM = ChatAnyscale(model_name = ANYSCALE_MODEL_NAME)
LLM2 = ChatOpenAI(model_name="gpt-3.5-turbo")
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":3})

prompt, memory = get_prompt_template(system_prompt=template, promptTemplate_type="llama", history=False)

qa_llama = RetrievalQA.from_chain_type(
    llm=LLM,
    chain_type="stuff",  # try other chains types as well. refine, map_reduce, map_rerank
    retriever=retriever,
    return_source_documents=True, # verbose=True,
    chain_type_kwargs={"prompt": prompt, "memory": memory},
    )

qa_gpt = RetrievalQA.from_chain_type(
    llm=LLM2,
    chain_type="stuff",  # try other chains types as well. refine, map_reduce, map_rerank
    retriever=retriever,
    return_source_documents=True, # verbose=True,
    chain_type_kwargs={"prompt": prompt, "memory": memory},
    )

ValidationError: 1 validation error for ChatAnyscale
__root__
  Model name codellama/CodeLlama-13b-Instruct-hf not found in available models: {'mlabonne/NeuralHermes-2.5-Mistral-7B', 'codellama/CodeLlama-70b-Instruct-hf', 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'google/gemma-7b-it', 'meta-llama/Llama-2-7b-chat-hf', 'meta-llama/Llama-2-13b-chat-hf', 'meta-llama/Llama-2-70b-chat-hf', 'meta-llama/Llama-2-7b-chat-hf:algoanalytics:nS5JS3G', 'BAAI/bge-large-en-v1.5', 'thenlper/gte-large', 'mistralai/Mistral-7B-Instruct-v0.1'}. (type=value_error)

Creating the dataset for RAGAS implementation

In [None]:
questions = []
contexts = []
file = open("model\sample_questions.txt", 'r')
for line in file:
    questions.append(line.strip())
file.close()

for ques in questions:
    contexts.append([docs.page_content for docs in retriever.get_relevant_documents(ques)])

To regenerate ground truths using GPT 3.5

In [None]:
ground_truths = ["- Bubble sort is a simple sorting algorithm that repeatedly steps through the list to be sorted, compares each pair of adjacent items and swaps them if they are in the wrong order.\n- In this case, the function bubble_sort() is being called with the argument 'elements' which is presumably a list of elements to be sorted.\n- The function also takes an optional argument 'key' which specifies the key to use for sorting the elements. In this case, the key is 'transaction_amount'.\n- The sorted list of elements will be printed out after the sorting process is completed.", '- The @time_it wrapper is a decorator function that calculates the time taken for a function to execute.\n- It takes a function as input and returns a wrapper function.\n- Inside the wrapper function, it records the start time before calling the original function, then records the end time after the function has executed.\n- It calculates the time taken for the function to execute by subtracting the start time from the end time and multiplying by 1000 to get the result in milliseconds.\n- Finally, it prints out the name of the function and the time taken in milliseconds.\n- The wrapper function then returns the result of the original function.', '- The `partition()` function takes in three parameters: `elements`, `start`, and `end`.\n- It initializes the `pivot` variable to be the last element in the `elements` list.\n- It also initializes the `p_index` variable to the value of `start`.\n- It then iterates through the elements of the list from index `start` to `end - 1`.\n- For each element, if the element is less than or equal to the `pivot`, it calls the `swap()` function passing the current index `i`, the `p_index`, and the `elements` list as arguments. This is done to move the smaller elements to the left side of the pivot.\n- After the loop, it swaps the `p_index` element with the `end` element to place the pivot in its correct sorted position.\n- Finally, it returns the `p_index`, which represents the index where the pivot element is placed in the sorted list.', '- BFS stands for Breadth First Search, while DFS stands for Depth First Search.\n- In BFS, the nodes are visited level by level starting from the root node, while in DFS, the nodes are visited depth-wise until reaching the leaf nodes.\n- BFS uses a queue data structure to keep track of the nodes to be visited, while DFS uses a stack or recursion to keep track of nodes.\n- BFS is better suited for finding the shortest path between two nodes in an unweighted graph, while DFS is more appropriate for topological sorting, cycle detection, and traversal of connected components.', '- The `fib()` function calculates and returns the nth Fibonacci number.\n- The Fibonacci sequence starts with 0 and 1, and each subsequent number is the sum of the two preceding numbers.\n- So, for example, if you call `fib(10)`, it will return the 10th Fibonacci number in the sequence.', '- The code defines a list of lists called "tests" containing various integer elements lists.\n- It then iterates over each list in the "tests" list using a for loop.\n- Within the loop, it calls the function "quick_sort" with the current list, the starting index 0, and the ending index which is the length of the list minus 1.\n- The "quick_sort" function sorts the elements in the list using the Quick Sort algorithm.\n- After sorting, it prints out the sorted array for each list in the "tests" list.\n\nOverall, the code sorts each list of integers in the "tests" list using the Quick Sort algorithm and prints out the sorted arrays.', '- The variable "pivot" in the function partition() stores the last element in the list "elements" which is used as the pivot for the partitioning process.\n- It is compared to other elements in the list to determine whether they should be placed before or after the pivot element.\n- The elements smaller than or equal to the pivot are placed before it, and the elements greater than the pivot are placed after it.', '- The `binary_search()` function is an iterative implementation of the binary search algorithm, while the `binary_search_recursive()` function is a recursive implementation of the binary search algorithm.\n- In the iterative `binary_search()` function, the algorithm uses a while loop to repeatedly divide the list into smaller parts and compare the middle element with the target value until the target value is found or the list is exhausted.\n- In the recursive `binary_search_recursive()` function, the algorithm calls itself with updated parameters for the left and right indices until the target value is found or the base case is reached.\n- Both functions follow the same binary search algorithm logic, but the difference lies in their implementation approach – iterative vs. recursive.', '- The code defines a function called merge_sort that takes an array as input.\n- Inside the merge_sort function, it checks if the length of the array is less than or equal to 1, in which case it returns the array as is.\n- If the length of the array is greater than 1, it calculates the middle index of the array and splits the array into two halves.\n- It then recursively calls the merge_sort function on the two halves of the array.\n- After the recursive calls, it returns the result of merging the two sorted halves using the merge_two_sorted_lists function.\n\nExample:\n- Given array: [10, 3, 15, 7, 8, 23, 98, 29]\n- First, the array is split into [10, 3, 15, 7] and [8, 23, 98, 29].\n- Then, the two halves are further split into [10, 3] and [15, 7] and [8, 23] and [98, 29].\n- This process continues until each subarray has only one element.\n- The merging process starts by merging [10] and [3] to get [3, 10].\n- Then [15] and [7] are merged to get [7, 15].\n- The final merge step combines [3, 10] and [7, 15] to get [3, 7, 10, 15].\n- This process is repeated for the second half of the array.\n- Finally, the two sorted halves [3, 7, 10, 15] and [8, 23, 29, 98] are merged to get the fully sorted array [3, 7, 8, 10, 15, 23, 29, 98].\n- The sorted array is then printed.', "- The shell_sort() function can be optimized by changing the gap sequence used for sorting.\n- One way to optimize it is to use the Knuth's sequence for determining the gap values.\n- Knuth's sequence is defined as h = (3 * h) + 1 where h is the starting value.\n- By using Knuth's sequence, the shell_sort() function can achieve better performance by reducing the number of comparisons and swaps required."]

To regenerate answers from LLAMA2

In [None]:
answers = []
for ques in questions:
    answers.append(qa_llama.invoke(ques)['result'])

Compiling dataset into dictionary for RAGAS

In [None]:
data = {"question": questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": ground_truths}

dataset = Dataset.from_dict(data)

ragas_results = evaluate(
    dataset = dataset,
    metrics=[
        context_utilization,
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
        answer_correctness
    ]
)

df = ragas_results.to_pandas()

In [None]:
print(df)
df.to_csv("ragas_evauation_codellama_bgelarge.csv")