### PLEASE PASTE YOUR OWN API KEY HERE

In [3]:
import os

# Paste a key between the quotes only for a throw-away local run; the preferred
# route is to export OPENAI_API_KEY in the shell that launches Jupyter so the
# secret never ends up in the saved notebook.
_pasted_openai_key = ""
if _pasted_openai_key:
    os.environ["OPENAI_API_KEY"] = _pasted_openai_key
else:
    # Keep a key that is already exported instead of overwriting it with "".
    # The empty placeholder is still set as a fallback so later
    # os.environ["OPENAI_API_KEY"] lookups do not raise KeyError.
    os.environ.setdefault("OPENAI_API_KEY", "")


In [3]:
import ray

# Credentials: hand the OpenAI key from this process to Ray through the
# runtime environment's env_vars mapping.
ray_env_vars = {"OPENAI_API_KEY": os.environ["OPENAI_API_KEY"]}
ray.init(runtime_env={"env_vars": ray_env_vars})

2023-12-05 09:21:20,423	INFO worker.py:1642 -- Started a local Ray instance.


0,1
Python version:,3.11.5
Ray version:,2.7.0


In [1]:
import nest_asyncio

# Patch asyncio so event loops can be nested (nest_asyncio's documented
# purpose); Jupyter already runs its own loop.
nest_asyncio.apply()

### Sentence Window Retrieval (Window size = 1, Top_K = 20, model = mistral)

In [4]:
# Embedding model name; passed as model_name to HuggingFaceEmbeddings below.
BASE_MODEL = 'sentence-transformers/all-mpnet-base-v2'  
# window_size handed to SentenceWindowNodeParser.
WINDOW_SIZE = 1
# similarity_top_k used for both the retriever and the query engine.
TOP_K = 20

# NOTE(review): not referenced by any cell visible in this notebook.
TEST_SUBSAMPLE_N = 10

# NOTE(review): not referenced by any cell visible in this notebook.
EXPERIMENT_NAME = 'sentence_window'

In [5]:
import json
from llama_index.schema import Document

def read_json(filename):
    """Parse the JSON file at ``filename`` and return the decoded object.

    The file is opened explicitly as UTF-8 (the encoding JSON documents are
    expected to use) so that non-ASCII text decodes the same way regardless of
    the platform's default locale encoding.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

def to_doc(entry_dict):
    """Wrap one corpus entry in a ``Document``.

    The entry's ``'text'`` becomes the document text and its ``'source'`` is
    stored in the metadata under the ``'drug_link'`` key.
    """
    text = entry_dict['text']
    metadata = {'drug_link': entry_dict['source']}
    return Document(text=text, metadata=metadata)

def load_corpus(filename):
    """Read the JSON list at ``filename`` and convert each entry with ``to_doc``."""
    return list(map(to_doc, read_json(filename)))

In [6]:
# Build one Document per entry of the evaluation JSON file.
docs = load_corpus('../datasets/eval_qst.json')

In [7]:
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index import ServiceContext, set_global_service_context, VectorStoreIndex
from llama_index.llms import OpenAI
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

# create the sentence window node parser
# Sentence-window node parser: the window is kept under the "window" metadata
# key and the original sentence under "original_text".
window_parser_options = dict(
    window_size=WINDOW_SIZE,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)
node_parser = SentenceWindowNodeParser.from_defaults(**window_parser_options)

# Embeddings come from the HuggingFace model named in BASE_MODEL.
embed_model = HuggingFaceEmbeddings(model_name=BASE_MODEL)
service_context = ServiceContext.from_defaults(
    embed_model=embed_model,
    node_parser=node_parser,
)

# Parse and embed every document into an in-memory vector index.
index = VectorStoreIndex.from_documents(docs, service_context=service_context, show_progress=True)

Parsing documents into nodes:   0%|          | 0/2966 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/17571 [00:00<?, ?it/s]

In [8]:
# Retriever over the index, requesting TOP_K results per query.
retriever = index.as_retriever(similarity_top_k=TOP_K)

In [10]:
# The target key is `window`, matching the window_metadata_key given to the
# node parser.
window_postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")
query_engine = index.as_query_engine(
    similarity_top_k=TOP_K,
    node_postprocessors=[window_postprocessor],
)

#### Evaluating retrieval

In [11]:
import json
import numpy as np

from eval import evaluate_retrieval

2023-12-04 09:32:37.920165: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-04 09:32:37.920249: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-04 09:32:37.921509: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-04 09:32:37.928283: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
from pathlib import Path

# Load the evaluation entries, then pull out the question and expected source
# of each one as two parallel lists.
golden_dataset_path = Path("../datasets/eval_qst.json")
with open(golden_dataset_path, "r") as f:
    golden_dataset = json.loads(f.read())

queries = []
golden_sources = []
for item in golden_dataset:
    queries.append(item['question'])
for item in golden_dataset:
    golden_sources.append(item['source'])

In [37]:
# Run every query through the retriever and compare against the expected
# sources (slow: the recorded run took about 23 minutes for 2966 queries).
results = evaluate_retrieval(retriever, queries, golden_sources)

100%|███████████████████████████████████████████████████████████████████████████████| 2966/2966 [23:04<00:00,  2.14it/s]


In [39]:
# Helpers come from the project-local `eval` module.
# NOTE(review): get_mean_score is imported but not used in the visible cells.
from eval import evaluate_retrieval, get_hit_rate, get_mean_score
hit_rate = get_hit_rate(results)
# Bare expression so the notebook displays the value.
hit_rate

0.9463924477410655

#### Evaluating RAG

In [13]:
# Load the golden question/answer entries and show the first one as a sample.
golden_responses_path = "../datasets/golden-responses.json"
with open(golden_responses_path, "r") as file:
    golden_responses = json.loads(file.read())
golden_responses[0]

{'question': 'What are the side effects of doxycycline?',
 'source': 'https://www.drugs.com/doxycycline.html',
 'response': 'The side effects of doxycycline may include severe stomach pain, diarrhea that is watery or bloody, throat irritation, trouble swallowing, chest pain, irregular heart rhythm, feeling short of breath, little or no urination, low white blood cell counts, severe headaches, ringing in your ears, dizziness, nausea, vision problems, pain behind your eyes, loss of appetite, upper stomach pain, tiredness, nausea or vomiting, fast heart rate, dark urine, and jaundice. Common side effects may include nausea and vomiting, upset stomach, loss of appetite, mild diarrhea, skin rash or itching, darkened skin color, and vaginal itching or discharge.'}

In [15]:
import os
from llama_index.llms import Replicate

# SECURITY: an API token used to be hard-coded on this line. Treat that token
# as leaked (revoke it) and supply a fresh one via the REPLICATE_API_TOKEN
# environment variable before starting Jupyter.
if not os.environ.get("REPLICATE_API_TOKEN"):
    raise RuntimeError(
        "REPLICATE_API_TOKEN is not set; export it in the environment before running this cell."
    )

llm = Replicate(
    model="mistralai/mistral-7b-v0.1:3e8a0fb6d7812ce30701ba597e5080689bef8a013e5c6a724fafb108cc2426a0"
)
# NOTE(review): confirm that the installed llama_index version honours the
# `llm=` keyword of as_query_engine; if it is ignored, the LLM from the index's
# service_context answers the queries instead of Mistral.
query_engine = index.as_query_engine(
    similarity_top_k=TOP_K,
    # the target key is `window`, matching the node parser's window_metadata_key
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
    llm=llm
)

In [16]:
from tqdm import tqdm

# Ask the query engine every golden question, keeping both the full response
# objects and their plain-text answers.
rag_responses = [
    query_engine.query(entry["question"]) for entry in tqdm(golden_responses)
]
rag_response_str = [rag_response.response for rag_response in rag_responses]

100%|███████████████████████████████████████████████████████████████████████████████████| 10/10 [02:47<00:00, 16.71s/it]


#### Using the LlamaIndex Correctness Evaluator on Golden Responses Dataset

In [17]:
from llama_index.evaluation import CorrectnessEvaluator

In [18]:
import os

from llama_index import VectorStoreIndex, ServiceContext
from llama_index.llms.palm import PaLM

# `PaLM` and `palm_api_key` used to be available only if the "Palm test" cells
# at the bottom of the notebook had been run first, so this cell raised
# NameError on a fresh kernel. Import the class here and fall back to the
# PALM_API_KEY environment variable when no key has been assigned yet.
try:
    palm_api_key
except NameError:
    palm_api_key = os.environ.get("PALM_API_KEY", "")

# Judge model for the correctness evaluator, requested with temperature 0.0.
eval_llm = PaLM(api_key=palm_api_key, temperature=0.0)
service_context = ServiceContext.from_defaults(llm=eval_llm)
evaluator = CorrectnessEvaluator(service_context=service_context)

In [None]:
# Score every generated answer against the golden answer for the same question.
eval_results = []
answer_pairs = list(zip(rag_response_str, golden_responses))
for generated_answer, golden_response in tqdm(answer_pairs):
    eval_results.append(
        evaluator.evaluate(
            query=golden_response["question"],
            reference=golden_response["response"],
            response=generated_answer,
        )
    )

In [20]:
[r.score for r in eval_results]

[4.5, 3.5, 3.0, 4.0, 3.5, 3.5, 4.0, 4.0, 3.0, 4.5]

In [21]:
# One flat record per question: the golden answer, the evaluated answer, and
# the evaluator's score and feedback.
scores = []
for scored, golden_entry in zip(eval_results, golden_responses):
    scores.append({
        "question": golden_entry["question"],
        "golden_response": golden_entry["response"],
        "generated_response": scored.response,
        "score": scored.score,
        "reasoning": scored.feedback,
    })

In [22]:
# Save next to the other datasets: the later cells load this file from
# "../datasets/", so writing it to the working directory left them reading a
# missing (or stale) copy.
with open("../datasets/sentence-window-eval-scores-mistral-exp1.json", "w") as file:
    json.dump(scores, file, indent=4)

In [23]:
# Arithmetic mean of the per-question correctness scores.
score_values = [record["score"] for record in scores]
average_scores = sum(score_values) / len(scores)
average_scores

3.75

#### Using LlamaIndex Correctness Evaluator on User Responses Dataset

In [None]:
import json

# Reload the exp1 score records; the next cell reads their
# "generated_response" field.
pred_responses_path = "../datasets/sentence-window-eval-scores-mistral-exp1.json"
with open(pred_responses_path, "r") as file:
    pred_responses = json.loads(file.read())

In [None]:
from tqdm import tqdm

# Re-score the stored generated answers against the golden answers.
eval_results = []
response_pairs = list(zip(pred_responses, golden_responses))
for pred_response, golden_response in tqdm(response_pairs):
    eval_results.append(
        evaluator.evaluate(
            query=golden_response["question"],
            reference=golden_response["response"],
            response=pred_response["generated_response"],
        )
    )

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [02:27<00:00, 14.78s/it]


In [None]:
[r.score for r in eval_results]

[3.5, 4.0, 3.5, 3.0, 3.5, 3.5, 4.5, 3.5, 3.5, 4.5]

In [None]:
# Collect one record per question, save them, and report the mean score.
scores = []
for scored, golden_entry in zip(eval_results, golden_responses):
    scores.append({
        "question": golden_entry["question"],
        "golden_response": golden_entry["response"],
        "generated_response": scored.response,
        "score": scored.score,
        "reasoning": scored.feedback,
    })

with open("mistralsw1vshuman.json", "w") as file:
    file.write(json.dumps(scores, indent=4))

score_values = [record["score"] for record in scores]
average_scores = sum(score_values) / len(scores)
average_scores

3.7

#### Industry Metrics on User Responses Dataset

In [2]:
import json

# Number of leading entries taken from every file (was a bare literal 10).
N_EVAL_RESPONSES = 10

with open("../datasets/human1_responses.json", "r") as file:
    human1 = json.load(file)
with open("../datasets/human2_responses.json", "r") as file:
    human2 = json.load(file)
with open("../datasets/human3_responses.json", "r") as file:
    human3 = json.load(file)
# Stored under a descriptive name instead of `eval`, which shadowed the
# builtin eval() for the rest of the session.
with open("../datasets/sentence-window-eval-scores-mistral-exp1.json", "r") as file:
    sw_eval_scores = json.load(file)

# Explicit indexing is kept so a file with fewer than N_EVAL_RESPONSES entries
# still fails loudly instead of producing misaligned lists.
human1_responses = [human1[i]["response"] for i in range(N_EVAL_RESPONSES)]
human2_responses = [human2[i]["response"] for i in range(N_EVAL_RESPONSES)]
human3_responses = [human3[i]["response"] for i in range(N_EVAL_RESPONSES)]
eval_responses = [sw_eval_scores[i]["generated_response"] for i in range(N_EVAL_RESPONSES)]

# Reference answers keyed by annotator.
references_dict = {
    "human_1": human1_responses,
    "human_2": human2_responses,
    "human_3": human3_responses,
}

In [4]:
# Compare the generated answers with each human reference set via the
# project-local eval helper; the last argument is the label shown as the first
# column header of the printed result tables.
from eval import generate_human_eval_summary
exp1_vs_humans_result = generate_human_eval_summary(references_dict, eval_responses, "Mistral Sentence Window RAG (Exp. 1)")

Calculating ROUGE Score...
Calculating BLEU Score...
Calculating BERT Score...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculating METEOR Score...


[nltk_data] Downloading package wordnet to /home/aditi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/aditi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/aditi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/aditi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/aditi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/aditi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/aditi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/aditi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /

In [5]:
print(exp1_vs_humans_result["rouge"])

  Mistral Sentence Window RAG (Exp. 1)    rouge1    rouge2    rougeL  \
0                              human_1  0.584217  0.406543  0.418048   
1                              human_2  0.581597  0.388147  0.411147   
2                              human_3  0.577495  0.439533  0.459834   

   rougeLsum  
0   0.415083  
1   0.408499  
2   0.458249  


In [6]:
print(exp1_vs_humans_result["bleu"])

  Mistral Sentence Window RAG (Exp. 1)      bleu
0                              human_1  0.317673
1                              human_2  0.317673
2                              human_3  0.317673


In [7]:
print(exp1_vs_humans_result["meteor"])

  Mistral Sentence Window RAG (Exp. 1)    meteor
0                              human_1  0.466855
1                              human_2  0.466855
2                              human_3  0.466855


### Sentence Window Retrieval (Window size = 1, Top_K = 10, model = mistral)

In [24]:
# Embedding model name; passed as model_name to HuggingFaceEmbeddings below.
BASE_MODEL = 'sentence-transformers/all-mpnet-base-v2'  
# window_size handed to SentenceWindowNodeParser.
WINDOW_SIZE = 1
# similarity_top_k for this experiment (the first experiment used 20).
TOP_K = 10

# NOTE(review): not referenced by any cell visible in this notebook.
TEST_SUBSAMPLE_N = 10

# NOTE(review): not referenced by any cell visible in this notebook.
EXPERIMENT_NAME = 'sentence_window'

In [25]:
# Build one Document per entry of the evaluation JSON file.
docs = load_corpus('../datasets/eval_qst.json')

In [26]:
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index import ServiceContext, set_global_service_context, VectorStoreIndex
from llama_index.llms import OpenAI
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

# create the sentence window node parser
# Sentence-window node parser: the window is kept under the "window" metadata
# key and the original sentence under "original_text".
window_parser_options = dict(
    window_size=WINDOW_SIZE,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)
node_parser = SentenceWindowNodeParser.from_defaults(**window_parser_options)

# Embeddings come from the HuggingFace model named in BASE_MODEL.
embed_model = HuggingFaceEmbeddings(model_name=BASE_MODEL)
service_context = ServiceContext.from_defaults(
    embed_model=embed_model,
    node_parser=node_parser,
)

# Parse and embed every document into an in-memory vector index.
index = VectorStoreIndex.from_documents(docs, service_context=service_context, show_progress=True)

Parsing documents into nodes:   0%|          | 0/2966 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/17571 [00:00<?, ?it/s]

In [27]:
# Retriever over the rebuilt index, requesting TOP_K results per query.
retriever = index.as_retriever(similarity_top_k=TOP_K)

In [28]:
# The target key is `window`, matching the window_metadata_key given to the
# node parser.
window_postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")
query_engine = index.as_query_engine(
    similarity_top_k=TOP_K,
    node_postprocessors=[window_postprocessor],
)

In [29]:
import json
import numpy as np

from eval import evaluate_retrieval

In [30]:
from pathlib import Path

# Load the evaluation entries, then pull out the question and expected source
# of each one as two parallel lists.
golden_dataset_path = Path("../datasets/eval_qst.json")
with open(golden_dataset_path, "r") as f:
    golden_dataset = json.loads(f.read())

queries = []
golden_sources = []
for item in golden_dataset:
    queries.append(item['question'])
for item in golden_dataset:
    golden_sources.append(item['source'])

In [None]:
# Run every query through the retriever and compare against the expected
# sources (slow: the recorded run took about 23 minutes for 2966 queries).
results = evaluate_retrieval(retriever, queries, golden_sources)

100%|███████████████████████████████████████████████████████████████████████████████| 2966/2966 [23:20<00:00,  2.12it/s]


In [None]:
# Helpers come from the project-local `eval` module.
# NOTE(review): get_mean_score is imported but not used in the visible cells.
from eval import evaluate_retrieval, get_hit_rate, get_mean_score
hit_rate = get_hit_rate(results)
# Bare expression so the notebook displays the value.
hit_rate

0.9440323668240054

#### Evaluating RAG

In [31]:
# Load the golden question/answer entries and show the first one as a sample.
golden_responses_path = "../datasets/golden-responses.json"
with open(golden_responses_path, "r") as file:
    golden_responses = json.loads(file.read())
golden_responses[0]

{'question': 'What are the side effects of doxycycline?',
 'source': 'https://www.drugs.com/doxycycline.html',
 'response': 'The side effects of doxycycline may include severe stomach pain, diarrhea that is watery or bloody, throat irritation, trouble swallowing, chest pain, irregular heart rhythm, feeling short of breath, little or no urination, low white blood cell counts, severe headaches, ringing in your ears, dizziness, nausea, vision problems, pain behind your eyes, loss of appetite, upper stomach pain, tiredness, nausea or vomiting, fast heart rate, dark urine, and jaundice. Common side effects may include nausea and vomiting, upset stomach, loss of appetite, mild diarrhea, skin rash or itching, darkened skin color, and vaginal itching or discharge.'}

In [32]:
import os
from llama_index.llms import Replicate

# SECURITY: an API token used to be hard-coded on this line. Treat that token
# as leaked (revoke it) and supply a fresh one via the REPLICATE_API_TOKEN
# environment variable before starting Jupyter.
if not os.environ.get("REPLICATE_API_TOKEN"):
    raise RuntimeError(
        "REPLICATE_API_TOKEN is not set; export it in the environment before running this cell."
    )

llm = Replicate(
    model="mistralai/mistral-7b-v0.1:3e8a0fb6d7812ce30701ba597e5080689bef8a013e5c6a724fafb108cc2426a0"
)
# NOTE(review): confirm that the installed llama_index version honours the
# `llm=` keyword of as_query_engine; if it is ignored, the LLM from the index's
# service_context answers the queries instead of Mistral.
query_engine = index.as_query_engine(
    similarity_top_k=TOP_K,
    # the target key is `window`, matching the node parser's window_metadata_key
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
    llm=llm
)

In [None]:
from tqdm import tqdm

# Ask the query engine every golden question, keeping both the full response
# objects and their plain-text answers.
rag_responses = [
    query_engine.query(entry["question"]) for entry in tqdm(golden_responses)
]
rag_response_str = [rag_response.response for rag_response in rag_responses]

#### Using the LlamaIndex Correctness Evaluator on Golden Responses Dataset

In [None]:
# Score every generated answer against the golden answer for the same question.
eval_results = []
answer_pairs = list(zip(rag_response_str, golden_responses))
for generated_answer, golden_response in tqdm(answer_pairs):
    eval_results.append(
        evaluator.evaluate(
            query=golden_response["question"],
            reference=golden_response["response"],
            response=generated_answer,
        )
    )

In [41]:
[r.score for r in eval_results]

[4.5, 4.0, 3.0, 4.0, 3.5, 3.5, 4.0, 4.0, 3.0, 4.5]

In [42]:
# One flat record per question: the golden answer, the evaluated answer, and
# the evaluator's score and feedback.
scores = []
for scored, golden_entry in zip(eval_results, golden_responses):
    scores.append({
        "question": golden_entry["question"],
        "golden_response": golden_entry["response"],
        "generated_response": scored.response,
        "score": scored.score,
        "reasoning": scored.feedback,
    })

In [43]:
# Save next to the other datasets: the later cells load this file from
# "../datasets/", so writing it to the working directory left them reading a
# missing (or stale) copy.
with open("../datasets/sentence-window-eval-scores-mistral-exp2.json", "w") as file:
    json.dump(scores, file, indent=4)

In [44]:
# Arithmetic mean of the per-question correctness scores.
score_values = [record["score"] for record in scores]
average_scores = sum(score_values) / len(scores)
average_scores

3.8

#### Using LlamaIndex Correctness Evaluator on User Responses Dataset

In [None]:
import json

# Reload the exp2 score records; the next cell reads their
# "generated_response" field.
pred_responses_path = "../datasets/sentence-window-eval-scores-mistral-exp2.json"
with open(pred_responses_path, "r") as file:
    pred_responses = json.loads(file.read())

In [None]:
from tqdm import tqdm

# Re-score the stored generated answers against the golden answers.
eval_results = []
response_pairs = list(zip(pred_responses, golden_responses))
for pred_response, golden_response in tqdm(response_pairs):
    eval_results.append(
        evaluator.evaluate(
            query=golden_response["question"],
            reference=golden_response["response"],
            response=pred_response["generated_response"],
        )
    )

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [02:31<00:00, 15.19s/it]


In [None]:
[r.score for r in eval_results]

[3.5, 3.5, 3.0, 3.0, 3.0, 3.5, 4.5, 3.5, 3.0, 4.5]

In [None]:
# Collect one record per question, save them, and report the mean score.
scores = []
for scored, golden_entry in zip(eval_results, golden_responses):
    scores.append({
        "question": golden_entry["question"],
        "golden_response": golden_entry["response"],
        "generated_response": scored.response,
        "score": scored.score,
        "reasoning": scored.feedback,
    })

with open("mistralsw2vshuman.json", "w") as file:
    file.write(json.dumps(scores, indent=4))

score_values = [record["score"] for record in scores]
average_scores = sum(score_values) / len(scores)
average_scores

3.5

#### Industry Metrics on User Responses Dataset

In [8]:
import json

# Number of leading entries taken from every file (was a bare literal 10).
N_EVAL_RESPONSES = 10

with open("../datasets/human1_responses.json", "r") as file:
    human1 = json.load(file)
with open("../datasets/human2_responses.json", "r") as file:
    human2 = json.load(file)
with open("../datasets/human3_responses.json", "r") as file:
    human3 = json.load(file)
# Stored under a descriptive name instead of `eval`, which shadowed the
# builtin eval() for the rest of the session.
with open("../datasets/sentence-window-eval-scores-mistral-exp2.json", "r") as file:
    sw_eval_scores = json.load(file)

# Explicit indexing is kept so a file with fewer than N_EVAL_RESPONSES entries
# still fails loudly instead of producing misaligned lists.
human1_responses = [human1[i]["response"] for i in range(N_EVAL_RESPONSES)]
human2_responses = [human2[i]["response"] for i in range(N_EVAL_RESPONSES)]
human3_responses = [human3[i]["response"] for i in range(N_EVAL_RESPONSES)]
eval_responses = [sw_eval_scores[i]["generated_response"] for i in range(N_EVAL_RESPONSES)]

# Reference answers keyed by annotator.
references_dict = {
    "human_1": human1_responses,
    "human_2": human2_responses,
    "human_3": human3_responses,
}

In [9]:
from eval import generate_human_eval_summary

# The label is shown as the first column header of the result tables. It was
# copy-pasted as "(Exp. 1)", which mislabelled the experiment 2 results.
exp2_vs_humans_result = generate_human_eval_summary(references_dict, eval_responses, "Mistral Sentence Window RAG (Exp. 2)")

Calculating ROUGE Score...
Calculating BLEU Score...
Calculating BERT Score...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculating METEOR Score...


[nltk_data] Downloading package wordnet to /home/aditi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/aditi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/aditi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/aditi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/aditi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/aditi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/aditi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/aditi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /

In [10]:
print(exp2_vs_humans_result["rouge"])

  Mistral Sentence Window RAG (Exp. 1)    rouge1    rouge2    rougeL  \
0                              human_1  0.557320  0.389804  0.404088   
1                              human_2  0.566143  0.374323  0.395713   
2                              human_3  0.558848  0.436611  0.460138   

   rougeLsum  
0   0.403198  
1   0.395083  
2   0.456100  


In [11]:
print(exp2_vs_humans_result["bleu"])

  Mistral Sentence Window RAG (Exp. 1)      bleu
0                              human_1  0.227047
1                              human_2  0.227047
2                              human_3  0.227047


In [12]:
print(exp2_vs_humans_result["meteor"])

  Mistral Sentence Window RAG (Exp. 1)    meteor
0                              human_1  0.473438
1                              human_2  0.473438
2                              human_3  0.473438


### Evaluating Exp 1 vs Exp 2 Using industry standard metrics on the Golden Responses Dataset

In [1]:
import json

# Load the stored score records of every system plus the golden answers.
with open("../datasets/eval-scores-bare-mistral.json", "r") as file:
    bare_llm = json.load(file)
with open("../datasets/eval-scores-rag-mistral.json", "r") as file:
    rag = json.load(file)
with open("../datasets/sentence-window-eval-scores-mistral-exp1.json", "r") as file:
    sw_rag1 = json.load(file)
with open("../datasets/sentence-window-eval-scores-mistral-exp2.json", "r") as file:
    sw_rag2 = json.load(file)
with open("../datasets/golden-responses.json", "r") as file:
    golden = json.load(file)

# Take the first ten answers of each system, index-aligned with the golden ones.
first_ten = range(0, 10)
rag_responses = [rag[i]["generated_response"] for i in first_ten]
sw1_rag_responses = [sw_rag1[i]["generated_response"] for i in first_ten]
sw2_rag_responses = [sw_rag2[i]["generated_response"] for i in first_ten]
bare_responses = [bare_llm[i]["generated_response"] for i in first_ten]
golden_responses = [golden[i]["response"] for i in first_ten]

# System label -> list of generated answers.
predictions_dict = {
    "Bare Mistral LLM": bare_responses,
    "Mistral + RAG": rag_responses,
    "Mistral Sentence Window RAG (Exp. 1)": sw1_rag_responses,
    "Mistral Sentence Window RAG (Exp. 2)": sw2_rag_responses
}
from eval import generate_metrics_summary
result = generate_metrics_summary(golden_responses, predictions_dict)

2023-12-07 09:55:03.108540: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-07 09:55:03.108620: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-07 09:55:03.109951: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-07 09:55:03.116197: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Calculating ROUGE Score...
Calculating BLEU Score...
Calculating BERT Score...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

Calculating METEOR Score...


[nltk_data] Downloading package wordnet to /home/aditi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/aditi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/aditi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/aditi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/aditi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/aditi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/aditi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/aditi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /

In [4]:
# Export the METEOR table to r.csv in the working directory, without the index column.
result["meteor"].to_csv('r.csv', index=False)

In [50]:
print(result["rouge"])

                                 System    rouge1    rouge2    rougeL  \
0                      Bare Mistral LLM  0.253465  0.070145  0.175156   
1                         Mistral + RAG  0.622262  0.502128  0.483196   
2  Mistral Sentence Window RAG (Exp. 1)  0.632888  0.558515  0.577765   
3  Mistral Sentence Window RAG (Exp. 2)  0.575042  0.519628  0.535734   

   rougeLsum  
0   0.196274  
1   0.490470  
2   0.584620  
3   0.531742  


In [51]:
print(result["bleu"])

                                 System      bleu
0                      Bare Mistral LLM  0.039006
1                         Mistral + RAG  0.365213
2  Mistral Sentence Window RAG (Exp. 1)  0.404449
3  Mistral Sentence Window RAG (Exp. 2)  0.246032


In [52]:
print(result["meteor"])

                                 System    meteor
0                      Bare Mistral LLM  0.239231
1                         Mistral + RAG  0.609384
2  Mistral Sentence Window RAG (Exp. 1)  0.704675
3  Mistral Sentence Window RAG (Exp. 2)  0.615892


## Palm test

The correctness evaluator is sometimes finicky. Run the cells below to make sure you have enough API quota and have completed the necessary setup.

### PLEASE PASTE YOUR OWN API KEY HERE

In [3]:
import os

import google.generativeai as palm
from llama_index.llms.palm import PaLM

# Paste a key between the quotes only for a throw-away local run; otherwise
# export PALM_API_KEY before starting Jupyter so the secret is never saved in
# the notebook. A pasted key takes precedence over the environment variable.
_pasted_palm_key = ""
palm_api_key = _pasted_palm_key or os.environ.get("PALM_API_KEY", "")
palm.configure(api_key=palm_api_key)

In [1]:
import nest_asyncio

# Patch asyncio so event loops can be nested (nest_asyncio's documented
# purpose); Jupyter already runs its own loop.
nest_asyncio.apply()

In [4]:
import google.generativeai as palm
from llama_index import VectorStoreIndex, ServiceContext
from llama_index.evaluation import CorrectnessEvaluator
from llama_index.llms.palm import PaLM

# Configure the PaLM client, then build a correctness evaluator judged by a
# temperature-0.0 PaLM model.
palm.configure(api_key=palm_api_key)
eval_llm = PaLM(api_key=palm_api_key, temperature=0.0)
service_context = ServiceContext.from_defaults(llm=eval_llm)
evaluator = CorrectnessEvaluator(service_context=service_context)

In [5]:
from tqdm import tqdm

# Single throw-away evaluation to confirm the evaluator and API key work.
eval_results = []
query, golden_answer, generated_answer = "hi", "What's UP", "hello"
for i in range(1):
    eval_result = evaluator.evaluate(
        query=query,
        reference=golden_answer,
        response=generated_answer,
    )
    eval_results.append(eval_result)

In [6]:
print(eval_results)

[EvaluationResult(query='hi', contexts=None, response='hello', passing=False, feedback='The generated answer is relevant to the user query but it is not fully correct.', score=3.0)]
