# Evaluating the RAG pipeline with RAGAS metrics on the test dataset

In [None]:
import toml, os, sys

# Make the parent directory importable (presumably so the `streamlit_app`
# package imported further down resolves -- confirm against the repo layout).
sys.path.append(os.path.realpath("../"))

# Credentials are read from the Streamlit secrets file instead of being
# written into the notebook.
with open("../.streamlit/secrets.toml") as secrets_file:
    secrets = toml.load(secrets_file)

# Export the keys through environment variables.
os.environ["OPENAI_API_KEY"] = secrets["OPENAI_API_KEY"]

# LangChain/LangSmith settings are only exported when an API key is configured.
# Global tracing stays switched off here.
langchain_api_key = secrets.get("LANGCHAIN_API_KEY")
if langchain_api_key:
    os.environ["LANGCHAIN_API_KEY"] = secrets["LANGCHAIN_API_KEY"]
    os.environ["LANGCHAIN_TRACING_V2"] = "false"
    os.environ["LANGCHAIN_ENDPOINT"] = secrets["LANGCHAIN_ENDPOINT"]

os.environ["PINECONE_API_KEY"] = secrets["PINECONE_API_KEY"]

In [None]:
import nest_asyncio
from langsmith import Client
from langsmith.utils import LangSmithError

# Jupyter already runs an event loop; attach to it instead of starting a new one.
nest_asyncio.apply()

# LangSmith client used by the cells below to share runs and to list the
# runs that belong to a trace.
client = Client()


In [None]:
import pandas as pd
# Evaluation set: one row per question (comma-separated, the pandas default).
df = pd.read_csv("AI4EIC2023_DATASETS.csv")

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from streamlit_app.app_utilities import *
from streamlit_app.LangChainUtils.LLMChains import *
from langchain import callbacks
from langsmith import Client
from langchain_core.tracers.context import tracing_v2_enabled
from langchain.callbacks.tracers import LangChainTracer

def RunQuery(input_question, max_k, sim_score,
             template_dir = "/mnt/d/LLM-Project/EIC-RAG-Project/Templates",
             verbose = True):
    """Run one question through the RAG chain and return its answer and trace info.

    Parameters
    ----------
    input_question : str
        Question passed to the chain's ``invoke``.
    max_k : int
        Value forwarded as ``search_kwargs["k"]`` to the retriever.
    sim_score : str
        Value forwarded as the retriever's ``metric`` (e.g. ``"mmr"``).
    template_dir : str, optional
        Directory handed to ``RunChatBot``. Defaults to the path that used to
        be hard-coded here, so existing callers keep working; pass a different
        location when running on another machine.
    verbose : bool, optional
        When true (default) print the chain output and every collected run.

    Returns
    -------
    tuple
        ``(answer, trace_id, share_link)`` where ``answer`` is
        ``output["answer"]``, ``trace_id`` belongs to the last collected run and
        ``share_link`` is whatever ``client.share_run`` returns for that run.

    Raises
    ------
    RuntimeError
        If no run was collected, in which case there is nothing to share.

    Relies on the notebook-level ``secrets`` and ``client`` objects.
    """
    project_name = "RAG-CHAT-ksuresh"
    run_name = "Evaluation-testings"

    llm = ChatOpenAI(model_name="gpt-3.5-turbo-1106", temperature=0,
                     max_tokens = 4096
                     )
    embeddings = OpenAIEmbeddings()

    # Vector store connection and search settings for the Pinecone index.
    vector_config = {"db_api_key" : secrets["PINECONE_API_KEY"],
                     "index_name" : "llm-project",
                     "embedding_function" : embeddings
                     }
    search_config = {"metric" : sim_score,
                     "search_kwargs" : {"k" : max_k}
                     }
    retriever = GetRetriever("PINECONE", vector_config, search_config)

    # NOTE(review): the chain is traced through this explicit tracer callback
    # and through tracing_v2_enabled below -- confirm both are intended.
    tracer = LangChainTracer(project_name = project_name)
    trace_metadata = {"DBType": "PINECONE",
                      "similarity_score": sim_score,
                      "max_k": max_k
                      }
    RUNCHAIN = RunChatBot(llm, retriever, template_dir
                          ).with_config({"callbacks": [tracer],
                                         "run_name": run_name,
                                         "metadata": trace_metadata}
                                        )

    with tracing_v2_enabled(project_name):
        with callbacks.collect_runs() as run_collector:
            output = RUNCHAIN.invoke(input_question)
            traced_runs = list(run_collector.traced_runs)
    response = output["answer"]

    if verbose:
        print (output)
        print (len(traced_runs))
        for run in traced_runs:
            print (run.name)
            print (run.id)
            print (run.inputs)
            print (run.outputs)
            print (run.trace_id)

    if not traced_runs:
        # Previously an empty run id was passed on to client.share_run.
        raise RuntimeError("No run was collected for this question; cannot build a trace link.")

    # As before, the last collected run supplies the ids.
    last_run = traced_runs[-1]
    return response, last_run.trace_id, client.share_run(last_run.id)

def RunLLM(input_question, GPTMODEL = 3):
    """Ask a bare chat model (no retrieval) a question and return the reply text.

    ``GPTMODEL == 3`` selects ``gpt-3.5-turbo-1106``; any other value selects
    ``gpt-4-0125-preview``. The question is echoed to stdout before the call.
    """
    if GPTMODEL == 3:
        chosen_model = "gpt-3.5-turbo-1106"
    else:
        chosen_model = "gpt-4-0125-preview"

    print (input_question)
    chat_model = ChatOpenAI(model_name=chosen_model, temperature=0, max_tokens = 4096)
    reply = chat_model.invoke(input_question)
    return reply.content

In [None]:
import pickle
from datasets import Dataset

from langchain_openai import OpenAIEmbeddings
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    answer_correctness
)

import ragas
# Embedding model handle with default settings.
embeddings = OpenAIEmbeddings()

# RAGAS metric instances with explicit upper-case names. The analysis cell
# later selects score columns by exactly these names.
FAITHFULNESS = ragas.metrics.Faithfulness(name = "FAITHFULNESS")
CONTEXT_RELEVANCY = ragas.metrics.ContextRelevancy(name = "CONTEXT_RELEVANCY")
CONTEXT_ENTITY_RECALL = ragas.metrics.ContextEntityRecall(name = "CONTEXT_ENTITY_RECALL")
CONTEXT_PRECISION = ragas.metrics.ContextPrecision(name = "CONTEXT_PRECISION")
CONTEXT_RECALL = ragas.metrics.ContextRecall(name = "CONTEXT_RECALL")
ANSWER_RELEVANCY = ragas.metrics.AnswerRelevancy(
    name = "ANSWER_RELEVANCY",
    strictness = 5,
)
# NOTE(review): the two weights presumably balance factual vs. semantic
# agreement -- confirm the order against the ragas documentation.
ANSWER_CORRECTNESS = ragas.metrics.AnswerCorrectness(
    name = "ANSWER_CORRECTNESS",
    weights = [0.90, 0.10],
)

import pandas as pd
from ragas import evaluate

# Retrieval settings under test; they also name the output files.
max_k = 3
sim_score = "mmr"
results_csv = f"results_k_{max_k}_sim_{sim_score}.csv"
dataset_pkl = f"dataset_k_{max_k}_sim_{sim_score}.pkl"
# LangSmith project that is searched for the retriever runs of each trace.
project_name = "RAG-CHAT-ksuresh"

rag_metrics = [
    FAITHFULNESS,
    CONTEXT_RELEVANCY,
    CONTEXT_ENTITY_RECALL,
    CONTEXT_PRECISION,
    CONTEXT_RECALL,
    ANSWER_RELEVANCY,
    ANSWER_CORRECTNESS,
]

df = pd.read_csv("AI4EIC2023_DATASETS.csv", sep = ",")
dataset = {"question": [], "answer": [], "contexts": [], "ground_truth": [],
           "arxiv_id": [], "input_arxiv_id": [], "trace_links": []}

# Drop the scores of a previous run. os.remove is used instead of shelling out
# to `rm`, so this also works where no `rm` binary exists.
if os.path.exists(results_csv):
    os.remove(results_csv)

# Per-question score frames are accumulated in memory and the CSV is rewritten
# after every question. This keeps the on-disk checkpoint, no longer rebinds
# `df` while it is being iterated, and avoids re-reading the CSV (which turns
# text such as "NA" into NaN on each round trip).
result_frames = []
for index, row in df.iterrows():
    question = row["input_question"]
    answer, trace_id, trace_link = RunQuery(question, max_k, sim_score)

    # Recover the retrieved chunks and their arXiv ids from the retriever runs
    # of this trace.
    # NOTE(review): this assumes the runs are already queryable right after
    # the chain returns -- confirm, otherwise `contexts` may come back empty.
    contexts = []
    cite_arxiv_ids = []
    for run in client.list_runs(project_name = project_name, trace_id = trace_id):
        if run.run_type.lower() != "retriever":
            continue
        print (run.name)
        print (run.id)
        print (run.inputs)
        print (run.outputs)
        for document in run.outputs['documents']:
            contexts.append(document["page_content"])
            cite_arxiv_ids.append(document["metadata"]["arxiv_id"].split("/")[-1].strip())
        print (run.trace_id)
        print ("-----")

    # Keep only the text before the first "http://" and trim surrounding newlines.
    clean_answer = answer.split("http://")[0].strip("\n")
    print (clean_answer)

    dataset["question"].append(question)
    dataset["answer"].append(clean_answer)
    dataset["contexts"].append(contexts)
    dataset["ground_truth"].append(row["output_complete_response"])
    dataset["input_arxiv_id"].append(row["input_arxiv_id"])
    dataset["arxiv_id"].append(cite_arxiv_ids)
    dataset["trace_links"].append(trace_link)

    # Checkpoint everything collected so far.
    with open(dataset_pkl, "wb") as f:
        pickle.dump(dataset, f)

    # Score only the row that was just added.
    latest_row = Dataset.from_dict({key: [values[-1]] for key, values in dataset.items()})
    result = evaluate(latest_row, metrics = rag_metrics)

    result_frames.append(result.to_pandas())
    result_df = pd.concat(result_frames, ignore_index = True)
    result_df.to_csv(results_csv, index = False)

In [None]:
import pickle
from datasets import Dataset
from langchain_openai import OpenAIEmbeddings
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    answer_correctness
)


from ragas import evaluate
import pandas as pd

df = pd.read_csv("AI4EIC2023_DATASETS.csv", sep = ",")


def _empty_dataset():
    """Return a fresh column -> list mapping in the layout fed to RAGAS."""
    return {"question": [], "answer": [], "contexts": [], "ground_truth": [],
            "arxiv_id": [], "input_arxiv_id": [], "trace_links": []}


dataset3 = _empty_dataset()
dataset4 = _empty_dataset()

# GPTMODEL flag understood by RunLLM -> (accumulated rows, results file).
baseline_runs = {
    3: (dataset3, "results_gpt3.csv"),
    4: (dataset4, "results_gpt4.csv"),
}
baseline_metrics = [
    FAITHFULNESS,
    CONTEXT_RELEVANCY,
    CONTEXT_ENTITY_RECALL,
    CONTEXT_PRECISION,
    CONTEXT_RECALL,
    ANSWER_RELEVANCY,
    ANSWER_CORRECTNESS,
]
# Created once instead of once per question.
embeddings = OpenAIEmbeddings()

# Drop the scores of a previous run (os.remove instead of shelling out to `rm`).
for _, results_csv in baseline_runs.values():
    if os.path.exists(results_csv):
        os.remove(results_csv)

# Score frames are accumulated in memory per model and each CSV is rewritten
# after every question, so `df` is never rebound while it is being iterated
# and text is not round-tripped through CSV.
baseline_frames = {flag: [] for flag in baseline_runs}

for index, row in df.iterrows():
    question = row["input_question"]
    reference = row["output_complete_response"]

    # Query every model first, then score, as the original cell did.
    answers = {flag: RunLLM(question, GPTMODEL = flag) for flag in baseline_runs}

    for flag, (rows, results_csv) in baseline_runs.items():
        rows["question"].append(question)
        rows["answer"].append(answers[flag])
        # No retrieval happens for the bare models: the reference answer is
        # supplied as the only context.
        rows["contexts"].append([reference])
        rows["ground_truth"].append(reference)
        rows["input_arxiv_id"].append(row["input_arxiv_id"])
        rows["arxiv_id"].append([""])
        rows["trace_links"].append("")

        # Score only the row that was just added.
        latest_row = Dataset.from_dict({key: [values[-1]] for key, values in rows.items()})
        scores = evaluate(latest_row,
                          metrics = baseline_metrics,
                          embeddings = embeddings
                          )
        baseline_frames[flag].append(scores.to_pandas())
        pd.concat(baseline_frames[flag], ignore_index = True).to_csv(results_csv, index = False)
    

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Retrieval settings of the run to analyse; they select which results CSV is read below.
max_k = 3
sim_score = "mmr"
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
def analysis(df_list, labels = ["RAG4EIC"]):
  """Plot, for every score column, overlaid histograms (with KDE) of each data set.

  Parameters
  ----------
  df_list : list of pandas.DataFrame
      One frame per system. The columns of the first frame decide which scores
      are plotted; every frame must contain those columns.
  labels : list of str
      Legend label for each frame, in the same order as ``df_list``. Missing
      labels are filled in as ``"dataset <k>"``. Labels should be distinct,
      since they are used as the hue value.

  One subplot is drawn per score column, stacked vertically, and the figure is
  shown with ``plt.show()``. Nothing is returned.
  """
  sns.set_style("whitegrid")
  score_columns = df_list[0].columns

  # Work on a copy so the default argument is never mutated.
  series_labels = list(labels)[:len(df_list)]
  series_labels += [f"dataset {k}" for k in range(len(series_labels), len(df_list))]

  colors = ["orange", "blue", "green"]
  palette = {label: colors[k % len(colors)] for k, label in enumerate(series_labels)}

  # squeeze=False keeps `axs` two-dimensional even for a single score column.
  fig, axs = plt.subplots(score_columns.size, 1, figsize=(5, 5*score_columns.size), squeeze=False)
  for i, col in enumerate(score_columns):
    ax = axs[i, 0]
    # Long-form data with an explicit label column lets seaborn map colours and
    # build the legend itself, so each legend entry matches its data set.
    long_scores = pd.concat(
      [pd.DataFrame({"Score": frame[col].values, "Source": label})
       for frame, label in zip(df_list, series_labels)],
      ignore_index=True,
    )
    sns.histplot(data=long_scores, x="Score", hue="Source", palette=palette,
                 kde=True, ax=ax, alpha=0.5)
    ax.set_title(f'{col} scores distribution')
    legend = ax.get_legend()
    if legend is not None:
      legend.set_title("")
    ax.set_xlabel("Score")
    ax.set_xlim(0., 1.1)
  plt.tight_layout()
  plt.show()

# Score columns to compare; the names match the upper-case metric names in the results files.
columns = [
    "FAITHFULNESS",
    "CONTEXT_RELEVANCY",
    "CONTEXT_ENTITY_RECALL",
    "CONTEXT_PRECISION",
    "CONTEXT_RECALL",
    "ANSWER_RELEVANCY",
    "ANSWER_CORRECTNESS",
]

# Missing scores are counted as 0. Only the RAG run and the GPT-3.5 baseline
# are plotted here; the GPT-4 results are not loaded in this cell.
result_df = pd.read_csv(f"results_k_{max_k}_sim_{sim_score}.csv", sep = ",")[columns].fillna(0)
gpt3_df = pd.read_csv("results_gpt3.csv", sep = ",")[columns].fillna(0)

analysis([result_df, gpt3_df], ["RAG4EIC", "GPT3"])


In [None]:
# Box plot of every score column, one figure per system.
result_df.plot.box(figsize = (20, 5), title = "RAG4EIC")
gpt3_df.plot.box(figsize = (20, 5), title = "GPT3")

In [None]:
# Summary statistics of the RAG pipeline's scores.
result_df.describe()

In [None]:
# Summary statistics of the GPT-3.5 baseline scores.
gpt3_df.describe()

In [None]:
# `gpt4_df` is not created by any earlier cell, so load the GPT-4 baseline
# scores here (same columns and NaN -> 0 handling as the other score frames).
gpt4_df = pd.read_csv("results_gpt4.csv", sep = ",")[columns].fillna(0)
gpt4_df.describe()

In [None]:
# Reload the GPT-3.5 results keeping only the text columns (this rebinds
# `gpt3_df`, replacing the score-only frame), then print the first
# answer / reference pair.
gpt3_df = pd.read_csv("results_gpt3.csv", sep = ",")[["answer", "ground_truth"]]
first_pair = gpt3_df.loc[0]
print (first_pair["answer"])
print (first_pair["ground_truth"])

In [None]:
# Show the model answers next to the reference answers.
gpt3_df[["answer", "ground_truth"]]

In [None]:
# List the columns currently held in `gpt3_df`.
gpt3_df.columns