### Os, Sync, Env

In [1]:
# Setup dependencies for the notebook: env access, .env loading, asyncio patching.
import os
import dotenv
import nest_asyncio

# Read the .env file located one directory above the notebook.
dotenv.load_dotenv(dotenv_path="../.env")

# Apply the nest_asyncio patch.
# NOTE(review): presumably needed so async library code can run inside the
# notebook's already-running event loop - confirm.
nest_asyncio.apply()

### Loading docs from the LlamaIndex persisted doc_store

In [2]:
from llama_index.core.storage.docstore import SimpleDocumentStore

# Directory holding the persisted docstore. Set the DOCSTORE_DIR environment
# variable (for example in the .env file) to run this notebook on another
# machine; when it is unset the original hard-coded location is used.
docstore_dir = os.environ.get(
    "DOCSTORE_DIR",
    "D:/Learning New/GenAI/Project_RAG/RAG/Data_cleaning/doc_store",
)

# Rebuild the docstore from disk and flatten its contents into a list.
docstore = SimpleDocumentStore.from_persist_dir(persist_dir=docstore_dir)
documents = list(docstore.docs.values())

# Fail here, with a readable message, rather than later inside generation.
assert documents, f"No documents found in docstore at {docstore_dir!r}"

### Converting to Langchain Document format

In [3]:
from langchain.schema import Document

# Wrap every loaded document in a LangChain Document, carrying over its text
# and its metadata object unchanged.
langchain_docs = [
    Document(
        page_content=source_doc.text,
        metadata=source_doc.metadata,
    )
    for source_doc in documents
]

### Azure Configuration

In [4]:
import os

# Both values are mandatory: a missing variable raises KeyError right here.
azure_api_key = os.environ["AZURE_OPENAI_API_KEY"]
azure_base_url = os.environ["AZURE_BASE_URL"]

# Single place for the Azure OpenAI settings used by the cells below:
# credentials/endpoint plus deployment and model names for the generator,
# embedding and evaluator models.
azure_configs = dict(
    azure_api_key=azure_api_key,
    base_url=azure_base_url,
    generator_model_deployment="gpt-4o-mini",
    generator_model_name="gpt-4o-mini",
    embedding_deployment="text-embedding-ada-002",
    embedding_name="text-embedding-ada-002",
    evaluator_model_name="gpt-35-turbo-16k",
    evaluator_model_deployment="gpt-35-turbo-16k",
)

### LLMs & Embedding Model

In [5]:
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper

# Chat model for test set generation, wrapped so ragas can drive it.
# NOTE(review): no API key is passed explicitly; presumably the client reads
# AZURE_OPENAI_API_KEY from the environment - confirm.
generator_llm = LangchainLLMWrapper(
    AzureChatOpenAI(
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["generator_model_deployment"],
        model=azure_configs["generator_model_name"],
        openai_api_version="2023-05-15",
        validate_base_url=False,
    )
)

# Embedding model for test set generation, wrapped the same way.
generator_embeddings = LangchainEmbeddingsWrapper(
    AzureOpenAIEmbeddings(
        azure_endpoint=azure_configs["base_url"],
        azure_deployment=azure_configs["embedding_deployment"],
        model=azure_configs["embedding_name"],
        openai_api_version="2023-05-15",
    )
)

  from .autonotebook import tqdm as notebook_tqdm


### Testset generation

In [6]:
from ragas.testset import TestsetGenerator

# Build the generator from the wrapped chat and embedding models.
generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings,
)

# Produce a 10-sample test set from the converted LangChain documents.
dataset = generator.generate_with_langchain_docs(
    langchain_docs,
    testset_size=10,
)

Generating personas: 100%|██████████| 3/3 [00:01<00:00,  1.98it/s]                                             
Generating Scenarios: 100%|██████████| 2/2 [02:00<00:00, 60.40s/it] 
Generating Samples: 100%|██████████| 10/10 [01:03<00:00,  6.30s/it]


### Convert to evaluation dataset

In [9]:
# Convert the generated test set to an evaluation dataset, then to a pandas
# DataFrame with the reference_contexts column removed. Built in one chain so
# `df` is assigned exactly once.
dataset_eval = dataset.to_evaluation_dataset()
df = dataset_eval.to_pandas().drop(columns=["reference_contexts"])

# Display via the cell's last expression: the notebook renders a rich table,
# whereas print(df) wraps the columns across lines as plain text.
df

                                          user_input  \
0                  What products have 0.1mg in them?   
1  What is the range of the temperature setting f...   
2  Can you elaborate on the specifications and fe...   
3                 Wht is the pan size of the device?   
4  What is the maximum temperature for operating ...   
5  What are the operating temperature specificati...   
6  Wht are the power supply requirements for the ...   
7  What is the difference in repeatability betwee...   
8  What are the readability specifications for th...   
9  What are the differences in readability and re...   

                                           reference  
0  The products that have 0.1mg in them are the M...  
1  The temperature setting for the device is 40°C...  
2  The analytical device classified under HSN: 90...  
3                The pan size of the device is 90mm.  
4  The maximum operating temperature for the devi...  
5  Both analytical devices mentioned in the conte... 

### Save

In [11]:
# Destination of the exported test set. Set the TESTSET_CSV_PATH environment
# variable (for example in the .env file) to write somewhere else; when it is
# unset the original hard-coded location is used.
testset_csv_path = os.environ.get(
    "TESTSET_CSV_PATH",
    "D:/Learning New/GenAI/Project_RAG/RAG/1. Data/evaluator2_10.csv",
)

# Write the frame without the index column.
df.to_csv(testset_csv_path, index=False)