In [None]:
from dotenv import load_dotenv
import os

# Pull settings from a local .env file into the process environment.
load_dotenv()

# Access the variables
openai_api_key = os.getenv('OPENAI_API_KEY')
data_dir = os.getenv('DATA_DIR')

# Never echo the secret itself: cell outputs are stored inside the notebook
# file and are easily committed or shared. Report only whether it is present.
print(f"OpenAI API Key: {'set' if openai_api_key else 'MISSING'}")
print(f"Data Directory: {data_dir}")


In [None]:
from langchain_community.document_loaders import TextLoader

# Read one markdown report from disk as LangChain documents.
loader = TextLoader(
    file_path="data/2q24-cfsu.md",
)
docs = loader.load()

In [None]:
from langchain_community.document_loaders import DirectoryLoader


# Recursively collect every .md file under the chunked-markdown folder,
# reading each one with TextLoader.
chunked_loader = DirectoryLoader(
    path="data/chunked-markdown",
    glob="**/*.md",
    loader_cls=TextLoader,
)
chunked_docs = chunked_loader.load()

In [None]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# data_dir is read from the DATA_DIR environment variable and is None when
# that variable is unset; stop here with a clear message instead of letting
# the loader fail on a missing path.
if not data_dir:
    raise ValueError("DATA_DIR is not set; define it in the environment or the .env file")

# Load every markdown file found (recursively) under data_dir.
loader = DirectoryLoader(data_dir, loader_cls=TextLoader, glob="**/*.md")
documents = loader.load()

# Display how many documents were loaded rather than dumping their full text.
len(documents)

In [None]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# OpenAI-backed models handed to the test-set generator: two chat models
# (passed as generator and critic) and an embedding model.
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
critic_llm = ChatOpenAI(model="gpt-4")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=embeddings,
)

# Generate the test set from the loaded documents: test_size=20, with
# evolution weights of 0.5 simple, 0.25 reasoning and 0.25 multi_context.
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=20,
    distributions={
        simple: 0.5,
        reasoning: 0.25,
        multi_context: 0.25,
    },
)

In [None]:
# Inspect the class of the object returned by the generator.
type(testset)

In [None]:
# Export the generated test set as JSON Lines (one record per line).
# The export goes through to_pandas() because the generator's result object
# is not expected to provide a to_json() method of its own
# (NOTE(review): confirm against the installed ragas version).
testset.to_pandas().to_json("testset.jsonl", orient="records", lines=True)

In [None]:
# Convert the generated test set to a pandas DataFrame for inspection and export.
testset_df = testset.to_pandas()

In [None]:
# List the column names of the test-set DataFrame.
testset_df.columns


In [None]:
# A ".jsonl" file must hold one JSON record per line. pandas' default for a
# DataFrame is a single column-oriented JSON object, so request JSON Lines
# explicitly.
testset_df.to_json("testset.jsonl", orient="records", lines=True)

In [None]:
from datasets import Dataset

# Wrap the DataFrame in a Hugging Face Dataset and write it to testset.jsonl.
Dataset.from_pandas(
    df=testset_df,
).to_json(
    path_or_buf="testset.jsonl",
)

In [None]:
from dotenv import load_dotenv
from datasets import Dataset
import os
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness

# Path of the evaluation data; defaults to testset.jsonl and can be
# overridden with the TESTSET_JSON environment variable.
testset_json = os.getenv("TESTSET_JSON", "testset.jsonl")

# Load the JSON Lines file as a Hugging Face Dataset.
testset = Dataset.from_json(testset_json)

# Both metrics score model answers, so the dataset must already contain the
# pipeline's answers next to the generated questions. Check up front so a
# file holding only the generated test set fails with an actionable message.
# NOTE(review): column names follow ragas 0.1 conventions; confirm against
# the installed version.
required_columns = {"question", "answer", "contexts", "ground_truth"}
missing_columns = required_columns - set(testset.column_names)
if missing_columns:
    raise ValueError(
        f"{testset_json} is missing column(s) {sorted(missing_columns)} required by "
        "faithfulness/answer_correctness; add the RAG pipeline's answers before evaluating"
    )

score = evaluate(testset, metrics=[faithfulness, answer_correctness])

# Persist the per-sample scores as CSV, with a header row and no index column.
score.to_pandas().to_csv("score.csv", index=False, header=True)