In [1]:
!pip install langchain==0.2.16
!pip install langchain-community==0.2.17
!pip install langchain-openai==0.1.25
!pip install ragas==0.1.20

Collecting langchain
  Obtaining dependency information for langchain from https://files.pythonhosted.org/packages/51/3f/462c134228fbb4f65be0a9db6a651e2f1d7226d003a712f1bac455a141b7/langchain-0.3.1-py3-none-any.whl.metadata
  Using cached langchain-0.3.1-py3-none-any.whl.metadata (7.1 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Obtaining dependency information for SQLAlchemy<3,>=1.4 from https://files.pythonhosted.org/packages/1e/69/919673c5101a0c633658d58b11b454b251ca82300941fba801201434755d/SQLAlchemy-2.0.35-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Using cached SQLAlchemy-2.0.35-cp311-cp311-macosx_11_0_arm64.whl.metadata (9.6 kB)
Using cached langchain-0.3.1-py3-none-any.whl (1.0 MB)
Using cached SQLAlchemy-2.0.35-cp311-cp311-macosx_11_0_arm64.whl (2.1 MB)
Installing collected packages: SQLAlchemy, langchain
Successfully installed SQLAlchemy-2.0.35 langchain-0.3.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;4

In [14]:
import os

from dotenv import load_dotenv

# Load settings from a .env file (python-dotenv) before reading them below.
load_dotenv()

# Settings read from the environment; each is None when the variable is unset.
openai_api_key = os.getenv('OPENAI_API_KEY')
data_dir = os.getenv('DATA_DIR')
output_dir = os.getenv('OUTPUT_DIR')

# os.makedirs(None) fails with an opaque TypeError (and '' with
# FileNotFoundError), so report the missing setting by name instead.
if not output_dir:
    raise RuntimeError(
        "OUTPUT_DIR is not set; define it in the environment or in the .env file"
    )

# Make sure the output directory exists; a pre-existing directory is fine.
os.makedirs(output_dir, exist_ok=True)

In [7]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Load every Markdown file found (recursively) under <output_dir>/md as plain
# text. The encoding is pinned to UTF-8: the files contain non-ASCII characters,
# and without an explicit encoding TextLoader relies on the platform default,
# which is not UTF-8 on every system.
loader = DirectoryLoader(
    f"{output_dir}/md",
    glob="**/*.md",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)
documents = loader.load()

# Show only a small preview; the full list makes the cell output unreadable.
documents[:3]

[Document(metadata={'source': 'output/Samsung/md/document_21.md'}, page_content='|----|-----------------------------------------|--------------------|-----------------------|--------------------|-------------------------------------------------------------------------------------------------------------------|------------|\n| NO | Material Issues                         | Impact Materiality | Financial Materiality | Change in Priority | Key Activities                                                                                                    | References |\n| 1  | Information security                    | ●●●●◐              | ●●●●●                 | -                  | Promote global information security standards and strengthen disclosures                                          | 114-121    |\n| 2  | GHG emissions management and reduction  | ●●●●●              | ●●●●◐                 | Up (3→2)           | Carry out eco-friendly capital expenditures at data centers, install 

In [8]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.testset.evolutions import multi_context, reasoning, simple
from ragas.testset.generator import TestsetGenerator

# OpenAI-backed models handed to the test set generator: one LLM in the
# generator role, one in the critic role, plus an embeddings model.
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
critic_llm = ChatOpenAI(model="gpt-4")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=embeddings,
)

# Build a 20-sample test set from the loaded documents; `distributions` maps
# each evolution type to its weight (the weights sum to 1.0).
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=20,
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
)

  from .autonotebook import tqdm as notebook_tqdm
Filename and doc_id are the same for all nodes.                     
Generating: 100%|██████████| 20/20 [02:41<00:00,  8.09s/it]


In [18]:
# Convert the generated test set into a pandas DataFrame and display it as the
# cell output.
testset_df = testset.to_pandas()
testset_df

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,How does Samsung SDS aim to upgrade their DevS...,[\n\nSOCIAL\n\nGOVERNANCE\n\n## **Sustainable ...,Samsung SDS aims to upgrade their DevSecOps le...,simple,[{'source': 'output/Samsung/md/document_257.md'}],True
1,What is one of the factors that can be assesse...,[|-------------------------------------------|...,The work environment is one of the factors tha...,simple,[{'source': 'output/Samsung/md/document_787.md'}],True
2,What does the Certification Support System at ...,[## **Regional Specialist Program**\n\nSamsung...,The Certification Support System at Samsung SD...,simple,[{'source': 'output/Samsung/md/document_248.md'}],True
3,What is the role of the Board of Directors in ...,[Samsung SDS designates a supporting departmen...,The role of the Board of Directors in Samsung ...,simple,[{'source': 'output/Samsung/md/document_299.md'}],True
4,What is the purpose or function of the Steppin...,"[Self-reliance practice 2,600 people\n\nSteppi...",The purpose or function of the Stepping Stone ...,simple,[{'source': 'output/Samsung/md/document_463.md'}],True
5,What is the purpose of the training provided f...,[Samsung SDS designates a supporting departmen...,Samsung SDS designates a supporting department...,simple,[{'source': 'output/Samsung/md/document_482.md'}],True
6,What is the purpose of the RE:CYCLE campaign c...,"[In line with the ESG business trend, many com...",The purpose of the RE:CYCLE campaign conducted...,simple,[{'source': 'output/Samsung/md/document_52.md'}],True
7,What is the process for responding to personal...,[## **Process for Responding to Personal Data ...,In accordance with the Personal Information Pr...,simple,[{'source': 'output/Samsung/md/document_839.md'}],True
8,How many registered patents does Samsung SDS h...,"[As of the end of 2023, Samsung SDS operates o...","Total 4,758 persons (cumulative)",simple,[{'source': 'output/Samsung/md/document_189.md'}],True
9,What standards and guidelines does Samsung SDS...,[Suppliers must comply with all applicable law...,Samsung SDS requires suppliers to comply with ...,simple,[{'source': 'output/Samsung/md/document_674.md'}],True


In [16]:
# Show the DataFrame dimensions as (rows, columns).
testset_df.shape


(20, 6)

In [20]:
# Persist the test set DataFrame as JSON inside the output directory.
testset_path = f"{output_dir}/testset.json"
testset_df.to_json(testset_path)