In [1]:
!pip install graphrag --quiet


Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import pandas as pd
import tiktoken
from graphrag.query.indexer_adapters import read_indexer_entities, read_indexer_reports
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.global_search.community_context import (
    GlobalCommunityContext,
)
from graphrag.query.structured_search.global_search.search import GlobalSearch
import subprocess
import util
from openai import OpenAI
import re
import os

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore



In [2]:
# Concatenate every .txt file from all source folders into one GraphRAG input file.
# Texts are collected across ALL folders and written once at the end; writing inside
# the per-folder loop would truncate the file each time and keep only the last folder.
source_folders = ["requirements/FR", "requirements/NFR", "article_chunks"]
destination_folder = 'input'
output_file = 'articles.txt'
output_file_path = os.path.join(destination_folder, output_file)

all_text = []
for f in source_folders:
    source_folder = f'documents/{f}'
    # Sorted so the combined file is identical from run to run (os.listdir order is arbitrary).
    for filename in sorted(os.listdir(source_folder)):
        if filename.endswith('.txt'):
            file_path = os.path.join(source_folder, filename)

            with open(file_path, 'r', encoding='utf-8') as file:
                all_text.append(file.read())

combined_text = '\n'.join(all_text)

# Make sure the destination exists (e.g. on a fresh checkout) before writing.
os.makedirs(destination_folder, exist_ok=True)
# A separate handle name keeps `output_file` bound to the file name, not a closed file object.
with open(output_file_path, 'w', encoding='utf-8') as combined_file:
    combined_file.write(combined_text)



In [4]:
# Strip every non-ASCII character from a text file.
def clean_text_file(input_file, output_file):
    """Copy ``input_file`` to ``output_file`` keeping only ASCII characters.

    The input is decoded as UTF-8 with undecodable bytes ignored, then every
    character whose code point is 128 or above is dropped. The input is read
    completely before the output is opened, so both arguments may name the
    same path.
    """
    with open(input_file, 'r', encoding='utf-8', errors='ignore') as src:
        raw_text = src.read()

    # Encoding to ASCII with errors='ignore' discards exactly the characters with ord(c) >= 128.
    ascii_text = raw_text.encode('ascii', errors='ignore').decode('ascii')

    with open(output_file, 'w', encoding='utf-8') as dst:
        dst.write(ascii_text)
# Sanitize the combined corpus in place: the same path is both source and destination.
input_file = "input/articles.txt"
clean_text_file(input_file=input_file, output_file=input_file)


In [13]:
# Generate a Graph RAG index from the provided text file.
# Alternatively, run "python -m graphrag.index --root ." in a terminal.
# WARNING: Running this cell may incur significant costs depending on the size of your content and the model used.
import sys

# Use the kernel's own interpreter so the graphrag installed for this notebook is the one
# that runs; a bare "python" is resolved via PATH and may point at another environment.
command = [sys.executable, '-m', 'graphrag.index', '--root', '.']
result = subprocess.run(command, capture_output=True, text=True)

if result.returncode == 0:
    print("Command executed successfully.")
else:
    print("Command failed with return code", result.returncode)
    # Both streams are captured above, so show both rather than dropping stdout.
    print(result.stdout)
    print(result.stderr)


Command executed successfully.


In [2]:
# Load the indexer artifacts (files were moved here from output/****-****).
INPUT_DIR = "output/graphs/GPT-UP/artifacts"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"
COMMUNITY_LEVEL = 2


def _read_artifact(table_name):
    """Read the parquet table called ``table_name`` from INPUT_DIR."""
    return pd.read_parquet(f"{INPUT_DIR}/{table_name}.parquet")


# Raw indexer tables.
report_df = _read_artifact("create_final_community_reports")
entity_df = _read_artifact("create_final_nodes")
entity_embedding_df = _read_artifact("create_final_entities")
relationship_df = _read_artifact("create_final_relationships")
text_unit_df = _read_artifact("create_final_text_units")

# Objects built from the tables; these are what the search context builders receive.
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)
text_units = read_indexer_text_units(text_unit_df)
relationships = read_indexer_relationships(relationship_df)

# Connect the LanceDB-backed vector store, then hand it the entities.
description_embedding_store = LanceDBVectorStore(collection_name="entity_description_embeddings")
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities,
    vectorstore=description_embedding_store,
)


In [16]:
# Rich display instead of print(): the frame renders as a table rather than wrapped plain text.
report_df

   community                                       full_content  level  rank  \
0          0  # ISO/IEC 27001:2013(E) and Information Securi...      0   8.5   
1          1  # ISO/IEC 27001:2013 Information Security Mana...      0   8.5   
2         10  # Security in Development and Support Processe...      0   8.5   
3          2  # Information Security Management Systems and ...      0   8.5   
4          3  # ISO/IEC 27001:2013 and Related Standards Com...      0   8.5   
5          4  # Cryptographic Controls and ISO/IEC 27001:201...      0   7.5   
6          5  # ISO/IEC 27001:2013 and Information Security ...      0   8.5   
7          6  # Human Resource Security in ISO/IEC 27001:201...      0   7.5   
8          7  # Information Security Incident Management Com...      0   8.5   
9          8  # Information Security Management System and I...      0   8.5   
10         9  # ISO/IEC 2013 and Information Security Object...      0   8.5   

                                       

In [3]:
# Global search setup: LLM client, community-report context builder and the search engine.
# The API key comes from the environment so it never appears in the notebook.
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = "gpt-4o-2024-08-06"

llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,
    max_retries=20,
)
# NOTE(review): cl100k_base may not be the tokenizer of the gpt-4o model above, in which
# case token budgets are approximate — confirm against tiktoken.encoding_for_model.
token_encoder = tiktoken.get_encoding("cl100k_base")

# Use the COMMUNITY_LEVEL set when the artifacts were loaded rather than repeating the
# literal 2, so reports and entities are always read at the same level.
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)
context_builder = GlobalCommunityContext(
    community_reports=reports,
    entities=entities,
    token_encoder=token_encoder,
)

context_builder_params = {
    "use_community_summary": False,  # False means using full community reports. True means using community short summaries.
    "shuffle_data": True,
    "include_community_rank": True,
    "min_community_rank": 0,
    "community_rank_name": "rank",
    "include_community_weight": True,
    "community_weight_name": "occurrence weight",
    "normalize_community_weight": True,
    "max_tokens": 5000,
    "context_name": "Reports",
}

# Parameters for the map-stage LLM calls; a JSON object response is requested.
map_llm_params = {
    "max_tokens": 1000,
    "temperature": 0.0,
    "response_format": {"type": "json_object"},
}

# Parameters for the reduce-stage LLM call.
reduce_llm_params = {
    "max_tokens": 1000,
    "temperature": 0.0,
}

# NOTE: the local-search setup cell later rebinds `search_engine` (and `llm`,
# `context_builder`), so re-run this cell before issuing global queries again.
search_engine = GlobalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    max_data_tokens=5000,
    map_llm_params=map_llm_params,
    reduce_llm_params=reduce_llm_params,
    allow_general_knowledge=False,  # set this to True will add instruction to encourage the LLM to incorporate general knowledge in the response, which may increase hallucinations, but could be useful in some use cases.
    json_mode=True,  # set this to False if your LLM model does not support JSON mode.
    context_builder_params=context_builder_params,
    concurrent_coroutines=32,
    response_type="multiple paragraphs",
)

# Global Query

In [8]:
folder_path = 'documents/requirements/FR'
output_folder_path = 'output/search_results/GPT-OP'

for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r') as file:
            req_content = file.read()
            result = await search_engine.asearch(f"Identify content that are relevant to ensuring compliance or completeness of the following requirement: {req_content}")
    
            output_file_path = os.path.join(output_folder_path, filename)
            with open(output_file_path, 'w') as output_file:
                output_file.write(result.response)
                print(f"Written content to {output_file_path}")



Written content to output/search_results/GPT-OP\req_1.txt
Written content to output/search_results/GPT-OP\req_10.txt
Written content to output/search_results/GPT-OP\req_11.txt
Written content to output/search_results/GPT-OP\req_12.txt
Written content to output/search_results/GPT-OP\req_13.txt
Written content to output/search_results/GPT-OP\req_14.txt
Written content to output/search_results/GPT-OP\req_15.txt
Written content to output/search_results/GPT-OP\req_19.txt
Written content to output/search_results/GPT-OP\req_2.txt
Written content to output/search_results/GPT-OP\req_20.txt
Written content to output/search_results/GPT-OP\req_21.txt
Written content to output/search_results/GPT-OP\req_23.txt
Written content to output/search_results/GPT-OP\req_24.txt
Written content to output/search_results/GPT-OP\req_26.txt
Written content to output/search_results/GPT-OP\req_27.txt
Written content to output/search_results/GPT-OP\req_28.txt
Written content to output/search_results/GPT-OP\req_29.txt

In [19]:
# Optional: enrich the most recent search answer with extra context.
# This reads result.context_data['reports'] and result.map_responses. The local-search loop
# rebinds `result`, so run this right after the global query loop.
# NOTE(review): presumably only global-search results carry map_responses — confirm.
reports_df = pd.DataFrame(result.context_data['reports'])
context_df = reports_df.sort_values(by='occurrence weight', ascending=False)
result_str = result.response

# .iloc[0] takes the first row of the sorted frame, i.e. the highest-weighted report.
# A plain [0] is a label lookup and returns the row labelled 0 wherever sorting put it.
content_context = context_df["content"].iloc[0] + "\n" + result_str

# Flatten the map responses to text and extract (answer, score) pairs from it.
sample_string = "".join(str(resp) for resp in result.map_responses)

pattern = re.compile(r"'answer':\s*'(.*?)',\s*'score':\s*(\d+)")
matches = pattern.findall(sample_string)

df = pd.DataFrame(matches, columns=['Answer', 'Score'])
df['Score'] = pd.to_numeric(df['Score'])
sorted_df = df.sort_values(by='Score', ascending=False)

# Filter the sorted frame so the appended answers are listed highest score first.
# NOTE(review): the regex only captures whole-number scores, so a 0.8 cut-off keeps every
# answer scoring 1 or more — confirm whether a higher threshold (e.g. 80) was intended.
filtered_df = sorted_df[sorted_df['Score'] >= 0.8]
high_score_answer = '\n'.join(filtered_df['Answer'])
result_str = result_str + "\n\n" + high_score_answer
result_str


Unnamed: 0,id,title,occurrence weight,content,rank
25,39,ISO/IEC 27001:2013 Information Security Community,1.0,# ISO/IEC 27001:2013 Information Security Comm...,8.5
26,23,Information Security Management Community,0.352941,# Information Security Management Community\n\...,8.0
8,30,Documented Information and Information Securit...,0.205882,# Documented Information and Information Secur...,7.5
17,31,ISO/IEC 2013 Information Security Management C...,0.176471,# ISO/IEC 2013 Information Security Management...,7.5
9,21,Broker Back-Office Application Standards and C...,0.176471,# Broker Back-Office Application Standards and...,8.5
18,25,ISO Standards for Risk Management and Informat...,0.147059,# ISO Standards for Risk Management and Inform...,7.5
33,1,Information Security Risk Management Community,0.117647,# Information Security Risk Management Communi...,8.0
19,22,Regulatory Compliance and Reporting Standards,0.117647,# Regulatory Compliance and Reporting Standard...,8.5
10,26,Broker Back Office System Compliance and Funct...,0.088235,# Broker Back Office System Compliance and Fun...,7.5
27,36,ISO/IEC Standards for Information Security Man...,0.088235,# ISO/IEC Standards for Information Security M...,7.5


# Local Query

In [9]:
# Local search setup: LLM client, embedder, mixed-context builder and the search engine.
# This cell rebinds llm, token_encoder, context_builder and search_engine.
embedding_model = "text-embedding-3-small"

token_encoder = tiktoken.get_encoding("cl100k_base")

llm = ChatOpenAI(
    model=llm_model,
    api_key=api_key,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

text_embedder = OpenAIEmbedding(
    model=embedding_model,
    deployment_name=embedding_model,
    api_key=api_key,
    api_base=None,
    api_type=OpenaiApiType.OpenAI,
    max_retries=20,
)

context_builder = LocalSearchMixedContext(
    entities=entities,
    relationships=relationships,
    text_units=text_units,
    community_reports=reports,
    # covariates=covariates,  # enable only if covariates were produced during indexing
    entity_text_embeddings=description_embedding_store,
    # Use EntityVectorStoreKey.TITLE instead if the vector store is keyed by entity title.
    embedding_vectorstore_key=EntityVectorStoreKey.ID,
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

local_context_params = dict(
    text_unit_prop=0.5,
    community_prop=0.1,
    conversation_history_max_turns=5,
    conversation_history_user_turns_only=True,
    top_k_mapped_entities=10,
    top_k_relationships=10,
    include_entity_rank=True,
    include_relationship_weight=True,
    include_community_rank=False,
    return_candidate_context=False,
    # Use EntityVectorStoreKey.TITLE instead if the vector store is keyed by entity title.
    embedding_vectorstore_key=EntityVectorStoreKey.ID,
    # Context budget; lower it for models with a small limit (about 5000 for an 8k model).
    max_tokens=12_000,
)

llm_params = dict(
    # Response budget; lower it for models with a small limit (about 1000-1500 for an 8k model).
    max_tokens=2_000,
    temperature=0.0,
)

search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    # Free-form description of the desired answer format, e.g. prioritized list,
    # single paragraph, multiple paragraphs, multiple-page report.
    response_type="multiple paragraphs",
)


In [13]:
folder_path = 'documents/requirements/FR'
output_folder_path = 'output/search_results/GPT-OP'

for filename in os.listdir(folder_path):
    if filename.endswith('.txt'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r') as file:
            req_content = file.read()
            result = await search_engine.asearch(f"Identify content that are relevant to ensuring compliance or completeness of the following requirement: {req_content}")
    
            output_file_path = os.path.join(output_folder_path, filename)
            with open(output_file_path, 'a') as output_file:
                output_file.write(result.response)
                print(f"Written content to {output_file_path}")

Written content to output/search_results/GPT-OP\req_1.txt
Written content to output/search_results/GPT-OP\req_10.txt
Written content to output/search_results/GPT-OP\req_11.txt
Written content to output/search_results/GPT-OP\req_12.txt
Written content to output/search_results/GPT-OP\req_13.txt
Written content to output/search_results/GPT-OP\req_14.txt
Written content to output/search_results/GPT-OP\req_15.txt
Written content to output/search_results/GPT-OP\req_19.txt
Written content to output/search_results/GPT-OP\req_2.txt
Written content to output/search_results/GPT-OP\req_20.txt
Written content to output/search_results/GPT-OP\req_21.txt
Written content to output/search_results/GPT-OP\req_23.txt
Written content to output/search_results/GPT-OP\req_24.txt
Written content to output/search_results/GPT-OP\req_26.txt
Written content to output/search_results/GPT-OP\req_27.txt
Written content to output/search_results/GPT-OP\req_28.txt
Written content to output/search_results/GPT-OP\req_29.txt

In [73]:
# Retrieve the original reference text: entry 0 of the "text" field under "sources"
# in the context data of the most recent search result.
# NOTE(review): `result` is whatever the last executed search loop left behind; presumably
# this expects the local-search result — confirm before relying on the output.
result.context_data["sources"]["text"][0]

" Account balance\n\nError Handling: If the account balance is not retrieved within a specified time, a new balance request is sent, and an appropriate error message is displayed. The error event is logged for further review.\nTitle  Transferring Funds for Selling Stocks or Bonds  \nDescription  The amount corresponding to the sold stocks is deposited into the customer's account, and \nthe amount is withdrawn from the buyer's account. The brokerage fee is also withdrawn \nfrom the buyer's account and deposited into the brokerage account.  \nInput  Customer account number, stock or bond amount, brokerage fee, buyer's account number, \nbrokerage account number  \nProcessing  During the sale process, the user selects one of their accounts. The account balance is \nchecked, and the system sets the amount for the sold stocks and the brokerage fee. The \nbrokerage account is recorded in the system's database and needs only to be retrieved. The \nstock exchange interface specifies the bank ac

# Global Query

In [102]:
# # find redundant or contradicting solutions. (self consistency check)
# directory = 'RAG_OUTPUT/SOLUTIONS/'
# text_files = [f for f in os.listdir(directory) if f.endswith('.txt')]
# result = ""
# if len(text_files) > 1:
#     for i in range(len(text_files)):
#         for j in range(i + 1, len(text_files)):
#             file1_path = os.path.join(directory, text_files[i])
#             file2_path = os.path.join(directory, text_files[j])
#             with open(file1_path, 'r', encoding='utf-8') as file1:
#                 file1_contents = file1.read()

#             with open(file2_path, 'r', encoding='utf-8') as file2:
#                 file2_contents = file2.read()
#             combined_contents = file1_contents + "\n" + file2_contents
#             response = call_LLM(prompt_template.prompt_user_self_consistency(combined_contents), system_prompt = prompt_template.prompt_consistency(""))
#             result += response.choices[0].message.content
# else:
#     file_path = os.path.join(directory, "sol_1.txt")
#     with open(file_path, 'r', encoding='utf-8') as file:
#         file_contents = file.read()
#         response = call_LLM(prompt_template.prompt_user_self_consistency(file_contents), system_prompt = prompt_template.prompt_consistency(""))
#         result = response.choices[0].message.content

# with open(f'RAG_OUTPUT/CONSISTENCY/SELF_CON/self_con_result_{req_num}.txt', 'w', encoding='utf-8') as confile:
#     confile.write(result)

In [103]:
# # df for self consistencies
# user_input = []
# content_from = []
# similarity_score = []
# similarity_number = []

# with open(f'RAG_OUTPUT/CONSISTENCY/SELF_CON/self_con_result_{req_num}.txt', 'r') as file:
#     lines = file.readlines()

# for line in lines:
#     if "Similarity #" in line:
#         sim_number = int(re.search(r'\d+', line).group())
#         similarity_number.append(sim_number)
#     elif "(suggested solution):" in line:
#         user_input.append(int(re.search(r'\d+', line).group()))
#     elif "Content from :" in line:
#         content_from.append(int(re.search(r'\d+', line).group()))
#     elif "Similarity Score:" in line:
#         similarity_score.append(int(re.search(r'\d+', line).group()))

# df = pd.DataFrame({
#     'Similarity Number': similarity_number,
#     'User Input (Suggested Solution)': user_input,
#     'Content From': content_from,
#     'Similarity Score': similarity_score
# })


# df.head(10)


In [104]:
# # remove redundant suggestions. conflicts are manually handled.
# content_from_list = df[df['Similarity Score'] >= 70]['Content From'].tolist()
# num_of_self_conflicts = len(df[df['Similarity Score'] == 0]['Content From'].tolist())
# num_of_self_redundencies = len(content_from_list)

# number_of_sol_files = len([f for f in os.listdir("RAG_OUTPUT/SOLUTIONS") if f.endswith('.txt')])


# for i in range(1,number_of_sol_files + 1):
#     with open(f'RAG_OUTPUT/SOLUTIONS/sol_{i}.txt', 'r') as file:
#         lines = file.readlines()

#         filtered_lines = [line for index, line in enumerate(lines, start=1) if index not in content_from_list]
#         with open(f'RAG_OUTPUT/SOLUTIONS/update/updated_sol_{i}.txt', 'w') as file:
#             file.writelines(filtered_lines)


In [105]:
# # consistency check between remaining solutions and existing requirement document.

# req_directory = 'documents/requirements/'
# sol_directory = 'RAG_OUTPUT/SOLUTIONS/update/'

# sol_files = [f for f in os.listdir(sol_directory) if f.endswith('.txt')]
# req_files = [f for f in os.listdir(req_directory) if f.endswith('.txt')]
# result = ""

# for i in range(len(sol_files)):
#     for j in range(len(req_files)):
#         sol_path = os.path.join(sol_directory, sol_files[i])
#         req_path = os.path.join(req_directory, req_files[j])
#         with open(sol_path, 'r', encoding='utf-8') as file1:
#             sol_contents = file1.read()
#         with open(req_path, 'r', encoding='utf-8') as file2:
#             req_contents = file2.read()
#         response = call_LLM(prompt_template.prompt_user_consistency(sol_contents), system_prompt = prompt_template.prompt_consistency(req_contents))
#         result += response.choices[0].message.content

# with open(f"RAG_OUTPUT/CONSISTENCY/con_res_{req_num}.txt", 'w', encoding='utf-8') as confile:
#     confile.write(result)


In [106]:
# # df for consistencies
# user_input = []
# content_from = []
# similarity_score = []
# similarity_number = []

# with open(f"RAG_OUTPUT/CONSISTENCY/con_res_{req_num}.txt", 'r') as file:
#     lines = file.readlines()

# for line in lines:
#     if "Similarity #" in line or "Contradiction #" in line:
#         sim_number = int(re.search(r'\d+', line).group())
#         similarity_number.append(sim_number)
#     elif "(suggested solution):" in line:
#         user_input.append(int(re.search(r'\d+', line).group()))
#     elif "Content from " in line:
#         content_from.append(int(re.search(r'\d+', line).group()))
#     elif "Similarity Score:" in line:
#         similarity_score.append(int(re.search(r'\d+', line).group()))
# df = pd.DataFrame({
#     'Similarity Number': similarity_number,
#     'User Input (Suggested Solution)': user_input,
#     'Content From': content_from,
#     'Similarity Score': similarity_score
# })

# content_from_list = df[df['Similarity Score'] >= 70]['Content From'].tolist()
# num_of_conflicts = len(df[df['Similarity Score'] == 0]['Content From'].tolist()) - num_of_self_conflicts
# num_of_redundencies = len(content_from_list) - num_of_self_redundencies

# df.head(10)
