In [1]:
import os
import pandas as pd
import tiktoken
from graphrag.query.indexer_adapters import read_indexer_entities, read_indexer_reports
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.global_search.community_context import (
    GlobalCommunityContext,
)
from graphrag.query.structured_search.global_search.search import GlobalSearch
import subprocess
import io
import sys
import util
import prompt_template
from openai import OpenAI
import re
import api

In [2]:
# Strip every non-ASCII character from a text file.
def clean_text_file(input_file, output_file):
    """Copy ``input_file`` to ``output_file`` keeping only ASCII characters.

    The input is decoded as UTF-8 with undecodable bytes ignored, then every
    character whose code point is 128 or above is dropped. The input is read
    completely before the output is opened, so both arguments may name the
    same path.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the file to write the cleaned text to.
    """
    with open(input_file, 'r', encoding='utf-8', errors='ignore') as src:
        raw_text = src.read()

    # Encoding to ASCII with errors ignored discards exactly the characters
    # whose code point is >= 128.
    ascii_only = raw_text.encode('ascii', errors='ignore').decode('ascii')

    with open(output_file, 'w', encoding='utf-8') as dst:
        dst.write(ascii_only)


# Corpus file consumed by the indexing run; the in-place clean-up call is kept disabled.
input_file = "input/articles.txt"
#clean_text_file(input_file, input_file)


In [3]:
def run_graphrag_index():
    """Run the GraphRAG indexing pipeline: ``<python> -m graphrag.index --root .``.

    The command is launched with the interpreter running this notebook
    (``sys.executable``). The child's output is not captured; it goes to the
    standard streams inherited from this process. Failures are reported with
    ``print`` rather than raised, and the function returns ``None``.
    """
    command = [sys.executable, "-m", "graphrag.index", "--root", "."]
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print("An error occurred while running the command:")
        # Output is not captured, so e.stderr is None here; the exit status is
        # the only failure detail available on the exception.
        print(f"Exit code: {e.returncode}")
        if e.stderr:
            print("Error output:", e.stderr)
        else:
            print("See the process output for details; make sure graphrag is "
                  "installed for this interpreter.")
    except FileNotFoundError:
        # Raised when the executable itself cannot be started. A missing
        # graphrag module shows up as a non-zero exit status instead.
        print(f"Error: could not launch the Python interpreter at {sys.executable!r}.")
    else:
        print("Command executed successfully.")

#run_graphrag_index()

In [4]:
# Preview the community reports produced by the GPT-4-turbo indexing run.
report_df = pd.read_parquet(
    "GPT-4-turbo-indexing/artifacts/create_final_community_reports.parquet"
)
report_df

Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,0,# Regulatory Compliance and Relationships in t...,0,7.5,Regulatory Compliance and Relationships in the...,The impact severity rating is high due to the ...,This report examines the intricate network of ...,"[{'explanation': 'Brokers, including broker/de...","{\n ""title"": ""Regulatory Compliance and Rel...",1abd8f20-01b6-4cb5-8dee-eee69fa31ce3
1,1,# Regulatory and Operational Dynamics in the T...,0,8.5,Regulatory and Operational Dynamics in the Tra...,The high impact severity rating reflects the c...,This report delves into the intricate network ...,[{'explanation': 'Broker/Dealers occupy a cent...,"{\n ""title"": ""Regulatory and Operational Dy...",e451bdd5-b8b0-48c3-8ab1-8505357d9ee7
2,2,# Regulatory Framework in Financial Trading\n\...,0,8.5,Regulatory Framework in Financial Trading,The high impact severity rating reflects the c...,This report outlines the intricate relationshi...,[{'explanation': 'The Organization is at the h...,"{\n ""title"": ""Regulatory Framework in Finan...",b7cd43b6-8cd2-4578-8702-cf96dda0890b
3,3,# Online Trading Ecosystem Analysis\n\nThis re...,0,7.5,Online Trading Ecosystem Analysis,The impact severity rating is relatively high ...,This report delves into the structured relatio...,"[{'explanation': 'Customers, both individuals ...","{\n ""title"": ""Online Trading Ecosystem Anal...",0675a3fe-2a43-4f12-9e4f-7ffac697454a


In [5]:
# Preview the community reports produced by the GPT-4o-mini indexing run.
report_df = pd.read_parquet(
    "GPT-4o-mini-indexing/artifacts/create_final_community_reports.parquet"
)
report_df

Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,66,# Organization and Information Security Manage...,2,8.5,Organization and Information Security Management,The impact severity rating is high due to the ...,The community centers around the concept of an...,[{'explanation': 'Organizations are pivotal in...,"{\n ""title"": ""Organization and Information ...",9a3533a6-0cc3-4fb6-9662-18e6766262fb
1,67,# Information Security Competence and Awarenes...,2,7.5,Information Security Competence and Awareness,The impact severity rating is high due to the ...,This community focuses on the critical aspects...,[{'explanation': 'Competence is a fundamental ...,"{\n ""title"": ""Information Security Competen...",e94da1ed-34fb-4a9d-be5f-f86c56b030c3
2,68,# Information Systems and Records Management\n...,2,7.5,Information Systems and Records Management,The impact severity rating is high due to the ...,The community focuses on the relationship betw...,[{'explanation': 'Information systems are esse...,"{\n ""title"": ""Information Systems and Recor...",c03614a3-841c-42dc-b079-30c2da132c38
3,69,# Information Security Management System Commu...,2,8.0,Information Security Management System Community,The impact severity rating is high due to the ...,The community is centered around the Informati...,[{'explanation': 'The Information Security Man...,"{\n ""title"": ""Information Security Manageme...",0bdc5f02-ea5b-4137-9fa9-c26029978f77
4,70,# Information Security Management System and S...,2,7.5,Information Security Management System and Sta...,The impact severity rating is high due to the ...,The community focuses on the Information Secur...,[{'explanation': 'Interested Parties play a cr...,"{\n ""title"": ""Information Security Manageme...",4f21a029-e29c-4193-b1ed-6c986c6a9fb0
...,...,...,...,...,...,...,...,...,...,...
62,5,# Brokerage and Trading Community\n\nThe commu...,0,8.5,Brokerage and Trading Community,The impact severity rating is high due to the ...,The community encompasses various entities inv...,[{'explanation': 'Brokers serve as essential i...,"{\n ""title"": ""Brokerage and Trading Communi...",c5a7b791-3cb2-4797-ad31-cc1b2eb394c5
63,6,# Information Security Incident Management Com...,0,7.5,Information Security Incident Management Commu...,The impact severity rating is high due to the ...,The community focuses on the management of inf...,[{'explanation': 'Information Security Inciden...,"{\n ""title"": ""Information Security Incident...",fb1dcec7-17bf-44f3-8684-323d99f9e70e
64,7,# Information Security Management Community\n\...,0,7.5,Information Security Management Community,The impact severity rating is high due to the ...,The community focuses on information security ...,[{'explanation': 'Information Classification i...,"{\n ""title"": ""Information Security Manageme...",37120226-8608-4358-a1c5-7afecbbeac48
65,8,# Information Security Community\n\nThe Inform...,0,8.5,Information Security Community,The impact severity rating is high due to the ...,The Information Security Community comprises v...,[{'explanation': 'Information security is the ...,"{\n ""title"": ""Information Security Communit...",41e6583c-fa02-4c8a-aa60-3727de7ba229


In [6]:
# The API key comes from the environment; a missing variable raises KeyError here.
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = "gpt-4o-mini-2024-07-18"

# Chat model wrapper used by the global search engine.
llm = ChatOpenAI(
    model=llm_model,
    api_key=api_key,
    api_type=OpenaiApiType.OpenAI,
    max_retries=20,
)

# Tokenizer shared by the context builder and the search engine.
token_encoder = tiktoken.get_encoding("cl100k_base")

In [7]:
# Directory holding the parquet artifacts of the indexing run queried below.
INPUT_DIR = "./GPT-4o-mini-indexing/artifacts"

# Artifact table names (file stem, without the ".parquet" suffix).
COMMUNITY_REPORT_TABLE = "create_final_community_reports"  # loaded as report_df
ENTITY_TABLE = "create_final_nodes"                        # loaded as entity_df
ENTITY_EMBEDDING_TABLE = "create_final_entities"           # loaded as entity_embedding_df

# Community level passed to read_indexer_reports / read_indexer_entities.
COMMUNITY_LEVEL = 2

In [8]:
# Load the three indexer artifact tables from INPUT_DIR.
entity_df = pd.read_parquet(INPUT_DIR + "/" + ENTITY_TABLE + ".parquet")
report_df = pd.read_parquet(INPUT_DIR + "/" + COMMUNITY_REPORT_TABLE + ".parquet")
entity_embedding_df = pd.read_parquet(
    INPUT_DIR + "/" + ENTITY_EMBEDDING_TABLE + ".parquet"
)

# Convert the raw tables into the objects the query context builder consumes.
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# The count is taken from the raw report table, not from `reports`.
print("Report records:", len(report_df))
report_df.head()

Report records: 67


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,66,# Organization and Information Security Manage...,2,8.5,Organization and Information Security Management,The impact severity rating is high due to the ...,The community centers around the concept of an...,[{'explanation': 'Organizations are pivotal in...,"{\n ""title"": ""Organization and Information ...",9a3533a6-0cc3-4fb6-9662-18e6766262fb
1,67,# Information Security Competence and Awarenes...,2,7.5,Information Security Competence and Awareness,The impact severity rating is high due to the ...,This community focuses on the critical aspects...,[{'explanation': 'Competence is a fundamental ...,"{\n ""title"": ""Information Security Competen...",e94da1ed-34fb-4a9d-be5f-f86c56b030c3
2,68,# Information Systems and Records Management\n...,2,7.5,Information Systems and Records Management,The impact severity rating is high due to the ...,The community focuses on the relationship betw...,[{'explanation': 'Information systems are esse...,"{\n ""title"": ""Information Systems and Recor...",c03614a3-841c-42dc-b079-30c2da132c38
3,69,# Information Security Management System Commu...,2,8.0,Information Security Management System Community,The impact severity rating is high due to the ...,The community is centered around the Informati...,[{'explanation': 'The Information Security Man...,"{\n ""title"": ""Information Security Manageme...",0bdc5f02-ea5b-4137-9fa9-c26029978f77
4,70,# Information Security Management System and S...,2,7.5,Information Security Management System and Sta...,The impact severity rating is high due to the ...,The community focuses on the Information Secur...,[{'explanation': 'Interested Parties play a cr...,"{\n ""title"": ""Information Security Manageme...",4f21a029-e29c-4193-b1ed-6c986c6a9fb0


In [9]:
# Context builder for global search, fed with the reports and entities loaded above.
context_builder = GlobalCommunityContext(
    token_encoder=token_encoder,
    entities=entities,
    community_reports=reports,
)

In [10]:
# Options forwarded to the community context builder.
context_builder_params = dict(
    # False means using full community reports. True means using community short summaries.
    use_community_summary=False,
    shuffle_data=True,
    include_community_rank=True,
    min_community_rank=0,
    community_rank_name="rank",
    include_community_weight=True,
    community_weight_name="occurrence weight",
    normalize_community_weight=True,
    max_tokens=5000,
    context_name="Reports",
)

# LLM settings for the map stage.
map_llm_params = dict(
    max_tokens=1000,
    temperature=0.0,
    response_format=dict(type="json_object"),
)

# LLM settings for the reduce stage.
reduce_llm_params = dict(max_tokens=1000, temperature=0.0)

In [15]:
# Global search engine wired to the LLM, tokenizer and context builder defined above.
# NOTE(review): json_mode is False here while map_llm_params carries a
# "response_format" of json_object -- confirm which of the two GlobalSearch honours.
search_engine = GlobalSearch(
    llm=llm,
    token_encoder=token_encoder,
    context_builder=context_builder,
    context_builder_params=context_builder_params,
    max_data_tokens=5000,
    map_llm_params=map_llm_params,
    reduce_llm_params=reduce_llm_params,
    # Setting this to True will add an instruction to encourage the LLM to
    # incorporate general knowledge in the response, which may increase
    # hallucinations, but could be useful in some use cases.
    allow_general_knowledge=False,
    # Set this to False if your LLM model does not support JSON mode.
    json_mode=False,
    response_type="multiple paragraphs",
    concurrent_coroutines=32,
)

In [None]:
# Report utils: running tallies for the final report, all starting at zero.

# LLM requests issued so far (incremented by the search cell and by call_LLM).
num_of_LLM_calls = 0

# Violation / compliance tallies.
num_of_comp_violations = num_of_part_violations = num_of_compliances = 0

# Redundancy tallies.
num_of_self_redundencies = num_of_redundencies = 0

# Conflict tallies.
num_of_self_conflicts = num_of_conflicts = 0

In [60]:
results = []
with open("documents/requirements/FR/req_1.txt", mode="r", encoding="utf-8") as req_file:
    req_content = req_file.read()
    result = await search_engine.asearch(f"I want you to find all the entities that can be related to {req_content}.")
    num_of_LLM_calls += result.llm_calls
    results.append(result)

##DEBUG##: {
    "points": [
        {"description": "The process of logging out of the system involves user engagement with the account management features, which is crucial for maintaining security and user control over their accounts. This engagement is essential for ensuring that users can manage their access effectively [Data: Reports (33)]", "score": 70},
        {"description": "Error handling during the logout process is important as it ensures that users are informed of any issues that may arise, thereby enhancing user experience and trust in the system. Displaying appropriate error messages is a critical aspect of user interface design [Data: Reports (33)]", "score": 60}
    ]
}
##DEBUG##: {
    "points": [
        {"description": "The process of logging out of the system involves user interactions with the Account Maintenance Page, which serves as a central hub for managing account settings, including the logout option. This relationship emphasizes the importance of user exp

In [2]:
# Build the initial `content_context` string from the community reports that
# were used as context for the first search result.
df = pd.DataFrame(results[0].context_data['reports'])
# Highest 'occurrence weight' first (the column name matches
# "community_weight_name" in context_builder_params).
context_df = df.sort_values(by='occurrence weight', ascending=False) 


# Keep only the reports whose weight exceeds 0.6 and join their text.
filtered_df = context_df[context_df['occurrence weight'] > 0.6]
result_str = '\n'.join(filtered_df['content'])
# NOTE(review): `[0]` on the Series is a label lookup, so this picks the row
# whose index label is 0 (the first row before sorting), not the top-weighted
# row; use `.iloc[0]` if the highest-weight report is what is wanted -- confirm.
# NOTE(review): if that row also passes the 0.6 filter its content appears twice.
content_context = context_df["content"][0] + "\n" + result_str

print(content_context)
context_df.head(10)

NameError: name 'result' is not defined

In [100]:
# Extract the scored answers from the map-stage responses of `result` and
# append the sufficiently relevant ones to `content_context`.

# Minimum score an answer needs to be kept. Scores are captured as integers
# (values seen in this notebook run from 60 to 80), so the cut-off is on that
# scale; the previous 0.7 let every non-zero score through.
MIN_ANSWER_SCORE = 70

# Concatenate the string form of every map response so one regex pass can
# pull out all (answer, score) pairs.
sample_string = "".join(str(resp) for resp in result.map_responses)

# NOTE(review): only single-quoted answers match; a Python repr switches to
# double quotes when the text contains an apostrophe, so such answers would be
# skipped -- confirm whether the structured response can be read directly.
pattern = re.compile(r"'answer':\s*'(.*?)',\s*'score':\s*(\d+)")
matches = pattern.findall(sample_string)

df = pd.DataFrame(matches, columns=['Answer', 'Score'])
df['Score'] = pd.to_numeric(df['Score'])
sorted_df = df.sort_values(by='Score', ascending=False)

filtered_df = df[df['Score'] >= MIN_ANSWER_SCORE]
result_str = '\n'.join(filtered_df['Answer'])
# NOTE: this extends the `content_context` built in an earlier cell, so
# re-running this cell appends the answers again.
content_context = content_context + "\n" + result_str

sorted_df

Unnamed: 0,Answer,Score
6,The system is the entity that processes the lo...,80
8,The login process is a critical step for users...,80
0,The process of logging out of the system invol...,70
2,The process of logging out of the system invol...,70
4,Error handling during the logout process is cr...,70
5,Error handling during the logout process is cr...,70
7,Error handling is an important aspect of the l...,70
1,Error handling during the logout process is cr...,60
3,Error handling during the logout process is im...,60
9,The Secure Development Policy emphasizes the n...,60


In [19]:
# Call the OpenAI chat-completions API (default model: gpt-4o-mini)
def call_LLM(user_prompt, system_prompt, model="gpt-4o-mini-2024-07-18", temperature=0.2):
    """Send one system + user prompt pair to the chat-completions API.

    Each completed request increments the notebook-level counter
    ``num_of_LLM_calls``.

    Args:
        user_prompt: Content of the "user" message.
        system_prompt: Content of the "system" message.
        model: Model name to request. Defaults to the model this notebook
            previously hard-coded.
        temperature: Sampling temperature. Defaults to the previous
            hard-coded value of 0.2.

    Returns:
        The response object returned by ``client.chat.completions.create``.
    """
    global num_of_LLM_calls

    # The key is read from the local `api` module (api.API_KEY).
    client = OpenAI(api_key=api.API_KEY)
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        # max_tokens is intentionally not set (previously commented out: 4096).
        temperature=temperature,
    )
    # Counted only after the request returned without raising.
    num_of_LLM_calls += 1
    return response


In [None]:
# like GPT4o.ipynb but system prompt is the reduced content