# Graph Generation Using an LLM - Aniket, Sahil

In [None]:
import os
import openai
from string import Template
import json
import glob
from timeit import default_timer as timer
from dotenv import load_dotenv
from time import sleep

### 1. Settings

In [None]:
# Load environment variables via python-dotenv before any os.getenv() lookups
# are made (the OPENAI_* settings are read from the environment afterwards).
load_dotenv()

True

In [None]:
# OpenAI API configuration: the client is pointed at an Azure-hosted endpoint.
openai.api_type = "azure"
# Key, base URL and API version come from the environment; os.getenv returns
# None for any variable that is not set, so a missing variable is not reported here.
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
# Value passed as `engine=` by process_gpt.
# NOTE(review): presumably the name of the Azure model deployment - confirm.
openai_deployment = "chat-gpt35"

### 2. Helper Functions

In [None]:
# Function to call the OpenAI API
def process_gpt(file_prompt, system_msg, *, max_tokens=15000, delay_seconds=8):
    """
    Send one chat-completion request and return the text of the first choice.

    The request goes to the deployment named by the module-level
    ``openai_deployment`` and uses ``temperature=0``.

    Args:
        file_prompt (str): The prompt sent as the "user" message.
        system_msg (str): The "system" message that sets the context.
        max_tokens (int): Upper bound on generated tokens passed to the API.
            Defaults to 15000, the value that was previously hard-coded.
        delay_seconds (float): Seconds to pause after a successful call to
            stay below the service rate limit. Defaults to 8; pass 0 (or
            None) to skip the pause.

    Returns:
        str: The content of the first choice in the API response.
    """
    # Create a completion request to the OpenAI API
    completion = openai.ChatCompletion.create(
        engine=openai_deployment,
        max_tokens=max_tokens,
        temperature=0,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": file_prompt},
        ],
    )
    # Only the first choice is used; no `n` is requested above.
    nlp_results = completion.choices[0].message.content
    # Throttle consecutive calls to avoid hitting rate limits. The guard keeps
    # sleep() from being called with None or a negative value.
    if delay_seconds and delay_seconds > 0:
        sleep(delay_seconds)
    return nlp_results

# Function to take a folder of files and a prompt template, and return a json object of all the entities and relationships
def extract_entities_relationships(folder, prompt_template):
    """
    Extract entities and relationships from every file under ``./data/<folder>/``.

    Each file's text is substituted into ``prompt_template`` (as ``$ctext``),
    sent through ``process_gpt``, and the reply is parsed as JSON. A file that
    fails at any step (read, substitution, API call, JSON parsing) is reported
    on stdout and skipped, so one bad file does not abort the run.

    Args:
        folder (str): Name of the sub-folder of ``./data`` to process.
        prompt_template (str): ``string.Template`` text containing ``$ctext``.

    Returns:
        list: One parsed JSON value per successfully processed file, in
        sorted path order.
    """
    # Start the timer to measure the pipeline execution time
    start = timer()
    # glob returns paths in arbitrary order; sort so that processing order and
    # the order of the returned results are reproducible between runs.
    files = sorted(glob.glob(f"./data/{folder}/*"))
    # Define the system message for setting the context
    system_msg = "You are a helpful IT-project and account management expert who extracts information from documents."
    print(f"Running pipeline for {len(files)} files in {folder} folder")
    # The template text is the same for every file, so build it once.
    template = Template(prompt_template)
    results = []
    # Process each file in the folder
    for file in files:
        print(f"Extracting entities and relationships for {file}")
        try:
            # Read with an explicit encoding so the result does not depend on
            # the platform's locale default.
            with open(file, "r", encoding="utf-8") as f:
                text = f.read().rstrip()
            # The file is closed here, before the slow API call is made.
            prompt = template.substitute(ctext=text)
            result = process_gpt(prompt, system_msg=system_msg)
            results.append(json.loads(result))
        except Exception as e:
            # Best-effort per file: report the failure and continue.
            print(f"Error processing {file}: {e}")
    # End the timer and print the total execution time
    end = timer()
    print(f"Pipeline completed in {end-start} seconds")
    return results

# Final function to bring all the steps together
def ingestion_pipeline(folders):
    """
    Run the extraction for every folder and flatten the results into one list.

    Args:
        folders (dict): Maps each folder name to the prompt template used for
            the files in that folder.

    Returns:
        list: The extracted JSON objects of all folders, concatenated in the
        iteration order of ``folders``.
    """
    # Each folder yields its own list; the nested comprehension flattens them.
    return [
        extracted
        for folder_name, template in folders.items()
        for extracted in extract_entities_relationships(folder_name, template)
    ]

### 3. Defining Prompts

In [None]:
# Prompt for processing project briefs.
# `$ctext` at the end is the string.Template placeholder that receives the
# document text. The prompt asks for Project, Technology and Client entities
# and for USES_TECH / HAS_CLIENT relationships, returned as a JSON object with
# "entities" and "relationships" keys.
project_prompt_template = """
From the Project Brief below, extract the following Entities & relationships described in the mentioned format
0. ALWAYS FINISH THE OUTPUT. Never send partial responses
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. Do not create new entity types that aren't mentioned below. Document must be summarized and stored inside Project entity under `summary` property. You will have to generate as many entities as needed as per the types below:
    Entity Types:
    label:'Project',id:string,name:string;summary:string //Project mentioned in the brief; `id` property is the full name of the project, in lowercase, with no capital letters, special characters, spaces or hyphens; Contents of original document must be summarized inside 'summary' property
    label:'Technology',id:string,name:string //Technology Entity; `id` property is the name of the technology, in camel-case. Identify as many of the technologies used as possible
    label:'Client',id:string,name:string;industry:string //Client that the project was done for; `id` property is the name of the Client, in camel-case; 'industry' is the industry that the client operates in, as mentioned in the project brief.

2. Next generate each relationships as triples of head, relationship and tail. To refer the head and tail entity, use their respective `id` property. Relationship property should be mentioned within brackets as comma-separated. They should follow these relationship types below. You will have to generate as many relationships as needed as defined below:
    Relationship types:
    project|USES_TECH|technology
    project|HAS_CLIENT|client


3. The output should look like :
{
    "entities": [{"label":"Project","id":string,"name":string,"summary":string}],
    "relationships": ["projectid|USES_TECH|technologyid"]
}

Case Sheet:
$ctext
"""


# Prompt for processing people's profiles.
# `$ctext` at the end is the string.Template placeholder that receives the
# document text. The prompt asks for Person, Project and Technology entities
# and for HAS_SKILLS / HAS_PEOPLE relationships.
# NOTE(review): the numbered steps inside the prompt go 0, 1, 3 (there is no
# step 2) and the output-format step is unnumbered, unlike the project-brief
# prompt - confirm whether this is intentional before changing the text.
people_prompt_template = """From the list of people below, extract the following Entities & relationships described in the mentioned format
0. ALWAYS FINISH THE OUTPUT. Never send partial responses
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. Do not create new entity types that aren't mentioned below. You will have to generate as many entities as needed as per the types below:
    Entity Types:
    label:'Person',id:string,name:string //Person that the data is about. `id` property is the name of the person, in camel-case. 'name' is the person's name, as spelled in the text.
    label:'Project',id:string,name:string;summary:string //Project mentioned in the profile; `id` property is the full lowercase name of the project, with no capital letters, special characters, spaces or hyphens.
    label:'Technology',id:string,name:string //Technology Entity, as listed in the "skills"-section of every person; `id` property is the name of the technology, in camel-case.

3. Next generate each relationships as triples of head, relationship and tail. To refer the head and tail entity, use their respective `id` property. Relationship property should be mentioned within brackets as comma-separated. They should follow these relationship types below. You will have to generate as many relationships as needed as defined below:
    Relationship types:
    person|HAS_SKILLS|technology
    project|HAS_PEOPLE|person


The output should look like :
{
    "entities": [{"label":"Person","id":string,"name":string}],
    "relationships": ["projectid|HAS_PEOPLE|personid"]
}

Case Sheet:
$ctext
"""


# Prompt for processing slack messages.
# `$ctext` at the end is the string.Template placeholder that receives the
# document text. The prompt asks for Person and SlackMessage entities and for
# SENT relationships between them.
# NOTE(review): the numbered steps inside the prompt go 0, 1, 3 (there is no
# step 2) - confirm whether this is intentional before changing the text.

slack_prompt_template = """
From the list of messages below, extract the following Entities & relationships described in the mentioned format
0. ALWAYS FINISH THE OUTPUT. Never send partial responses
1. First, look for these Entity types in the text and generate as comma-separated format similar to entity type.
   `id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. Do not create new entity types that aren't mentioned below. You will have to generate as many entities as needed as per the types below:
    Entity Types:
    label:'Person',id:string,name:string //Person that sent the message. `id` property is the name of the person, in camel-case; for example, "michaelClark", or "emmaMartinez"; 'name' is the person's name, as spelled in the text.
    label:'SlackMessage',id:string,text:string //The Slack-Message that was sent; 'id' property should be the message id, as spelled in the reference. 'text' property is the text content of the message, as spelled in the reference

3. Next generate each relationships as triples of head, relationship and tail. To refer the head and tail entity, use their respective `id` property. Relationship property should be mentioned within brackets as comma-separated. They should follow these relationship types below. You will have to generate as many relationships as needed as defined below:
    Relationship types:
    personid|SENT|slackmessageid

The output should look like :
{
    "entities": [{"label":"SlackMessage","id":string,"text":string}],
    "relationships": ["personid|SENT|messageid"]
}

Case Sheet:
$ctext
"""

### 4. Running the pipeline


In [None]:
# Map each sub-folder of ./data to the prompt template used for its files.
# The folders are processed in the order listed here.
folders = {
    "people_profiles": people_prompt_template,
    "project_briefs": project_prompt_template,
    "slack_messages": slack_prompt_template,
}

# Run the whole pipeline. The returned list is not assigned to a variable.
ingestion_pipeline(folders)