## Setup

Run in terminal:
pip install "typing-extensions<4.6.0"
pip install "pillow<10.1.0,>=8.3.2"
pip install fastapi kaleido uvicorn
pip install langchain
pip install pypdf
pip install unstructured
pip install yachalk
pip install "unstructured[pdf]"
pip install openai
sudo apt update
sudo apt-get install libgl1-mesa-glx
pip install --upgrade jupyter ipywidgets
pip install --upgrade opencv-python-headless
export OPENAI_API_KEY="..."
pip install pyvis

In [1]:
import pandas as pd
import numpy as np
import os
import uuid
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random
from openai import OpenAI
import openai
import json
import time

## Name of the dataset sub-folder; the output folder reuses the same name
data_dir = "coin"
out_dir = data_dir
## Source documents are read from this directory
inputdirectory = Path(f"./data_input/{data_dir}")
## Generated csv files are written to this directory
outputdirectory = Path(f"./data_output/{out_dir}")

In [None]:
# Organization id for the OpenAI account, read from the environment (None when the variable is unset).
org_id = os.getenv("OPENAI_ORG_ID") 
# Shared OpenAI client; graphPrompt sends its chat-completion requests through it.
client = OpenAI(organization=org_id)

def graphPrompt(input: str, metadata=None, model="gpt-4-0125-preview"):
    """Ask the chat model to extract (node_1, node_2, edge) relations from a text chunk.

    :param input: Text of the chunk to analyse. (The name shadows the builtin but is
        kept so existing keyword callers keep working.)
    :param metadata: Optional dict merged into every extracted record, e.g.
        ``{"chunk_id": ...}``. Defaults to no extra keys.
    :param model: Chat model name. ``None`` falls back to the default model, because
        df2Graph forwards ``model=None`` when its caller does not choose one.
    :return: A list of dicts (one per extracted relation, each including the metadata
        keys), or ``None`` when the reply could not be parsed.
    :raises: Whatever the OpenAI client raises for a failed request; only parsing
        problems are caught here.
    """
    # A fresh dict per call avoids sharing one mutable default between calls.
    metadata = {} if metadata is None else metadata
    if model is None:
        model = "gpt-4-0125-preview"

    openai.api_key = os.getenv("OPENAI_API_KEY")

    SYS_PROMPT = (
        "You are a network graph maker who extracts terms and their relations from a given context. "
        "You are provided with a context chunk (delimited by ```) Your task is to extract the ontology "
        "of terms mentioned in the given context. These terms should represent the key concepts as per the context. \n"
        "Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.\n"
        "\tTerms may include object, entity, location, organization, person, \n"
        "\tcondition, acronym, documents, service, concept, etc.\n"
        "\tTerms should be as atomistic as possible\n\n"
        "Thought 2: Think about how these terms can have one on one relation with other terms.\n"
        "\tTerms that are mentioned in the same sentence or the same paragraph are typically related to each other.\n"
        "\tTerms can be related to many other terms\n\n"
        "Thought 3: Find out the relation between each such related pair of terms. \n\n"
        "Format your output as a list of json. Each element of the list contains a pair of terms"
        "and the relation between them, like the follwing: \n"
        "[\n"
        "   {\n"
        '       "node_1": "A concept from extracted ontology",\n'
        '       "node_2": "A related concept from extracted ontology",\n'
        '       "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences"\n'
        "   }, {...}\n"
        "]"
    )

    USER_PROMPT = f"context: ```{input}``` \n\n output: "

    response = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": SYS_PROMPT},
            {"role": "user", "content": USER_PROMPT},
        ],
        max_tokens=500,
        stop="\t  \t  \t",
    )

    result = None
    try:
        content = response.choices[0].message.content

        # Parse the JSON string
        parsed_content = json.loads(content)

        if isinstance(parsed_content, list):
            edges = parsed_content
        elif isinstance(parsed_content, dict):
            # The request asks for a JSON object while the prompt asks for a list, so the
            # list of relations may arrive wrapped under some key. Unwrap it instead of
            # treating the wrapper itself as a single relation.
            wrapped = [
                item
                for value in parsed_content.values()
                if isinstance(value, list)
                for item in value
                if isinstance(item, dict)
            ]
            if "node_1" in parsed_content or not wrapped:
                edges = [parsed_content]
            else:
                edges = wrapped
        else:
            raise ValueError("Unexpected data format in response")

        result = [dict(item, **metadata) for item in edges]
        # Only report success when a result was actually produced.
        print('graph prompt function success')
    except Exception as e:
        print("\n\nERROR ### Here is the errored response: ", response, "\n\nException: ", e)
        result = None

    # Pause between consecutive requests.
    time.sleep(1)
    return result

def df2Graph(dataframe: pd.DataFrame, model=None) -> list:
    """Run graphPrompt on every row and flatten the answers into one list of records.

    :param dataframe: Chunk table with at least the columns ``text`` and ``chunk_id``.
    :param model: Model name forwarded to graphPrompt; when ``None`` graphPrompt's own
        default model is used instead of passing ``None`` on to the API.
    :return: Flat list of relation dicts. Rows whose extraction failed (graphPrompt
        returned ``None``) contribute nothing; an empty table yields ``[]``.
    """
    prompt_kwargs = {} if model is None else {"model": model}

    concept_list = []
    for text, chunk_id in zip(dataframe["text"], dataframe["chunk_id"]):
        concepts = graphPrompt(text, {"chunk_id": chunk_id}, **prompt_kwargs)
        # graphPrompt returns None for an unparsable reply; skip those rows.
        if concepts:
            concept_list.extend(concepts)
    return concept_list


def graph2Df(nodes_list) -> pd.DataFrame:
    """Turn a list of relation dicts into a cleaned edge table.

    :param nodes_list: List of dicts, normally carrying ``node_1`` and ``node_2`` keys.
    :return: DataFrame without rows whose ``node_1`` or ``node_2`` is missing (a value
        of a single space counts as missing), with both node columns lower-cased.
        An empty list yields an empty frame that still has the two node columns.
    """
    required = ["node_1", "node_2"]
    graph_dataframe = pd.DataFrame(nodes_list).replace(" ", np.nan)

    ## An empty list (or records without node keys) produces no node columns at all;
    ## add them so the dropna below cannot fail with a KeyError.
    for column in required:
        if column not in graph_dataframe.columns:
            graph_dataframe[column] = np.nan

    graph_dataframe = graph_dataframe.dropna(subset=required)

    ## str() keeps non-string nodes (e.g. numbers) from crashing on .lower()
    return graph_dataframe.assign(
        node_1=graph_dataframe["node_1"].map(lambda x: str(x).lower()),
        node_2=graph_dataframe["node_2"].map(lambda x: str(x).lower()),
    )

def append_df_to_csv(df, file_path):
    """
    Append a DataFrame to a "|"-separated CSV file.

    If the file is missing or empty it is (re)created with a header row; otherwise the
    rows are appended without a header. The parent directory is created when needed.

    :param df: DataFrame to be appended
    :param file_path: Path to the CSV file
    """
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # A zero-byte file has no header yet, so treat it like a missing file.
    has_content = os.path.isfile(file_path) and os.path.getsize(file_path) > 0

    df.to_csv(
        file_path,
        sep="|",
        mode="a" if has_content else "w",
        header=not has_content,
        index=False,
    )

def documents2Dataframe(documents) -> pd.DataFrame:
    """Build one table row per document chunk.

    :param documents: Iterable of objects exposing ``page_content`` (the text) and
        ``metadata`` (a dict).
    :return: DataFrame with a ``text`` column, one column per metadata key and a
        freshly generated ``chunk_id`` (uuid4 hex). A metadata key named ``text``
        overrides the page content, and ``chunk_id`` always overrides metadata,
        because later keys win in the row dict.
    """
    # Building the list in one pass avoids re-copying it for every chunk.
    rows = [
        {
            "text": chunk.page_content,
            **chunk.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        for chunk in documents
    ]
    return pd.DataFrame(rows)


In [None]:
#WHEN RECREATING BASED ON NEW PDF RUN THIS:
## Dir PDF Loader
## Loads the PDFs found under inputdirectory
loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
#loader = PyPDFLoader("./data_input/coin/coinbase.pdf")
#loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the loaded documents into chunks of up to 1500 characters (measured with len)
## with 150 characters of overlap between neighbouring chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)
pages = splitter.split_documents(documents)
## One row per chunk: text, loader metadata and a generated chunk_id
df = documents2Dataframe(pages)
## Cache the chunk table as a "|"-separated file so it can be reloaded without re-reading the PDFs
df.to_csv("df.csv", sep="|", index=False)
print(df.shape)
df.head()

## Extract Concepts

In [3]:
#WHEN USING PREPROCESSED PDF SAVED IN CSV RUN THIS
## Reload the chunk table from the "|"-separated df.csv in the working directory
df = pd.read_csv('df.csv', sep="|") 
df.head()

Unnamed: 0,text,source,page,chunk_id
0,"In addition, statements that “we believe” and ...",data_input/coin/coinbase.pdf,4,685bcea8f090463682d7b4fef816d0d3
1,\n 6\n•Our net revenue may be concentrated in ...,data_input/coin/coinbase.pdf,5,6c11c774004842c3892f9cf89ce6868f
2,•Cyberattacks and security breaches of our pla...,data_input/coin/coinbase.pdf,5,be628a8b4ebe499baf37b2b128ef1863
3,"•We are, and may continue to be, subject to ma...",data_input/coin/coinbase.pdf,5,1888bf80179e4378863ff36950f3eba5
4,\n 7\n•We currently rely on third-party servic...,data_input/coin/coinbase.pdf,6,61c2ee4a35494431b324e8501d75e4dc


In [19]:

def dfgcsv(dataframe: pd.DataFrame, model="gpt-4-0125-preview", chunk_size: int = 2) -> None:
    """Extract relations batch by batch and append the results to CSV files.

    For every batch of ``chunk_size`` rows the extracted edges are appended to
    ``graph.csv`` and the processed rows to ``chunks.csv`` inside ``outputdirectory``,
    so the work done so far is on disk if a later batch fails.

    :param dataframe: Chunk table with at least the columns ``text`` and ``chunk_id``.
    :param model: Model name forwarded to graphPrompt; ``None`` uses graphPrompt's default.
    :param chunk_size: Number of rows processed between two writes (default 2).
    :return: None; progress is reported with print().
    :raises ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Fixed column layout so every appended batch lines up with the header that the
    # first write created, whatever keys the model happened to return.
    graph_columns = ["node_1", "node_2", "edge", "chunk_id"]
    prompt_kwargs = {} if model is None else {"model": model}

    chunks = [dataframe[i:i + chunk_size] for i in range(0, dataframe.shape[0], chunk_size)]
    chunkstotal = len(chunks)
    print('chunks total of ', chunkstotal)

    for counter, chunk in enumerate(chunks):
        print(f'df2Graph started chunk #{counter}/{chunkstotal}')

        ## Flatten the per-row lists into one list of entities; failed rows return None
        chunk_concepts = []
        for text, chunk_id in zip(chunk["text"], chunk["chunk_id"]):
            concepts = graphPrompt(text, {"chunk_id": chunk_id}, **prompt_kwargs)
            if concepts:
                chunk_concepts.extend(concepts)

        graph_dataframe = (
            pd.DataFrame(chunk_concepts)
            .reindex(columns=graph_columns)
            .replace(" ", np.nan)
            .dropna(subset=["node_1", "node_2"])
        )
        graph_dataframe = graph_dataframe.assign(
            node_1=graph_dataframe["node_1"].map(lambda x: str(x).lower()),
            node_2=graph_dataframe["node_2"].map(lambda x: str(x).lower()),
        )
        print(f'df2Graph finished chunk #{counter}/{chunkstotal}')

        # Make sure the target directory exists before the first write.
        os.makedirs(outputdirectory, exist_ok=True)
        if graph_dataframe.empty:
            print(f'no usable edges extracted for chunk #{counter}; graph.csv left unchanged')
        else:
            append_df_to_csv(graph_dataframe, os.path.join(outputdirectory, "graph.csv"))
        append_df_to_csv(chunk, os.path.join(outputdirectory, "chunks.csv"))
        print(f'saved graph and chunk #{counter} csv files')
        time.sleep(0.1)

    print(f'Success on {chunkstotal} chunks')
    return None

## NOTE: the results are appended to graph.csv / chunks.csv in the output directory,
## so running this cell again adds the same rows a second time.
dfgcsv(dataframe=df, model="gpt-4-0125-preview")

chunks total of  226
df2Graph started chunk #0/226
graph prompt function success


ERROR ### Here is the errored response:  ChatCompletion(id='chatcmpl-8nDrkxrC7Vq6ZlWtgp37sqeXBpZIK', choices=[Choice(finish_reason='length', index=0, logprobs=None, message=ChatCompletionMessage(content='   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \t    \n   \t       \

If `regenerate` is set to True, the dataframes are regenerated and both are written to CSV so that they do not have to be calculated again.

        dfne = dataframe of edges

        df = dataframe of chunks


Else the dataframes are read from the output directory

In [5]:
## Set to False to skip the extraction and only load the csv written by an earlier run
regenerate = True
if regenerate:
    ## NOTE(review): dfgcsv appends to an existing graph.csv, so running this after an
    ## earlier extraction run duplicates its edges; confirm whether the old files should be removed first
    dfgcsv(dataframe=df, model="gpt-4-0125-preview")

## Load the extracted edges and drop rows with an empty node or edge
dfg1 = pd.read_csv(os.path.join(outputdirectory, "graph.csv"), sep="|")
dfg1.replace("", np.nan, inplace=True)
dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
dfg1['count'] = 4 
## Increasing the weight of the relation to 4. 
## We will assign the weight of 1 when later the contextual proximity will be calculated.  
print(dfg1.shape)
dfg1.head()

(5, 5)


Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,crypto assets,transaction,Crypto assets are relevant at the time of the ...,53bc890b95f44a468fd4092745665c70,4
1,updated arrangement,counterparty,The updated arrangement is made with the same ...,45ffdb5f262542c9ae95b8d8e071b5b6,4
2,"coinbase global, inc.",blockchain rewards,"Coinbase Global, Inc. records blockchain rewar...",56d7a6832cb748ffaf5cad1b01169bb9,4
3,digital wallet,company,is controlled by the Company,6415bc1cdcfa4df88541d06ca2285198,4
4,performance obligation,contracts,One performance obligation is typically provid...,0f7d1c65605a4d57ae4792f63c906205,4


## Calculating contextual proximity

In [8]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link every pair of distinct terms that occur in the same text chunk.

    :param df: Edge table with the columns ``chunk_id``, ``node_1`` and ``node_2``.
    :return: One row per ordered term pair with the comma-joined chunk ids it was
        seen in, the number of co-occurrences (``count``) and the constant edge
        label "contextual proximity".
    """
    ## One row per (chunk, term): stack node_1 and node_2 into a single "node" column
    terms = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])

    ## Joining the table with itself on chunk_id pairs up all terms sharing a chunk
    pairs = terms.merge(terms, on="chunk_id", suffixes=("_1", "_2"))

    ## A term paired with itself carries no information
    different_terms = pairs["node_1"] != pairs["node_2"]
    pairs = pairs[different_terms].reset_index(drop=True)

    ## Collapse repeated pairs, remembering where and how often they occurred
    proximity = (
        pairs.groupby(["node_1", "node_2"])
        .agg(chunk_id=("chunk_id", ",".join), count=("chunk_id", "count"))
        .reset_index()
    )

    proximity = proximity.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    return proximity.assign(edge="contextual proximity")

def dropSingleEdges(dfg2: pd.DataFrame) -> pd.DataFrame:
    """Return only the rows whose ``count`` is different from 1."""
    seen_more_than_once = dfg2["count"].ne(1)
    return dfg2[seen_more_than_once]

## Co-occurrence edges, keeping only the pairs whose count is not 1
dfg2 = dropSingleEdges(contextual_proximity(dfg1))

dfg2

## Stack extracted and co-occurrence edges, then merge rows describing the same node pair:
## chunk ids and edge labels are comma-joined, counts are added up
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,count,edge
3,beliefs and opinions,statements,"685bcea8f090463682d7b4fef816d0d3,685bcea8f0904...",4,contextual proximity
12,"coinbase global, inc.",notes to condensed consolidated financial stat...,"26a28882728541699c444e7ab8d1cbfd,4650f0ea382f4...",2,contextual proximity
34,net revenue,transaction revenue,"6c11c774004842c3892f9cf89ce6868f,6c11c77400484...",4,contextual proximity
37,notes to condensed consolidated financial stat...,"coinbase global, inc.","26a28882728541699c444e7ab8d1cbfd,4650f0ea382f4...",2,contextual proximity
44,statements,beliefs and opinions,"685bcea8f090463682d7b4fef816d0d3,685bcea8f0904...",4,contextual proximity
48,transaction revenue,net revenue,"6c11c774004842c3892f9cf89ce6868f,6c11c77400484...",4,contextual proximity


## Calculate the NetworkX Graph

In [10]:
## Every distinct term that appears on either side of an edge becomes a node
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()

import networkx as nx
G = nx.Graph()

## Add nodes to the graph
for node in nodes:
    G.add_node(
        str(node)
    )

## Add edges to the graph; the summed count is scaled down by 4 to get the edge weight
for index, row in dfg.iterrows():
    G.add_edge(
        str(row["node_1"]),
        str(row["node_2"]),
        title=row["edge"],
        weight=row['count']/4
    )

## Girvan-Newman yields successively finer community splits; use the second level
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
## A very small graph may offer only one level, so fall back to it instead of
## letting the second next() raise StopIteration
next_level_communities = next(communities_generator, top_level_communities)
communities = sorted(map(sorted, next_level_communities))

import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Give each community one colour and list its members.

    :param communities: Sequence of node collections, one per community.
    :return: DataFrame with one row per node holding the columns ``node``, ``color``
        (hex string shared by the whole community) and ``group`` (1-based community number).
    """
    ## One colour per community, handed out in random order
    shades = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shades)

    records = []
    for group_number, members in enumerate(communities, start=1):
        shade = shades.pop()
        records.extend(
            {"node": member, "color": shade, "group": group_number}
            for member in members
        )
    return pd.DataFrame(records)


colors = colors2Community(communities)

## Copy community number and colour onto each graph node; the node size is its degree
for index, row in colors.iterrows():
    G.nodes[row['node']].update(
        group=row['group'],
        color=row['color'],
        size=G.degree[row['node']],
    )

In [1]:
from pyvis.network import Network

# Despite its name this is the path of the HTML file that will be written, not a directory.
graph_output_directory = "/workspaces/knowledge_graph/docs/index.html"

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

# G is the NetworkX graph built in an earlier cell; it must be defined in the running
# session, otherwise this line fails with a NameError.
net.from_nx(G)
# NOTE(review): repulsion, force_atlas_2based and barnes_hut each configure the physics
# layout; presumably only the last call takes effect. Confirm against the pyvis docs.
#the next line may be commented out
net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
#the next line may be commented out
net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
# Show only the "physics" section of the interactive settings panel.
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

NameError: name 'G' is not defined