## Setup

Run in terminal:
pip install "typing-extensions<4.6.0"
pip install "pillow<10.1.0,>=8.3.2"
pip install fastapi kaleido uvicorn
pip install langchain
pip install pypdf
pip install unstructured
pip install yachalk
pip install "unstructured[pdf]"
pip install openai
sudo apt update
sudo apt-get install libgl1-mesa-glx

pip install --upgrade jupyter ipywidgets

pip install --upgrade opencv-python-headless

export OPENAI_API_KEY="..."


In [1]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Input data directory
data_dir = "coin"
inputdirectory = Path(f"./data_input/{data_dir}")
## This is where the output csv files will be written
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")

## Dir PDF Loader (presumably picks up every PDF under `inputdirectory` -- confirm in langchain docs)
loader = PyPDFDirectoryLoader(inputdirectory)
## Alternative loaders, kept for reference: single file / generic directory
#loader = PyPDFLoader("./data_input/coin/coinbase.pdf")
#loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the documents into chunks targeting 1500 characters (measured with `len`)
## with a 150-character overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
## Preview one chunk as a sanity check. Chunk 3 is shown when it exists; for
## smaller inputs the last chunk is shown instead of raising IndexError, and
## nothing is printed when no chunk was produced at all.
if pages:
    print(pages[min(3, len(pages) - 1)].page_content)

Number of chunks =  485
02/11/2023, 20:58 coin-20230930
https://www.sec.gov/Archives/edgar/data/1679788/000167978823000113/coin-20230930.htm 3/167Table of Contents
TABLE OF CONTENTS
Page
Part I - Financial Information
Item 1. Financial Statements 6
Condensed Consolidated Balance Sheets 6
Condensed Consolidated Statements of Operations 7
Condensed Consolidated Statements of Comprehensive Loss 8
Condensed Consolidated Statements of Changes in Stockholders' Equity 9
Condensed Consolidated Statements of Cash Flows 11
Notes to Condensed Consolidated Financial Statements 13
Item 2. Management’s Discussion and Analysis of Financial Condition and Results of Operations 53
Item 3. Quantitative and Qualitative Disclosures About Market Risk 76
Item 4. Controls and Procedures 80
Part II - Other Information
Item 1. Legal Proceedings 81
Item 1A. Risk Factors 82
Item 2. Unregistered Sales of Equity Securities, Use of Proceeds, and Issuer Purchases of Equity
Securities 159
Item 3. Defaults Upon Senior 

## Create a dataframe of all the chunks

In [2]:
from helpers.df_helpers import documents2Dataframe
# Convert the list of chunks into a dataframe using the project helper.
# NOTE(review): the recorded output lists the columns text / source / page / chunk_id;
# confirm against helpers.df_helpers if the helper changes.
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(485, 4)


Unnamed: 0,text,source,page,chunk_id
0,"02/11/2023, 20:58 coin-20230930\nhttps://www.s...",data_input/coin/coinbase.pdf,0,83cb69f1f13743eba09a15a56d62d11d
1,of 1934 during the preceding 12 months (or for...,data_input/coin/coinbase.pdf,0,b21d91b93aff4054956c2b2884b94812
2,"02/11/2023, 20:58 coin-20230930\nhttps://www.s...",data_input/coin/coinbase.pdf,1,5acd2ecba30748f8a0983906bb2a2573
3,"02/11/2023, 20:58 coin-20230930\nhttps://www.s...",data_input/coin/coinbase.pdf,2,2c5aef0259384528989389d3d2ad63c2
4,"02/11/2023, 20:58 coin-20230930\nhttps://www.s...",data_input/coin/coinbase.pdf,3,eeed85595b9941bb8570b945c1645bf7


## Extract Concepts

In [3]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

In [6]:
# Dump the chunk dataframe as a pipe-separated CSV in the current working directory.
# NOTE(review): this writes ./df.csv rather than a file under `outputdirectory`, and the
# regenerate cell already saves the same frame as chunks.csv -- confirm this dump is still needed.
df.to_csv("df.csv", sep="|", index=False)

In [None]:
from openai import OpenAI

# Smoke test for the OpenAI setup: send a short canned conversation.
# The organization is taken from the OPENAI_ORG_ID environment variable instead of
# being hard-coded, so the cell is not tied to one account. When the variable is
# unset, None is passed and the client falls back to its own defaults.
# The API key is expected in OPENAI_API_KEY (see the Setup section).
client = OpenAI(
  organization=os.environ.get("OPENAI_ORG_ID"),
)

response = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who won the world series in 2020?"},
    {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
    {"role": "user", "content": "Where was it played?"}
  ]
)

If `regenerate` is set to True, the dataframes are regenerated and both of them are written to CSV files so that we don't have to compute them again.

        dfne = dataframe of edges

        df = dataframe of chunks


Otherwise, the edge dataframe is read back from the output directory.

In [4]:
## To regenerate the graph with LLM, set this to True
regenerate = True

if regenerate:
    ## Extract concept pairs from the chunks with the project helpers
    ## (presumably one LLM call per chunk -- see helpers.df_helpers).
    concepts_list = df2Graph(df, model='zephyr:latest')
    dfg1 = graph2Df(concepts_list)
    ## Create the output folder (including parents); a no-op when it already exists,
    ## so there is no separate exists() check that could race with the creation.
    outputdirectory.mkdir(parents=True, exist_ok=True)

    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    ## Reuse the edges written by a previous run
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

## Clean the edge list on a new frame rather than in place:
##  - empty strings count as missing values,
##  - rows lacking either node or the edge text are dropped,
##  - every remaining relation gets a count of 4.
## Increasing the weight of the relation to 4.
## We will assign the weight of 1 when later the contextual proximity will be calculated.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", 'edge'])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

[
   {
       "node_1": "Coinbase Global, Inc.",
       "node_2": "Securities Exchange Act of 1934",
       "edge": "Coinbase Global, Inc. Is required to file reports with the Securities and Exchange Commission (SEC) in compliance with Section 13 or 15(d) of the Securities Exchange Act of 1934 for the preceding 12 months."
   },
   {
       "node_1": "COIN",
       "node_2": "The Nasdaq Stock Market LLC",
       "edge": "COIN, the trading symbol for Coinbase Global, Inc.'s Class A common stock, is registered on The Nasdaq Stock Market LLC."
   },
   {
       "node_1": "Coinbase Global, Inc.",
       "node_2": "Class A common stock",
       "edge": "Coinbase Global, Inc. Has an outstanding class of common stock with a par value per share of $0.00001."
   },
   {
       "node_1": "Coinbase Global, Inc.",
       "node_2": "Delaware",
       "edge": "Coinbase Global, Inc. Is incorporated in Delaware with a Delaware Certificate of Incorporation numbered 46-4707224."
   },
   {
       "node_

KeyboardInterrupt: 

## Calculating contextual proximity

In [None]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Build co-occurrence edges between nodes that appear in the same chunk.

    Parameters
    ----------
    df : pd.DataFrame
        Edge list providing the columns ``node_1``, ``node_2`` and ``chunk_id``.
        The input frame is not modified.

    Returns
    -------
    pd.DataFrame
        One row per ordered node pair whose co-occurrence count is not 1, with
        the columns ``node_1``, ``node_2``, ``chunk_id`` (comma-joined chunk ids),
        ``count`` and ``edge`` (always ``"contextual proximity"``).
    """
    # Long format: one row per (chunk_id, node) occurrence.
    node_occurrences = df.melt(
        id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])

    # Joining the occurrences with themselves on chunk_id pairs up all nodes
    # that occur in the same text chunk.
    node_pairs = node_occurrences.merge(
        node_occurrences, on="chunk_id", suffixes=("_1", "_2")
    )

    # Discard pairs that link a node to itself.
    is_self_loop = node_pairs["node_1"] == node_pairs["node_2"]
    node_pairs = node_pairs[~is_self_loop].reset_index(drop=True)

    # For every ordered pair: concatenate the chunk ids and count the rows.
    pair_stats = (
        node_pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    pair_stats.columns = ["node_1", "node_2", "chunk_id", "count"]

    # Empty strings are treated as missing; pairs without both nodes are removed.
    pair_stats = pair_stats.replace("", np.nan).dropna(subset=["node_1", "node_2"])

    # Pairs counted exactly once are dropped; the rest are labelled.
    repeated_pairs = pair_stats[pair_stats["count"] != 1]
    return repeated_pairs.assign(edge="contextual proximity")


# Derive the "contextual proximity" edges from the LLM edge list and preview the last rows.
dfg2 = contextual_proximity(dfg1)
dfg2.tail()

### Merge both the dataframes

In [None]:
## Stack the LLM edges on top of the proximity edges, then collapse rows that share
## the same (node_1, node_2) pair: chunk ids and edge labels are comma-joined and
## the counts are summed.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ",".join, "count": "sum"})
    .reset_index()
)
dfg

## Calculate the NetworkX Graph

In [None]:
# Collect every distinct node label that appears at either end of an edge.
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

In [None]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph (labels are stored as strings)
G.add_nodes_from(str(node) for node in nodes)

## Add edges to the graph
## G is undirected, while dfg can contain both an (a, b) and a (b, a) row because
## the proximity self-join emits each pair in both orders. Re-adding an existing
## edge replaces its attributes, so a proximity-only mirror row would wipe out the
## title and weight of an LLM-derived relation. Keep the heavier row instead.
for _, row in dfg.iterrows():
    source, target = str(row["node_1"]), str(row["node_2"])
    ## LLM relations were given a count of 4, so dividing by 4 gives them unit weight
    weight = row["count"] / 4
    if G.has_edge(source, target) and G[source][target]["weight"] >= weight:
        continue
    G.add_edge(
        source,
        target,
        title=row["edge"],
        weight=weight,
    )

### Calculate communities for coloring the nodes

In [None]:
## Girvan-Newman yields one partition per level. The first level is kept for
## reference; the second level is the one used for colouring.
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
## Sort the members of each community, then the communities themselves
communities = sorted(sorted(members) for members in next_level_communities)
print("Number of Communities = ", len(communities))
print(communities)

### Create a dataframe for community colors

In [None]:
import seaborn as sns
## Name of the seaborn palette the community colors are drawn from
palette = "hls"


def colors2Community(communities) -> pd.DataFrame:
    """Assign one color and one group number to every community.

    Parameters
    ----------
    communities : sequence of iterables
        Each item holds the nodes of one community.

    Returns
    -------
    pd.DataFrame
        One row per node with the columns ``node``, ``color`` (hex string)
        and ``group`` (1-based position of the community in ``communities``).
    """
    ## One color per community, taken from the module-level `palette` and shuffled
    hex_colors = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(hex_colors)
    ## Colors are handed out from the end of the shuffled list, one per community
    records = [
        {"node": member, "color": shade, "group": group_id}
        for group_id, (members, shade) in enumerate(
            zip(communities, reversed(hex_colors)), start=1
        )
        for member in members
    ]
    return pd.DataFrame(records)


# Build the node -> color/group table for the detected communities and display it.
colors = colors2Community(communities)
colors

### Add colors to the graph

In [None]:
## Copy each node's community group and color onto the graph; the node size is its degree.
for _, entry in colors.iterrows():
    node_id = entry['node']
    G.nodes[node_id].update(
        group=entry['group'],
        color=entry['color'],
        size=G.degree[node_id],
    )

In [None]:
from pyvis.network import Network

## Path of the HTML file the interactive graph is written to
graph_output_directory = "./docs/index.html"

## Make sure the target folder exists before pyvis writes the file, in the same way
## the CSV output folder is created earlier in the notebook.
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Import the NetworkX graph together with its node and edge attributes
net.from_nx(G)
## Layout: ForceAtlas2-based physics; the alternatives are kept commented for tuning
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
## Expose only the physics controls in the generated page
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)