In [1]:
import json

## Load and Reformat ACI318-19 data
* This data will be used to create our knowledge graph

In [2]:
# Load the scraped ACI 318-19 JSON. The encoding is pinned to UTF-8 so the read
# does not depend on the platform's locale default encoding.
with open("data/ACI318-19_json/ACI318-19_complete_v2.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [3]:
"""
json schema consists of chapter keys containing a 
- chapter entry
- list of section entries
- list of item entries
"""
# list the chapter keys, then dump one full chapter to see the nested layout
for chapter_key in data:
    print(chapter_key)
print(data["ch10"])

ch10
ch26
ch27
ch2
ch11
ch9
ch20
ch16
ch5
ch4
ch17
ch21
ch8
ch22
ch18
ch7
ch14
ch15
ch6
ch19
ch23
ch1
ch12
ch24
ch25
ch13
{'chapter': {'content': 'Chapter 10 Columns', 'id': '10', 'type': 'chapter'}, 'sections': [{'content': '10.1 Scope', 'id': '10.1', 'type': 'section'}, {'content': '10.2 General', 'id': '10.2', 'type': 'section'}, {'content': '10.2.1 Materials', 'id': '10.2.1', 'type': 'section'}, {'content': '10.2.2 Connection to Other Members', 'id': '10.2.2', 'type': 'section'}, {'content': '10.3 Design Limits', 'id': '10.3', 'type': 'section'}, {'content': '10.3.1 Dimensional Limits', 'id': '10.3.1', 'type': 'section'}, {'content': '10.4 Required Strength', 'id': '10.4', 'type': 'section'}, {'content': '10.4.1 General', 'id': '10.4.1', 'type': 'section'}, {'content': '10.4.2 Factored Axial Force and Moment', 'id': '10.4.2', 'type': 'section'}, {'content': '10.5 Design Strength', 'id': '10.5', 'type': 'section'}, {'content': '10.5.1 General', 'id': '10.5.1', 'type': 'section'}, {'

In [4]:
# Item entry: show the second item of chapter 12 as a sample of the item schema.
data["ch12"]["items"][1]

# Besides type/id/content, an item lists the IDs it cites in
# "item_references" and "chapter_references".

{'type': 'item',
 'id': '12.1.2',
 'content': 'Diaphragms in structures assigned to Seismic Design Category D, E, or F shall also satisfy requirements of 18.12 .',
 'item_references': ['18.12'],
 'chapter_references': []}

In [5]:
# Section entry: first section of chapter 12 (keys: content, id, type).
print(data["ch12"]["sections"][0])

{'content': '12.1 Scope', 'id': '12.1', 'type': 'section'}


In [6]:
# Chapter entry: the single "chapter" record of chapter 12 (keys: content, id, type).
print(data["ch12"]["chapter"])

{'content': 'Chapter 12 Diaphragms', 'id': '12', 'type': 'chapter'}


In [7]:
# Flatten the per-chapter layout into three lists keyed by node type.
nodes = {
    "chapters": [entry["chapter"] for entry in data.values()],
    "sections": [sec for entry in data.values() for sec in entry["sections"]],
    "items": [itm for entry in data.values() for itm in entry["items"]],
}

print(nodes)

{'chapters': [{'content': 'Chapter 10 Columns', 'id': '10', 'type': 'chapter'}, {'content': 'Chapter 26 Construction Documents and Inspection', 'id': '26', 'type': 'chapter'}, {'content': 'Chapter 27 Strength Evaluation of Existing Structures', 'id': '27', 'type': 'chapter'}, {'content': 'Chapter 2 Notation and Terminology', 'id': '2', 'type': 'chapter'}, {'content': 'Chapter 11 Walls', 'id': '11', 'type': 'chapter'}, {'content': 'Chapter 9 Beams', 'id': '9', 'type': 'chapter'}, {'content': 'Chapter 20 Steel Reinforcement Properties, Durability, & Embedments', 'id': '20', 'type': 'chapter'}, {'content': 'Chapter 16 Connections Between Members', 'id': '16', 'type': 'chapter'}, {'content': 'Chapter 5 Loads', 'id': '5', 'type': 'chapter'}, {'content': 'Chapter 4 Structural System Requirements', 'id': '4', 'type': 'chapter'}, {'content': 'Chapter 17 Anchoring to Concrete', 'id': '17', 'type': 'chapter'}, {'content': 'Chapter 21 Strength Reduction Factors', 'id': '21', 'type': 'chapter'}, {

In [8]:
# Quick sanity check: consolidation must keep exactly one entry per chapter.
original_chapter_count = len(data)
reformatted_chapter_count = len(nodes["chapters"])
print(f"Original Number of Chapters: {original_chapter_count}")
print(f"Number of Chapters After Reformat: {reformatted_chapter_count}")
assert original_chapter_count == reformatted_chapter_count

Original Number of Chapters: 26
Number of Chapters After Reformat: 26


## Knowledge Graph
### To Do


#### Input Nodes + Relationships into Neo4j
After nodes are validated we will upload the nodes into the Neo4j database using the Cypher query language. 

### Knowledge Graph Diagram
<img src="images/graph_repr_small.png" alt="Knowledge Graph Diagram" />





## Validate Nodes
As shown above, each item is a part of a section, which can itself be a part of other sections, which is then linked to a chapter and finally to the ACI318 document (item 1.2.1 -> section 1.2 -> chapter 1 -> document). We must ensure that the required section nodes exist so that each item can traverse upwards to the document; if the necessary section nodes do not exist in our scraped JSON, we must create them so they can be loaded into the Neo4j database.

This is crucial because items can reference other items, sections, or even chapters. If an item references a section but the section does not exist in our knowledge graph, we will not be able to traverse this path to that section's items during inference.

Therefore, in this section we will:
* check if upstream nodes exist for each item
* check if nodes that items reference exist
* add missing nodes

In [9]:
# Collect the ID of every item node and report basic counts.
item_ids = [entry["id"] for entry in nodes['items']]
num_items = len(item_ids)
items_per_chapter = round(num_items / len(nodes['chapters']), 1)
print(f"Number item IDs: {num_items}")
print(f"Average number item IDs per Chapter: {items_per_chapter}")
print(item_ids[:10])

Number item IDs: 1828
Average number item IDs per Chapter: 70.3
['10.1.1', '10.1.2', '10.2.1.1', '10.2.1.2', '10.2.1.3', '10.2.2.1', '10.2.2.2', '10.2.2.3', '10.3.1.1', '10.3.1.2']


In [10]:
## Basic data exploration: how many references does each item carry?
item_refs = [entry["item_references"] for entry in nodes['items']]

# flatten the per-item lists into one list of referenced IDs
item_refs_flat = []
for ref_list in item_refs:
    item_refs_flat.extend(ref_list)
item_refs_lengths = list(map(len, item_refs))

print(f"item_refs: {item_refs[:10]}")
print(f"item_refs_flat: {item_refs_flat[:10]}\n")
num_item_refs = len(item_refs_flat)
print(f"Total Number of Item References: {num_item_refs}")
avg_refs_per_item = round(num_item_refs / num_items, 1)
print(f"Average number of Item References per Item: {avg_refs_per_item}")
print(f"Max number of Item References: {max(item_refs_lengths)}")

item_refs: [[], [], [], [], ['20.6'], [], ['16.2'], ['16.3'], [], []]
item_refs_flat: ['20.6', '16.2', '16.3', '1.5', '10.3.1.1', '10.3.1.4', '21.2', '22.4', '22.5', '22.7']

Total Number of Item References: 1729
Average number of Item References per Item: 0.9
Max number of Item References: 10


### Validate Upstream Nodes which have PART_OF relationship to Items

In [11]:
def extract_upstream_ids(id):
    """Return every ancestor ID of a dotted ID, nearest ancestor first.

    Each ancestor is obtained by dropping the last "."-separated component,
    e.g. "10.2.1.3" -> ["10.2.1", "10.2", "10"]. An ID without a "." has no
    ancestors and yields an empty list.
    """
    ancestors = []
    current = id
    while "." in current:
        # keep everything before the last dot
        current = current.rsplit(".", 1)[0]
        ancestors.append(current)
    return ancestors
    
def find_missing_ids(req_upstream_ids, existing_upstream_ids):
    """Return the required IDs that do not appear among the existing IDs.

    Args:
        req_upstream_ids: iterable of IDs (hashable, e.g. strings) that must exist.
        existing_upstream_ids: iterable of IDs (hashable) that already exist.

    Returns:
        A list of the IDs from ``req_upstream_ids`` that are absent from
        ``existing_upstream_ids``, in their original order (duplicates kept).
    """
    # Build the lookup once; a set makes each membership test constant-time
    # instead of scanning the whole existing list for every required ID.
    existing = set(existing_upstream_ids)
    return [req_id for req_id in req_upstream_ids if req_id not in existing]
    
    

In [12]:
# Gather every ancestor (section/chapter) ID that some item needs to exist.
req_upstream_ids = []
for item in nodes["items"]:
    req_upstream_ids.extend(extract_upstream_ids(item["id"]))

# de-duplicate the required IDs
req_upstream_ids = list(set(req_upstream_ids))
print(f"{req_upstream_ids[:10]=}")
print(f"Number required upstream ids: {len(req_upstream_ids)}")

# IDs already present in the scraped data: sections, then chapters, then items
section_ids = [section["id"] for section in nodes["sections"]]
chapter_ids = [chapter["id"] for chapter in nodes["chapters"]]
existing_ids = section_ids + chapter_ids + [item["id"] for item in nodes["items"]]
print(len(section_ids))
print(len(chapter_ids))
print(f"{existing_ids[:10]=}")
print(f"Number existing ids: {len(existing_ids)}")


req_upstream_ids[:10]=['8.5.2', '10.5.4', '18.4.4', '22.6', '25.9.1', '23.6', '24.2.4', '17.7', '26.4', '8.7']
Number required upstream ids: 797
714
26
existing_ids[:10]=['10.1', '10.2', '10.2.1', '10.2.2', '10.3', '10.3.1', '10.4', '10.4.1', '10.4.2', '10.5']
Number existing ids: 2568


In [13]:
# Upstream (section/chapter) IDs that items require but that match no existing node ID.
missing_ids = find_missing_ids(req_upstream_ids, existing_ids)
print(f"{missing_ids=}")
print(len(missing_ids))

missing_ids=['14.1.4']
1


In [14]:
# Manually add the one section that the scrape missed.
new_node = {"id": "14.1.4",
            "type":"section",
            "content": "14.1.4 Plain Concrete in Structures Assigned to Seismic Design Category C, D, E or F"}
# Only append when no section with this ID exists yet, so re-running this cell
# cannot leave a duplicate section in nodes["sections"].
if all(section["id"] != new_node["id"] for section in nodes["sections"]):
    nodes["sections"].append(new_node)

In [15]:
# Rebuild the list of known IDs (sections, chapters, items - in that order) now that
# the missing section has been added, and confirm nothing is missing any more.
existing_ids = [
    node["id"]
    for group in ("sections", "chapters", "items")
    for node in nodes[group]
]

missing_ids = find_missing_ids(req_upstream_ids, existing_ids)
print(f"{missing_ids=}")
print(len(missing_ids))

missing_ids=[]
0


### Validate Reference Nodes which have REFERENCES relationship to items

In [16]:
# Collect every ID that any item cites, split by reference kind.
typ_refs = []
chapter_refs = []
for item in nodes["items"]:
    typ_refs.extend(item["item_references"])
    chapter_refs.extend(item["chapter_references"])

# de-duplicate both lists
typ_refs = list(set(typ_refs))
chapter_refs = list(set(chapter_refs))
print(f"{typ_refs[:10]=}")
print(f"{len(typ_refs)}")
print(f"{chapter_refs[:10]=}")
print(f"{len(chapter_refs)}")

# all referenced IDs that must resolve to an existing node
req_ref_ids = typ_refs + chapter_refs

typ_refs[:10]=['22.6.7.1', '8.8.1.6', '10.3.2.3', '20.5.1.3.3', '22.6', '18.9.2.2', '24.2.4', '17.7', '17.6.2.4.1', '26.4']
811
chapter_refs[:10]=['8', '10', '13', '9', '6', '25', '27', '26', '22', '5']
23


In [17]:
# Referenced IDs (item/section references plus chapter references) that match no existing node ID.
missing_ids = find_missing_ids(req_ref_ids, existing_ids)
print(f"{missing_ids=}")
print(len(missing_ids))

missing_ids=['10.3.2.3', '5.8.4.4.2', '1.73', '3.3.1', '1.65', '12.14', '1.15', '18.6.5.1', '3.25', '5.8.4.4.3', '4.2.3', '1.09', '5.8.4.5', '1.25', '2.4', '2.5', '3.5', '5.5', '1.17', '12.11', '3']
21


In [18]:
# After going through the missing IDs, most of them evidently reference other documents
# and do not apply. Two items are genuinely absent from the scrape, so add them by hand.
# Each manual item carries the same keys as a scraped item - including an empty
# "chapter_references" list - so code that reads item["chapter_references"] keeps working.
manual_items = [
    {"id": "18.6.5.1",
     "type": "item",
     "item_references": [],
     "chapter_references": [],
     "content": "The design shear force Ve shall be calculated from consideration of the forces on the portion of the beam between faces of the joints. It shall be assumed that moments of opposite sign corresponding to probable flexural strength, Mpr, act at the joint faces and that the beam is loaded with the factored gravity and vertical earthquake loads along its span."},
    {"id": "18.6.5.2",
     "type": "item",
     "item_references": ["18.6.4.1", "18.6.5.1"],
     "chapter_references": [],
     "content": "Transverse reinforcement over the lengths identified in 18.6.4.1 shall be designed to resist shear assuming Vc = 0 when both (a) and (b) occur: (a) The earthquake-induced shear force calculated in accordance with 18.6.5.1 represents at least one-half of the maximum required shear strength within those lengths.(b) The factored axial compressive force Pu including earthquake effects is less than Agfc'/20."},
]

known_item_ids = {item["id"] for item in nodes["items"]}
for new_node in manual_items:
    # skip IDs that are already present so re-running the cell does not append duplicates
    if new_node["id"] not in known_item_ids:
        nodes["items"].append(new_node)

# Drop the IDs that are now accounted for:
#   "18.6.5.1" - added just above
#   "3"        - Chapter 3 was purposely excluded from our data
# Filtering (instead of list.remove) does not raise when an ID is already gone,
# e.g. when this cell is executed a second time.
resolved_ids = {"18.6.5.1", "3"}
missing_ids = [ref_id for ref_id in missing_ids if ref_id not in resolved_ids]

In [19]:
# lets remove the references in the missing_ids list from our nodes. These references are erroneous and do not exist in our document
def remove_references(ref_to_remove,nodes: dict):
    """Strip the given reference IDs from every item's "item_references" list.

    Args:
        ref_to_remove: iterable of reference IDs to drop.
        nodes: dict with an "items" list; each item is a dict that may hold an
            "item_references" list of ID strings.

    Returns:
        The same ``nodes`` dict. Items are never added or removed; only their
        "item_references" lists are replaced by filtered copies. Items without
        an "item_references" key are left untouched.
    """
    unwanted = set(ref_to_remove)
    for item in nodes["items"]:
        refs = item.get("item_references")
        if refs:
            # Compare each reference ID individually; comparing the whole list
            # against a single ID never matches, so nothing would be removed.
            item["item_references"] = [ref for ref in refs if ref not in unwanted]
    return nodes


# The item count is printed before and after as a sanity check: removing
# references is meant to change reference lists, not the number of items.
print(f"Item Length Before remove false refs: {len(nodes['items'])}")
nodes = remove_references(missing_ids, nodes)
print(f"Item Length After remove false refs: {len(nodes['items'])}")

Item Length Before remove false refs: 1830
Item Length After remove false refs: 1830


In [20]:
# Persist the validated node lists so later notebooks can start from this file.
with open("data/ACI318-19_json/ACI318-19_nodes_format_validated.json","w") as out_file:
    out_file.write(json.dumps(nodes))

In [21]:
# Read the file back to confirm it was written and has the expected top-level keys.
with open("data/ACI318-19_json/ACI318-19_nodes_format_validated.json","r") as f:
    data = json.load(f)

print(data)
for k, v in data.items():
    print(k)

{'chapters': [{'content': 'Chapter 10 Columns', 'id': '10', 'type': 'chapter'}, {'content': 'Chapter 26 Construction Documents and Inspection', 'id': '26', 'type': 'chapter'}, {'content': 'Chapter 27 Strength Evaluation of Existing Structures', 'id': '27', 'type': 'chapter'}, {'content': 'Chapter 2 Notation and Terminology', 'id': '2', 'type': 'chapter'}, {'content': 'Chapter 11 Walls', 'id': '11', 'type': 'chapter'}, {'content': 'Chapter 9 Beams', 'id': '9', 'type': 'chapter'}, {'content': 'Chapter 20 Steel Reinforcement Properties, Durability, & Embedments', 'id': '20', 'type': 'chapter'}, {'content': 'Chapter 16 Connections Between Members', 'id': '16', 'type': 'chapter'}, {'content': 'Chapter 5 Loads', 'id': '5', 'type': 'chapter'}, {'content': 'Chapter 4 Structural System Requirements', 'id': '4', 'type': 'chapter'}, {'content': 'Chapter 17 Anchoring to Concrete', 'id': '17', 'type': 'chapter'}, {'content': 'Chapter 21 Strength Reduction Factors', 'id': '21', 'type': 'chapter'}, {

In [22]:
# Flat list of every node (chapters, then sections, then items), used to look nodes up by ID
nodes_flat = nodes["chapters"]+ nodes["sections"] + nodes["items"]
def locate_node_by_id(id: str, nodes_flat)-> dict:
    """Return the first node in ``nodes_flat`` whose "id" equals ``id``.

    Returns None when no node matches.
    """
    matches = (candidate for candidate in nodes_flat if candidate["id"] == id)
    return next(matches, None)


# Example lookup: display the item with ID 18.6.4.1.
locate_node_by_id("18.6.4.1", nodes_flat)

{'type': 'item',
 'id': '18.6.4.1',
 'content': 'Hoops shall be provided in the following regions of a beam : (a) Over a length equal to twice the beam depth measured from the face of the supporting column toward midspan, at both ends of the beam  (b) Over lengths equal to twice the beam depth on both sides of a section where flexural yielding is likely to occur as a result of lateral displacements beyond the elastic range of behavior.',
 'item_references': [],
 'chapter_references': []}

## Knowledge Graph Assembly
Data has been validated. Now it is time to add our nodes and relationships 

In [111]:
# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI

In [171]:
# Load from environment
from dotenv import load_dotenv
import os

# Read connection settings from .env (override=True is passed so values from the
# file replace variables already set in the process environment).
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE') or 'neo4j'
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# os.getenv returns None for an unset variable, and None + str raises a TypeError.
# Fall back to the public OpenAI API base URL, and drop a trailing slash so the
# joined endpoint never contains "//embeddings".
OPENAI_BASE_URL = (os.getenv('OPENAI_BASE_URL') or 'https://api.openai.com/v1').rstrip('/')
OPENAI_ENDPOINT = OPENAI_BASE_URL + '/embeddings'


In [173]:

# Open the Neo4j connection through LangChain's graph wrapper.
# NOTE(review): NEO4J_DATABASE is loaded above but not passed here - confirm the
# wrapper targets the intended database.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)


## Add Item Nodes

In [26]:
# Cypher query to add an item node.
# - MERGE looks the node up by `id`, so re-running does not create duplicates.
# - ON CREATE SET writes the properties only when the node is first created;
#   an Item that already exists keeps its stored properties.
# - The JSON "content" field is stored under the node property `text`.
merge_item_node_query = """
MERGE(i:Item {id: $itemParam.id})
    ON CREATE SET 
        i.type = $itemParam.type,
        i.text = $itemParam.content, 
        i.item_refs = $itemParam.item_references, 
        i.chapter_refs = $itemParam.chapter_references
RETURN i
"""

In [27]:
# Smoke test: merge only the first item for now and inspect the returned node.
kg.query(merge_item_node_query, 
         params={'itemParam': nodes["items"][0]})

[{'i': {'item_refs': [],
   'chapter_refs': [],
   'id': '10.1.1',
   'type': 'item',
   'content': 'This chapter shall apply to the design of nonprestressed and prestressed columns , including reinforced concrete  pedestals .'}}]

In [28]:
# Create a uniqueness constraint on Item.id so duplicate item nodes cannot exist.
# IF NOT EXISTS makes the statement safe to re-run.
kg.query("""
CREATE CONSTRAINT unique_item IF NOT EXISTS 
    FOR (i:Item) REQUIRE i.id IS UNIQUE
""")


[]

In [29]:
# List all indexes; the unique_item constraint should appear with its backing index.
kg.query("SHOW INDEXES")

[{'id': 0,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 1,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 2,
  'name': 'unique_item',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'RANGE',
  'entityType': 'NODE',
  'labelsOrTypes': ['Item'],
  'properties': ['id'],
  'indexProvider': 'range-1.0',
  'owningConstraint': 'unique_item',
  'lastRead': None,
  'readCount': None}]

In [30]:
# Merge every item into the graph (MERGE makes this safe to re-run).
for item in nodes["items"]:
    kg.query(merge_item_node_query,
             params={'itemParam': item})

# Report how many items were processed. len() is used instead of the loop index
# so the cell also works for an empty list, where the index would never be bound.
print(f"Total Item Nodes created: {len(nodes['items'])}")

Total Item Nodes created: 1830


In [31]:
# Count every node in the database (no label filter) to compare with the number of items merged.
kg.query("""
         MATCH (i)
         RETURN count(i) as nodeCount
         """)

[{'nodeCount': 1830}]

In [40]:
# # rename .content property to .text
# change_content_to_text_query = """
# MATCH (i:Item)
# SET i.text = i.content
# REMOVE i.content
# """

# kg.query(change_content_to_text_query)


[]

## Create Vector Index for Items

In [32]:
# Create the vector index over Item.textEmbedding (cosine similarity, 1536 dimensions).
# IF NOT EXISTS makes the statement safe to re-run.
# NOTE(review): 1536 must equal the length of the vectors the embedding model returns - confirm for the model in use.
kg.query("""
         CREATE VECTOR INDEX `ACI318-19_items` IF NOT EXISTS
          FOR (i:Item) ON (i.textEmbedding) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
         }}
""")


[]

In [33]:
# Name of the vector index created above; the similarity-search helper passes it to the index query.
VECTOR_INDEX_NAME = "ACI318-19_items"
kg.query("SHOW INDEXES")


[{'id': 4,
  'name': 'ACI318-19_items',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Item'],
  'properties': ['textEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': None},
 {'id': 0,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 1,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 2,
  'name': 'unique_item',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'RANGE',
  'entityT

In [34]:
# # remove textEMbedding bc we did it wrong first time (':
# remove_textEmbedding_query = """
# MATCH (i:Item)
# REMOVE i.textEmbedding
# """
# delete_nodes_query = """
# MATCH (i)
# DELETE i
# """
# kg.query(delete_nodes_query)
# kg.refresh_schema()
# print(kg.schema)

### Calculate Embedding Vector for Chunks and Populate Index

In [35]:
# Embed the `text` of every Item that has no textEmbedding yet and store the vector on the node.
# The IS NULL filter means items that already have an embedding are skipped on a re-run.
# The API key and endpoint are supplied as query parameters, not written into the Cypher text.
kg.query("""
    MATCH (i:Item) WHERE i.textEmbedding IS NULL
    WITH i, genai.vector.encode(
      i.text, 
      "OpenAI", 
      {
        token: $openAiApiKey, 
        endpoint: $openAiEndpoint
      }) AS vector
    CALL db.create.setNodeVectorProperty(i, "textEmbedding", vector)
    """, 
    params={"openAiApiKey":OPENAI_API_KEY, "openAiEndpoint": OPENAI_ENDPOINT} )

[]

In [41]:
# Re-read the database schema and print it to confirm the Item properties.
kg.refresh_schema()
print(kg.schema)

Node properties are the following:
Item {id: STRING, type: STRING, item_refs: LIST, chapter_refs: LIST, textEmbedding: LIST, text: STRING}
Relationship properties are the following:

The relationships are the following:



In [37]:
# Fetch one stored embedding as a spot check.
# The match is limited to Item nodes that actually have a textEmbedding, so the check
# cannot pick up a node without one (which would make len() fail on None).
return_1_query = """
MATCH (i:Item)
WHERE i.textEmbedding IS NOT NULL
RETURN i.textEmbedding AS embedding
LIMIT 1
"""
embed = kg.query(return_1_query)
if embed:
    print(embed[0]['embedding'])
    print(len(embed[0]['embedding']))
else:
    # nothing has been embedded yet; say so instead of raising an IndexError
    print("No Item node has a textEmbedding yet")

[0.001031701685860753, 0.018120311200618744, -0.020472725853323936, -0.02244875393807888, 0.03667077794671059, 0.011930101551115513, -0.030944330617785454, -0.028282741084694862, 0.0001744357286952436, -0.0007628543535247445, 0.004896382335573435, 0.0033185845240950584, -0.00321776675991714, 0.0038478777278214693, -0.0002289403200848028, -0.01922258548438549, 0.012306488119065762, 0.016789516434073448, 0.02558082528412342, -0.006378403399139643, 0.01970651187002659, -0.00718494551256299, -0.011614205315709114, -0.021413691341876984, -0.006217095069587231, 0.0017827940173447132, 0.036939624696969986, -0.030621713027358055, -0.007494119927287102, -0.005423995200544596, -0.0015551139367744327, -0.009140809997916222, 0.0033908372279256582, -0.033632803708314896, -0.01113700121641159, -0.033579032868146896, 0.007910833694040775, 0.0004112524329684675, 0.023631682619452477, 0.013180241920053959, 0.03492327034473419, -0.030164673924446106, -0.009772601537406445, -0.029868941754102707, 0.03503

## Use Similarity Search to Find Relevant Chunks

In [42]:
def neo4j_vector_search(question, top_k=10):
  """Search for similar nodes using the Neo4j vector index.

  Args:
    question: text that is embedded and used as the search vector.
    top_k: number of nearest nodes to request from the index. Defaults to 10,
      the value that was previously hard-coded.

  Returns:
    The result of kg.query for the search; the query returns `score` and
    `text` (the matched node's `text` property) for each hit.
  """
  vector_search_query = """
    WITH genai.vector.encode(
      $question, 
      "OpenAI", 
      {
        token: $openAiApiKey,
        endpoint: $openAiEndpoint
      }) AS question_embedding
    CALL db.index.vector.queryNodes($index_name, $top_k, question_embedding) yield node, score
    RETURN score, node.text AS text
  """

  # All values, including the API key, are passed as query parameters.
  search_params = {
    'question': question,
    'openAiApiKey': OPENAI_API_KEY,
    'openAiEndpoint': OPENAI_ENDPOINT,
    'index_name': VECTOR_INDEX_NAME,
    'top_k': top_k,
  }
  return kg.query(vector_search_query, params=search_params)

In [43]:
# Try the search helper with a sample question.
print(neo4j_vector_search("What is the number of rebar needed in concrete"))

[{'score': 0.9238264560699463, 'text': 'Nonprestressed cast-in-place concrete members shall have specified concrete cover for reinforcement at least that given in Table 20.5.1.3.1.  Table 20.5.1.3.1— Specified concrete cover for cast-in-place nonprestressed concrete members     Concrete exposure  Member  Reinforcement  Specified cover, in.      Cast against and permanently in contact with ground  All  All  3    Exposed to weather or in contact with ground  All  No. 6 through No. 18 bars  2    No. 5 bar, W31 or D31 wire, and smaller  1- 1 / 2    Not exposed to weather or in contact with ground  Slabs, joists, and walls  No. 14 and No. 18 bars  1- 1 / 2    No. 11 bar and smaller  3 / 4    Beams , columns , pedestals , and tension ties  Primary reinforcement , stirrups , ties, spirals, and hoops  1- 1 / 2    '}, {'score': 0.9211043119430542, 'text': 'If the concrete cover outside the confining transverse reinforcement required by 18.7.5.1 , 18.7.5.5 , and 18.7.5.6 exceeds 4 in., additiona

## Add Section Nodes

Cypher query to add a section node:

In [44]:
# Sample section entry (first section in the consolidated list).
nodes["sections"][0]

{'content': '10.1 Scope', 'id': '10.1', 'type': 'section'}

In [50]:
# Cypher query to add a section node: MERGE on `id`; `type` and `text` are
# written only when the node is first created (ON CREATE SET).
merge_section_node_query = """
MERGE(sec:Section {id: $secParam.id})
    ON CREATE SET 
        sec.type = $secParam.type,
        sec.text = $secParam.content
RETURN sec
"""

In [51]:
# Smoke test: merge only the first section for now and inspect the returned node.
kg.query(merge_section_node_query, 
         params={'secParam': nodes["sections"][0]})

[{'sec': {'id': '10.1', 'text': '10.1 Scope', 'type': 'section'}}]

In [52]:
# Create a uniqueness constraint on Section.id so duplicate section nodes cannot exist,
# then list the indexes to confirm it.
kg.query("""
CREATE CONSTRAINT unique_section IF NOT EXISTS 
    FOR (sec:Section) REQUIRE sec.id IS UNIQUE
""")

kg.query("SHOW INDEXES")

[{'id': 4,
  'name': 'ACI318-19_items',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Item'],
  'properties': ['textEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 3, 21, 0, 5, 28, 291000000, tzinfo=<UTC>),
  'readCount': 2},
 {'id': 0,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 3, 21, 0, 4, 58, 644000000, tzinfo=<UTC>),
  'readCount': 8},
 {'id': 1,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount

In [55]:
# Merge every section into the graph (MERGE makes this safe to re-run).
for section in nodes["sections"]:
    kg.query(merge_section_node_query,
             params={'secParam': section})

# Report how many sections were processed. len() is used instead of the loop index
# so the cell also works for an empty list, where the index would never be bound.
print(f"Total Section Nodes created: {len(nodes['sections'])}")

Total Section Nodes created: 715


Note: we will not create a vector index for the sections because they contain little text. The goal is to traverse from the references to sections and then down to the items, which contain the most information.

Vector representations can be added later if needed.

## Add Chapter Nodes

In [53]:
# Sample chapter entry (first chapter in the consolidated list).
nodes["chapters"][0]

{'content': 'Chapter 10 Columns', 'id': '10', 'type': 'chapter'}

In [56]:
# Cypher query to add a chapter node: MERGE on `id`; `type` and `text` are
# written only when the node is first created (ON CREATE SET).
merge_chapter_node_query = """
MERGE(ch:Chapter {id: $chParam.id})
    ON CREATE SET 
        ch.type = $chParam.type,
        ch.text = $chParam.content
RETURN ch
"""

In [57]:
# Smoke test: merge only the first chapter for now and inspect the returned node.
kg.query(merge_chapter_node_query, 
         params={'chParam': nodes["chapters"][0]})

[{'ch': {'id': '10', 'text': 'Chapter 10 Columns', 'type': 'chapter'}}]

In [60]:
# Merge every chapter into the graph (MERGE makes this safe to re-run).
for chapter in nodes["chapters"]:
    kg.query(merge_chapter_node_query,
             params={'chParam': chapter})

# Report how many chapters were processed. len() is used instead of the loop index
# so the cell also works for an empty list, where the index would never be bound.
print(f"Total Chapter Nodes created: {len(nodes['chapters'])}")

Total Chapter Nodes created: 26


Note: as with the sections, we will not create a vector index for the chapters because they contain little text. The goal is to traverse from the references to chapters or sections and then down to the items, which contain the most information.

Vector representations can be added later if needed.

## Add PART_OF Relationships

In [62]:
# Sample item (first in the list); the PART_OF linking is tried on this item first.
nodes["items"][0]

{'type': 'item',
 'id': '10.1.1',
 'content': 'This chapter shall apply to the design of nonprestressed and prestressed columns , including reinforced concrete  pedestals .',
 'item_references': [],
 'chapter_references': []}

In [85]:
# Delete all PART_OF relationships so they can be rebuilt from a clean state.
# Only relationships are deleted; the nodes themselves are left in place.
cypher = """
MATCH ()-[r:PART_OF]->()
DELETE r
"""
kg.query(cypher)

[]

In [91]:
# Link a node to its ancestors with PART_OF relationships.
# The query collects every node whose id is in $upstream_ids, orders them by id
# descending (so "10.1.1" comes before "10.1", which comes before "10"), and lets
# apoc.nodes.link chain consecutive nodes: item -> section -> ... -> chapter.
add_PART_OF_relationships_cypher = """
MATCH (n)
WHERE (n.id) IN $upstream_ids
WITH (n)
    ORDER BY n.id DESC
WITH collect(n) AS upstream_nodes_list
    CALL apoc.nodes.link(
    upstream_nodes_list,
    "PART_OF",
    {avoidDuplicates: true}
    )
RETURN size(upstream_nodes_list)
"""

# Try it on the first item only. The query is passed by its own name: the generic
# `cypher` variable holds the DELETE query from the previous cell, so passing that
# would remove PART_OF relationships instead of creating them.
node_id = nodes["items"][0]["id"]
kg.query(add_PART_OF_relationships_cypher, params={"upstream_ids": extract_upstream_ids(node_id)+ [node_id]})

[{'size(upstream_nodes_list)': 3}]

In [90]:
# check PART_OF relationships for node
def find_node_by_id(id):
    """Return the two-hop PART_OF chain that starts at the node with this id.

    Each result row holds the ids of the start node and of the next two nodes
    reached by following PART_OF relationships. Nodes without such a two-hop
    chain produce no rows.
    """
    two_hop_chain_cypher = """
    MATCH (n)-[:PART_OF]->(n1)-[:PART_OF]->(n2)
    WHERE (n.id) = $id
    RETURN n.id,n1.id,n2.id
    """
    query_params = {'id': id}
    return kg.query(two_hop_chain_cypher, params=query_params)

# Verify the chain for the sample item: one row with the ids of the item and the two nodes above it.
find_node_by_id("10.1.1")

[{'n.id': '10.1.1', 'n1.id': '10.1', 'n2.id': '10'}]

In [92]:
# The PART_OF chain was created correctly for item 10.1.1,
# so build the chain for every item: its ancestors plus the item itself.
for item in nodes["items"]:
    lineage_ids = extract_upstream_ids(item["id"]) + [item["id"]]
    kg.query(add_PART_OF_relationships_cypher, params={"upstream_ids": lineage_ids})

## Add REFERENCES Relationships

In [101]:
# Show an item that cites other items; it is used to try the REFERENCES query.
print(nodes["items"][12])

{'type': 'item', 'id': '10.3.1.5', 'content': 'If a reduced effective area is considered according to 10.3.1.1 through 10.3.1.4 , structural analysis and design of other parts of the structure that interact with the column shall be based on the actual cross section.', 'item_references': ['10.3.1.1', '10.3.1.4'], 'chapter_references': []}


In [104]:
# Pull the ID and the cited item IDs out of the sample item.
sample_item = nodes["items"][12]
item_refs = sample_item['item_references']
item_id = sample_item["id"]
print(item_id)
print(item_refs)

10.3.1.5
['10.3.1.1', '10.3.1.4']


In [106]:
# For one item, create a REFERENCES relationship to every node whose id is in $ref_IDlist.
# - MERGE on the item means the Item node is created if it does not exist yet.
# - MATCH (ref_node {id: refID}) carries no label, so the target may be a node of any label;
#   an ID that matches no node simply produces no relationship.
# - MERGE on the relationship keeps re-runs from adding duplicate relationships.
add_ref_cypher = """
UNWIND $ref_IDlist AS refID
MERGE (item:Item {id: $item_id})
WITH item, refID
MATCH (ref_node {id: refID})
MERGE (item)-[:REFERENCES]->(ref_node)
"""
kg.query(add_ref_cypher, params={"item_id": item_id, "ref_IDlist": item_refs})

[]

In [107]:
# Add REFERENCES relationships for every item node.
# Both reference lists are linked: "item_references" (items/sections) and
# "chapter_references" (chapters). The validation cells check both kinds of ID, and the
# query matches the target by id without a label, so chapter IDs resolve to Chapter nodes.
# .get() covers items that carry no "chapter_references" key.
# NOTE(review): confirm chapter-level REFERENCES edges are wanted in the graph.
for item in nodes["items"]:
    referenced_ids = item["item_references"] + item.get("chapter_references", [])
    kg.query(add_ref_cypher, params={"item_id": item["id"], "ref_IDlist": referenced_ids})


References were successfully added. However, some items reference themselves, as seen below. This is unhelpful for graph search and will slow down searches. Let's remove the self-reference relationships.

<img src="images/self_reference_example.png" alt="Self Reference Example" />

In [109]:
# Delete every REFERENCES relationship whose start Item and end Item have the same id
# (i.e. an item that references itself).
remove_self_reference_relationship_cypher = """
MATCH (i:Item) - [self_ref:REFERENCES] -> (ref_node:Item)
WHERE i.id = ref_node.id
DELETE self_ref
"""

kg.query(remove_self_reference_relationship_cypher)

[]

In [110]:
# Verification: look for any remaining self-referencing Item; an empty result means none are left.
check_remove_self_ref = """
MATCH (i:Item) - [self_ref:REFERENCES] -> (ref_node:Item)
WHERE i.id = ref_node.id
RETURN i
"""
kg.query(check_remove_self_ref)

[]

## This concludes the creation of the Neo4j knowledge graph for ACI318-19
See <i> query_knlowedge_graph.ipynb </i> for usage with langchain