In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sycamore.data import Document, HierarchicalDocument
from sycamore.llms import OpenAI, OpenAIModels
from sycamore.connectors.file.file_scan import JsonManifestMetadataProvider
from sycamore.transforms.partition import ArynPartitioner
from sycamore.utils.time_trace import timetrace

import sycamore
import pickle
import json
import sys
import pandas as pd
import csv
import os
from pathlib import Path

from sycamore.utils.cache import DiskCache
ctx = sycamore.init() 
llm = OpenAI(OpenAIModels.GPT_4O, cache=DiskCache("/home/eric/sycamore-ritam/ritam-scripts/llm_cache"))



In [3]:
def unpickle_doc(pdoc: Document) -> list[Document]:
    doc = pickle.loads(pdoc.binary_representation)
    return [doc]

path = Path('ntsb_parsed_1').resolve()
pickled_docset = ctx.read.binary(str(path), binary_format="pickle")
ds = pickled_docset.flat_map(unpickle_doc)

In [4]:
from sycamore.transforms.extract_document_structure import StructureBySection, StructureByDocument
from sycamore.transforms.extract_graph_entities import EntityExtractor
from sycamore.transforms.extract_graph_relationships import RelationshipExtractor
from pydantic import BaseModel, Field
import boto3


class Report(BaseModel):
    ID: str = Field(description="This ID can be found under either Accident Number, Incident Number, or Occurance Number")

class Aircraft(BaseModel):
    registration: str
    make: str
    model: str

class INVOLVED_AIRCRAFT(BaseModel):
    start: Report
    end: Aircraft

ENTITY_DEFAULT_PROMPT = """
    -Instructions-
    You are a information extraction system.

    You will be given a sequence of data in different formats(text, table, Section-header) in order.
    Your job is to extract entities from the text input that match the entity schemas provided. Each entity
    and property extracted should directly reference part of the text input provided.

    """

RELATIONSHIP_DEFAULT_PROMPT = """
    -Goal-
    You are a helpful information extraction system.

    You will be given a sequence of data in different formats(text, table, Section-header) in order.
    Your job is to extract relationships that map between entities that have already been extracted from this text.

    """


ds_resolved = (
    ds.extract_document_structure(StructureBySection)
    .extract_graph_entities([EntityExtractor(llm=llm, entities=[Report, Aircraft], prompt=ENTITY_DEFAULT_PROMPT)])
    .extract_graph_relationships([RelationshipExtractor(llm=llm, relationships=[INVOLVED_AIRCRAFT], prompt=RELATIONSHIP_DEFAULT_PROMPT)])
    .materialize(path="/tmp/extract_graph", source_mode=sycamore.MATERIALIZE_USE_STORED)
    .resolve_graph_entities(resolvers=[], resolve_duplicates=False)
    .materialize(path="/tmp/resolve_entities")
    #.explode()
)

docs = ds_resolved.take_all()

#URI = "bolt://localhost:11001"
#AUTH = ("neo4j", "koala-stereo-comedy-spray-figure-6974")


# URI = 'neo4j+s://adc13803.databases.neo4j.io'
# AUTH = ('neo4j', 'bbOoM_jRSl9bfiN5EfRTam4QeJFTAWU34Jvv-6soQfM')
# DATABASE = "neo4j"
# IMPORT_DIR = "/home/ritam/neo4j/import"
# s3_session = boto3.session.Session()

# ds.write.neo4j(uri=URI,auth=AUTH,database=DATABASE,import_dir=IMPORT_DIR, use_auradb=True, s3_session=s3_session)

2024-09-08 15:58:48,233	INFO worker.py:1740 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m
2024-09-08 15:58:49,866	INFO streaming_executor.py:112 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-08_15-58-44_811123_1962887/logs/ray-data
2024-09-08 15:58:49,867	INFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> TaskPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(unpickle_doc)->MapBatches(extract)->MapBatches(extract)->MapBatches(extract)->MapBatches(materialize)->MapBatches(_aggregate_section_nodes)]


- ReadBinary->SplitBlocks(32) 1:   0%|                                                               | 0/1 [00…

- Map(BinaryScan._to_document)->MapBatches(unpickle_doc)->MapBatches(extract)->MapBatches(extract)->MapBatches…

Running 0:   0%|                                                                                     | 0/1 [00…

(Map(BinaryScan._to_document)->MapBatches(unpickle_doc)->MapBatches(extract)->MapBatches(extract)->MapBatches(extract)->MapBatches(materialize)->MapBatches(_aggregate_section_nodes) pid=1963156) Unrecognized extenstion pickle; using application/pickle
I0000 00:00:1725836336.470664 1962887 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache
2024-09-08 15:58:56,518	INFO dataset.py:2370 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2024-09-08 15:58:56,525	INFO streaming_executor.py:112 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-08_15-58-44_811123_1962887/logs/ray-data
2024-09-08 15:58:56,526	INFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input] -> AllToAllOperator[Aggregate] -> LimitOperato

- Aggregate 1:   0%|                                                                                | 0/32 [00…

Shuffle Map 2:   0%|                                                                                | 0/32 [00…

Shuffle Reduce 3:   0%|                                                                             | 0/32 [00…

- limit=1 4:   0%|                                                                                  | 0/32 [00…

Running 0:   0%|                                                                                    | 0/32 [00…

(map pid=1963152) ERROR:root:ERIC CHECK IF BAD
(map pid=1963152) ERROR:root:ERIC CHECK IF BAD
(map pid=1963152) ERROR:root:ERIC CHECK IF BAD
(map pid=1963152) ERROR:root:ERIC CHECK IF BAD
(map pid=1963152) ERROR:root:ERIC CHECK IF BAD
(map pid=1963152) ERROR:root:ERIC CHECK IF BAD
(map pid=1963152) ERROR:root:ERIC CHECK IF BAD
(map pid=1963152) ERROR:root:ERIC CHECK IF BAD
(map pid=1963152) ERROR:root:ERIC CHECK IF BAD
(map pid=1963152) ERROR:root:ERIC CHECK IF BAD
(map pid=1963152) ERROR:root:ERIC CHECK IF BAD
2024-09-08 15:58:56,786	INFO streaming_executor.py:112 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-08_15-58-44_811123_1962887/logs/ray-data
2024-09-08 15:58:56,790	INFO streaming_executor.py:113 -- Execution plan of Dataset: InputDataBuffer[Input], InputDataBuffer[Input] -> UnionOperator[UnionOperator(Input, Input)] -> TaskPoolMapOperator[MapBatches(_clean_temp_nodes)->MapBatches(materialize)]
(map pid=1963152) ERROR:root:ERIC CHECK IF BAD

- UnionOperator(Input, Input) 1:   0%|                                                              | 0/33 [00…

- MapBatches(_clean_temp_nodes)->MapBatches(materialize) 2:   0%|                                   | 0/33 [00…




Running 0:   0%|                                                                                    | 0/33 [00…

(map pid=1963152) ERROR:root:ERIC CHECK IF BAD
(map pid=1963152) ERROR:root:ERIC CHECK IF BAD
(map pid=1963152) ERROR:root:ERIC CHECK IF BAD
(map pid=1963152) ERROR:root:ERIC CHECK IF BAD
(map pid=1963152) ERROR:root:ERIC CHECK IF BAD
(map pid=1963152) ERROR:root:ERIC CHECK IF BAD
(reduce pid=1963152) ERROR:root:ERIC CHECK IF BAD
(reduce pid=1963152) ERROR:root:ERIC CHECK IF BAD
(reduce pid=1963152) ERROR:root:ERIC CHECK IF BAD
(reduce pid=1963152) ERROR:root:ERIC CHECK IF BAD
(reduce pid=1963152) ERROR:root:ERIC CHECK IF BAD
(reduce pid=1963152) ERROR:root:ERIC CHECK IF BAD
(reduce pid=1963152) ERROR:root:ERIC CHECK IF BAD
(reduce pid=1963152) ERROR:root:ERIC CHECK IF BAD
(reduce pid=1963152) ERROR:root:ERIC CHECK IF BAD
(reduce pid=1963152) ERROR:root:ERIC CHECK IF BAD
(reduce pid=1963152) ERROR:root:ERIC CHECK IF BAD
(reduce pid=1963152) ERROR:root:ERIC CHECK IF BAD
(reduce pid=1963152) ERROR:root:ERIC CHECK IF BAD
(reduce pid=1963152) ERROR:root:ERIC CHECK IF BAD
(reduce pid=196315

In [5]:
print(docs[0])
print("ERIC: WORKED TO HERE AND PASSED ALL THE CHECK IF BAD CHECKS")

{
  "doc_id": "ce46b2f0-664f-11ef-aab9-0698d584093b",
  "lineage_id": "27dd396b-4bec-4c56-859f-a08ec9bb31bf",
  "type": "pdf",
  "text_representation": null,
  "binary_representation": "<3281439 bytes>",
  "children": [
    "{\n  \"doc_id\": \"edac3606-5960-4c47-8411-c33100ea31ff\",\n  \"lineage_id\": \"f8b2aabe-9986-41a1-9f7b-b768851857c3\",\n  \"type\": \"Section-header\",\n  \"text_representation\": \"Front Page...\",\n  \"binary_representation\": \"<10 bytes>\",\n  \"children\": [\n    \"{\\n  \\\"doc_id\\\": \\\"076164b3-86d2-4a61-afee-a15582d4704b\\\",\\n  \\\"lineage_id\\\": \\\"e0d71820-ed3a-4d4f-bd1e-c9503f3691cb\\\",\\n  \\\"type\\\": \\\"Image\\\",\\n  \\\"text_representation\\\": \\\"AVIATION HIGHWAY MARINE PIPELINE\\\\n\\\\n...\\\",\\n  \\\"binary_representation\\\": \\\"<1340160 bytes>\\\",\\n  \\\"children\\\": [],\\n  \\\"embedding\\\": null,\\n  \\\"shingles\\\": null,\\n  \\\"parent_id\\\": null,\\n  \\\"bbox\\\": \\\"BoundingBox(0.07904860552619486, 0.001039825352755

In [7]:
print(sycamore.init.read.materialize("/tmp/resolve_entities").take(1))

AttributeError: 'function' object has no attribute 'read'

In [7]:
class Report(BaseModel):
    Title: str

class Drug(BaseModel):
    name:
    chemical_compisition:
    price:
    time_to_develop:


class Aircraft(BaseModel):
    registration: str
    make: str
    model: str

class INVOLVED_AIRCRAFT(BaseModel):
    start: Report
    end: Aircraft

SyntaxError: invalid syntax (1744755241.py, line 5)

In [5]:
for doc in docs:
    if "nodes" not in doc["properties"]:
        continue
    print(json.dumps(doc["properties"]["nodes"], indent=2))
    # for label, hashes in doc["properties"]["nodes"].items():
    #     print(label)
    #     for hash, node in hashes.items():
    #         uuid = set()
    #         rels = set()
    #         for rel_uuid, rel in node["relationships"].items():
    #             assert rel_uuid not in uuid
    #             assert json.dumps(rel) not in rels
    #             uuid.add(json.dumps(rel_uuid))
    #             rels.add(json.dumps(rel))
    #             print(rel_uuid)

In [8]:
for doc in docs:
    for section in doc.children:
        for label, hashes in section["properties"]["nodes"].items():
            for hash, node in hashes.items():
                uuid = set()
                rels = set()
                for rel_uuid, rel in node["relationships"].items():
                    assert rel_uuid not in uuid
                    assert json.dumps(rel) not in rels
                    uuid.add(json.dumps(rel_uuid))
                    rels.add(json.dumps(rel))
                    print(rel_uuid)

KeyError: 'nodes'

In [None]:
for doc in docs:
    for section in doc.children:
        if section.type == "extracted":
            print(json.dumps(section.data["relationships"], indent=2))

In [5]:
for doc in docs:
    for section in doc.children:
        if section.type == "extracted":
            print(json.dumps(section.data["relationships"], indent=2))
        #print(section)
        # if "EXTRACTED_NODES" in section["properties"]:
        #     for child in section.children:
        #         print(child)
        #         for rel_uuid, rel in child["relationships"].items():
        #             print(rel_uuid)
        #             print(rel)
        # if "nodes" not in section["properties"]:
        #     continue
        # for label, hashes in section["properties"]["nodes"].items():
        #     for hash, node in hashes.items():
        #         for rel_uuid, rel in node["relationships"].items():
        #             print(rel_uuid)
                    # print(rel)
    #for section in doc.children:
    #print(json.dumps(section["properties"]["nodes"], indent=2))

NameError: name 'docs' is not defined