This notebook visualizes a Sycamore query as pandas DataFrames. Given a query ID and the directory of pickled DocSets, it shows the state of the DocSet after each map function. Set `DISPLAY_ENTIRE_DF` to choose between viewing the entire DataFrame and a "summarized" (truncated) version.

In [7]:
import os
import pickle
import pandas as pd

In [8]:
# ID of the query whose traces should be shown; it is used as the name of a
# sub-directory of BASE_DIR.
QUERY_ID = "eb7c6d5b-9a77-49b1-abf6-afb4fe5cc41a"

# Trace root, resolved relative to the parent of the current working directory.
BASE_DIR = os.path.dirname(os.getcwd()) + "/lib/sycamore/sycamore/query/luna_traces"

# Document properties that are NOT copied into the DataFrame as extra columns.
BASE_PROPS = {
    "filename",
    "filetype",
    "page_number",
    "page_numbers",
    "links",
    "element_id",
    "parent_id",
    "_schema",
    "_schema_class",
    "entity",
}

# True: lift pandas' row/column display limits; False: use the default display.
DISPLAY_ENTIRE_DF = False

In [9]:
def _node_order(name):
    """Sort key that puts purely numeric node IDs in numeric order.

    Plain string sorting would place "10" before "2". Any non-numeric
    directory names are sorted alphabetically after the numeric ones.
    """
    if name.isdecimal():
        return (0, int(name), "")
    return (1, 0, name)


# Walk the query's trace directory one node at a time and show the DocSet
# built from the documents pickled under that node's directory.
for node_id in sorted(os.listdir(f"{BASE_DIR}/{QUERY_ID}"), key=_node_order):
    directory = f"{BASE_DIR}/{QUERY_ID}/{node_id}"
    if not os.path.isdir(directory):
        # Stray files next to the node directories are not traces; skip them.
        continue

    # One flat record (dict) per pickled document found for this node.
    data_list = []

    # os.listdir returns names in arbitrary order; sort so the row order is
    # the same on every run.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            continue

        # NOTE(security): pickle.load can execute arbitrary code -- only point
        # BASE_DIR at trace directories you trust.
        with open(path, "rb") as file:
            doc = pickle.load(file)

        # Start from a copy of the document's "entity" properties, then add
        # every property not listed in BASE_PROPS as an extra column.
        record = dict(doc.properties["entity"])
        for prop in doc.properties:
            if prop not in BASE_PROPS:
                record[prop] = doc.properties[prop]
        data_list.append(record)

    df = pd.DataFrame(data_list)

    print(f"DOCSET AFTER NODE {node_id}:\n-")
    print(f"{len(df)} documents\n--------------------")

    if DISPLAY_ENTIRE_DF:
        # Lift pandas' row/column truncation for this display call only.
        with pd.option_context("display.max_rows", None, "display.max_columns", None):
            display(df)
    else:
        display(df)

DOCSET AFTER NODE 1:
-
480 documents
--------------------


Unnamed: 0,location,dateAndTime,aircraft,flightConductedUnder,accidentNumber,registration,injuries,aircraftDamage,day,isoDateTime,path,_autogen_LlmFilterOutput
0,"Eagle River, Wisconsin","January 25, 2023, 13:00 Local",Cessna 120,Part 91: General aviation - Personal,CEN23LA098,N73034,1 None,Substantial,2023-01-25,2023-01-25T13:00:00Z,s3://aryn-public/ntsb/18.pdf,3
1,"Kenai, Alaska","January 2, 2023, 14:57 Local",Piper PA-18-150,Part 91: General aviation - Personal,ANC23LA011,N94X,1 Minor,Substantial,2023-01-02,2023-01-02T14:57:00Z,s3://aryn-public/ntsb/96.pdf,0
2,"Atlantic Ocean, Atlantic Ocean","January 3, 2023, 16:22 UTC",Cessna 340A,Part 91: General aviation - Personal,ERA23LA101,N824BC,4 None,Substantial,2023-01-03,2023-01-03T16:22:00+00:00Z,s3://aryn-public/ntsb/92.pdf,1
3,"Dallesport, Washington","January 29, 2023, 12:30 Local",CUB CRAFTERS INC CC18-180,Part 91: General aviation - Instructional,WPR23LA101,N127DT,2 None,Substantial,2023-01-29,2023-01-29T12:30:00Z,s3://aryn-public/ntsb/6.pdf,1
4,"Kingfisher, OK","January 16, 2023, Local",Piper PA-32-301T,Part 91: General aviation - Instructional,CEN23FA079,N8266D,2 Fatal,Destroyed,2023-01-16,2023-01-16T00:00:00Z,s3://aryn-public/ntsb/54.pdf,1
...,...,...,...,...,...,...,...,...,...,...,...,...
475,"Rich County, Utah","January 24, 2023, 17:45 Local",Beech C23,Part 91: General aviation - Personal,WPR23LA099,N23FD,1 None,Substantial,2023-01-24,2023-01-24T17:45:00Z,s3://aryn-public/ntsb/20.pdf,2
476,"Eagle River, Wisconsin","January 25, 2023, 13:00 Local",Cessna 120,Part 91: General aviation - Personal,CEN23LA098,N73034,1 None,Substantial,2023-01-25,2023-01-25T13:00:00Z,s3://aryn-public/ntsb/18.pdf,2
477,"Opa-locka, Florida","January 3, 2023, 12:35 Local",Cessna 172N,Part 91: General aviation - Instructional,ERA23LA104,N2354E,2 None,Substantial,2023-01-03,2023-01-03T12:35:00Z,s3://aryn-public/ntsb/94.pdf,0
478,"Kenai, Alaska","January 2, 2023, 14:57 Local",Piper PA-18-150,Part 91: General aviation - Personal,ANC23LA011,N94X,1 Minor,Substantial,2023-01-02,2023-01-02T14:57:00Z,s3://aryn-public/ntsb/96.pdf,1
