In [None]:
from pathlib import Path
# from dsnotebooks.settings import ProjectNotebookSettings

# notebook settings auto-loaded from .env / env vars
# notebook_settings = ProjectNotebookSettings()

# PROFILE_NAME = notebook_settings.profile  # the profile to use
# PROJ_KEY = notebook_settings.proj_key     # the project to use
# Profile to use (passed to `ds.CpsApi.from_env`).
PROFILE_NAME = "ds-experience"

# Project to use (passed as `proj_key` to `ds.convert_documents`).
# NOTE(review): this looks like a placeholder value -- replace it with a real
# project key, ideally loaded from settings / environment rather than hardcoded.
PROJ_KEY = "1234567890abcdefghijklmnopqrstvwyz123456"

# Document that is submitted for conversion.
INPUT_FILE = Path("demo_1.docx")

In [None]:
# Import standard dependencies
import json
import tempfile
from zipfile import ZipFile
import typer
import pandas as pd

# IPython utilities
from IPython.display import display, Markdown, HTML

# Import the deepsearch-toolkit
import deepsearch as ds

In [None]:
def extract_tables_from_json_doc(pdf_filename: Path, document: dict, output_dir: Path):
    """
    Iterate through the converted document and save every table as a CSV file.

    Each table is written to ``<output_dir>/<name>_<page>_<n>.csv`` where
    ``<name>`` is ``document["file-info"]["filename"]`` without a trailing
    ``.pdf`` extension, ``<page>`` is the page of the table's first provenance
    entry and ``<n>`` is a 1-based counter of the tables seen on that page.

    Parameters
    ----------
    pdf_filename : Path
        Input PDF file. Not used by this function; kept so that existing
        callers keep working.
    document : dict
        The converted document from Deep Search.
    output_dir : Path or str
        Output directory where all extracted tables will be saved.
    """

    # Accept plain strings as well as Path objects (the `/` operator below
    # needs a Path on its left-hand side).
    output_dir = Path(output_dir)

    # Remove only a literal ".pdf" suffix (any letter case).
    # `str.rstrip(".pdf")` strips a *set of characters*, not a suffix, so a name
    # such as "report_pdf.pdf" would be mangled to "report_".
    source_name = document["file-info"]["filename"]
    if source_name.lower().endswith(".pdf"):
        source_name = source_name[: -len(".pdf")]
    output_base = output_dir / source_name

    # Number of tables already written for each page; used to build unique names.
    page_counters = {}

    # Iterate through all the tables identified in the converted document
    for table in document.get("tables", []):
        # The page is taken from the first provenance entry of the table.
        page = table["prov"][0]["page"]
        page_counters[page] = page_counters.get(page, 0) + 1

        # Load the table into a Pandas DataFrame (one list of cell texts per row)
        table_content = [[cell["text"] for cell in row] for row in table["data"]]
        df = pd.DataFrame(table_content)

        # Save table
        output_filename = output_base.with_name(
            f"{output_base.name}_{page}_{page_counters[page]}.csv"
        )
        df.to_csv(output_filename)

        typer.secho(f"Table extracted in {output_filename}", fg=typer.colors.GREEN)



In [None]:
# Build the CpsApi object for the profile named by PROFILE_NAME; it is passed
# as `api=` to `ds.convert_documents` elsewhere in this notebook.
api = ds.CpsApi.from_env(profile_name=PROFILE_NAME)

In [None]:

%%time
output_dir = "C:\out"
# output_dir = tempfile.mkdtemp() # TODO: switch to tempfile.TemporaryDirectory() and use `with`
# print(output_dir)
documents.download_all(result_dir=output_dir, progress_bar=True)

for output_file in Path(output_dir).rglob("json*.zip"):
    with ZipFile(output_file) as archive:
        all_files = archive.namelist()
        for name in all_files:
            if name.endswith(".json"):
                typer.secho(
                    f"Procecssing file {name} in archive {output_file}",
                    fg=typer.colors.BLUE,
                )
                document = json.loads(archive.read(name))
                pdf_filename = Path(output_dir) / document["file-info"]["filename"]
#                 extract_figures_from_json_doc(pdf_filename, document, output_dir, 72)
extract_tables_from_json_doc((pdf_filename, document, output_dir))

In [None]:
# Submit INPUT_FILE for conversion in project PROJ_KEY; the returned object is
# what the download cell uses via `documents.download_all(...)`.
documents = ds.convert_documents(
    api=api,
    proj_key=PROJ_KEY,
    source_path=INPUT_FILE,
    progress_bar=True,
)

Submitting input:     : |[38;2;15;98;254m                              [0m| 0/0 [00:00<?, ?it/s][38;2;15;98;254m                                                                                          [0m
Converting input:     : |[38;2;15;98;254m                              [0m| 0/0 [00:00<?, ?it/s][38;2;15;98;254m                                                                                          [0m


In [None]:
import json

# Open the JSON file and load its contents.
# The encoding is given explicitly: without it `open()` falls back to the
# platform's locale encoding, which can fail on or corrupt non-ASCII text.
with open('SEC-ESG-Mar24-FinalRule-Factsheet_2.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Extract text from main-text, skipping items that carry no 'text' key
extracted_text = [item['text'] for item in data['main-text'] if 'text' in item]

# Print extracted text, one item per line
for text in extracted_text:
    print(text)


FACT SHEET The Enhancement and Standardization of Climate-Related Disclosures: Final Rules
On March 6, 2024, the Securities and Exchange Commission adopted final rules to require registrants to disclose certain climate-related information in registration statements and annual reports. The Commission proposed the rules on March 21, 2022. The public comment file is available online.
The final rules require a registrant to disclose, among other things: material climate-related risks; activities to mitigate or adapt to such risks; information about the registrant's board of directors' oversight of climate-related risks and management's role in managing material climate-related risks; and information on any climate-related targets or goals that are material to the registrant's business, results of operations, or financial condition.
Further, to facilitate investors' assessment of certain climate-related risks, the final rules require disclosure of Scope 1 and/or Scope 2 greenhouse gas (GHG)