In [15]:
"""This notebook downloads a website for you, puts it into a tree structure to 
give you a complete overview, and then asks weaviate to summarise each page.
It assumes you have the prerequisites installed (with the summarisation
Weaviate docker running).
"""
## Import the used pip packages.
import json
import networkx as nx
import os
import sys
import weaviate

from typeguard import typechecked
from typing import Dict, List, Union
from weaviate import Client


## Import some backend functions that get the website data into Weaviate.
from src.pythontemplate.arg_parsing.arg_parser import parse_skip_upload
from src.pythontemplate.arg_parsing.verify_configuration import (
    verify_configuration,
)
from src.pythontemplate.get_website_data.get_website_data_manager import (
    get_nx_graph_of_website,
)
from src.pythontemplate.helper import create_output_dir
from src.pythontemplate.load_json_into_weaviate.import_local_json import (
    load_local_json_data_into_weaviate,
)
from src.pythontemplate.visualise_graph.plot_url_structure_to_image import (
    plot_url_structure_to_svg_pdf_png,
)
from src.pythontemplate.visualise_graph.url_structure_to_d3_json import (
    export_url_structure_for_d3,
    get_url_dictionary,
)
from src.pythontemplate.get_website_data.nx_graph_json_bridge import (
    load_from_json,
)
from src.pythontemplate.helper import get_output_path


## Initialise parameters
# Root URLs of the websites to study; the driver loop at the bottom of the
# notebook processes each of them in turn.
company_urls: List[str] = ["https://weaviate.io", "https://trucol.io"]

# File names of the artefacts written per website. They are combined with
# `output_dir` and the company url by the helper functions that receive them.
nx_json_filename: str = "website_data.json"
summarised_json_filename: str = "summarised_by_weaviate.json"
d3_json_filename: str = "d3_data.json"
graph_plot_filename: str = "website_url_structure"


# For this repo the Weaviate data classes are web pages.
json_object_name: str = "WebPage"  # Must start with Capitalised letter.
# Name of the object property that Weaviate is asked to summarise.
summarised_property: str = "webPageMainText"
# Address of the locally running Weaviate instance.
weaviate_local_host_url: str = "http://localhost:8080"

# Used to prevent timeout error.
# NOTE(review): this value is passed to
# inject_summarisation_into_website_graph, but the check that would apply it
# is commented out there, so it currently has no effect.
max_nr_of_queries: int = 3
# Directory that receives all generated output files.
output_dir: str = "frontend/output_data"


In [16]:
"""Perform queries to Weaviate to summarise the data."""

from src.pythontemplate.load_json_into_weaviate.import_local_json import (
    get_hash,
)
def ensure_weaviate_summaries_are_available(
    summarised_json_filename: str,
    weaviate_local_host_url: str,
    json_object_name: str,
    summarised_property: str,
    output_dir: str,
    company_url: str,
) -> Dict:  # type: ignore[type-arg]
    """Return the Weaviate summaries, reusing the JSON file if it exists.

    The location of the summary file is obtained from `get_output_path`. If
    a file already exists at that location it is loaded and returned.
    Otherwise Weaviate is queried through `ask_weaviate_to_summarise`, the
    response is written to that location as indented JSON, and the response
    is returned.

    Args:
        summarised_json_filename: File name of the stored summaries.
        weaviate_local_host_url: Address of the Weaviate instance.
        json_object_name: Name of the Weaviate class that is queried.
        summarised_property: Name of the property that is summarised.
        output_dir: Output directory passed to `get_output_path`.
        company_url: Company url passed to `get_output_path`.

    Returns:
        The summary data, either freshly generated or loaded from file.
    """
    cache_path: str = get_output_path(
        output_dir=output_dir,
        company_url=company_url,
        filename=summarised_json_filename,
    )

    print("Ensuring Weaviate summaries are available.")

    # Fast path: summaries were already generated in an earlier run.
    if os.path.exists(cache_path):
        print("Loaded Weaviate summaries from file.")
        stored_summaries: Dict = load_from_json(  # type: ignore[type-arg]
            filepath=cache_path
        )
        return stored_summaries

    # Slow path: query Weaviate and store the response for the next run.
    print("Generating new summaries.")
    fresh_summaries: Dict = (  # type: ignore[type-arg]
        ask_weaviate_to_summarise(
            weaviate_local_host_url=weaviate_local_host_url,
            json_object_name=json_object_name,
            summarised_property=summarised_property,
        )
    )
    with open(cache_path, "w") as cache_file:
        # Indentation keeps the stored file human readable.
        json.dump(fresh_summaries, cache_file, indent=4)
    return fresh_summaries


def ask_weaviate_to_summarise(
    *,
    weaviate_local_host_url: str,
    json_object_name: str,
    summarised_property: str,
) -> Dict[str, Dict[str, Dict[str, List]]]:  # type: ignore[type-arg]
    """Asks Weaviate to summarise every stored object of the given class.

    First the urls of the stored objects are retrieved, then one summary
    query is sent per url (identified by the hash of that url). The first
    match of each query is collected into a dictionary that mirrors the
    structure of a Weaviate response:
    `{"data": {"Get": {<json_object_name>: [...]}}}`.

    A configuration noted as working by the original author is:
    json_object_name="Question", summarised_property="theAnswer".

    Args:
        weaviate_local_host_url: Address of the Weaviate instance.
        json_object_name: Name of the Weaviate class to query. It is also
            the key under which the results are stored.
        summarised_property: Name of the property that is summarised.

    Returns:
        The collected per-object summary results.

    Raises:
        ValueError: If the same url is stored more than once.
        IndexError: If a summary query returns no object for a url.
    """
    client = weaviate.Client(weaviate_local_host_url)

    # NOTE: only the first 1000 objects are requested, so a class that
    # contains more objects is summarised only partially.
    stored_objects = (
        client.query.get(json_object_name, ["url"])
        .with_limit(1000)
        .do()["data"]["Get"][json_object_name]
    )
    urls = [obj["url"] for obj in stored_objects]

    # Fail before any (slow) summary query is sent.
    if len(urls) != len(set(urls)):
        raise ValueError("Duplicate url found.")

    summaries: List = []  # type: ignore[type-arg]
    for i, url in enumerate(urls):
        print(f"summarizing website: i={i}, url={url}")

        result = weaviate_summary_query_on_single_text(
            client,
            json_object_name,
            summarised_property,
            get_hash(some_str=url),
        )

        # Look the results up under the queried class name instead of a
        # hard-coded "WebPage", such that other class names work as well.
        matches = result["data"]["Get"][json_object_name]
        if not matches:
            raise IndexError(
                f"Weaviate returned no {json_object_name} object for url:"
                + f"{url}"
            )
        summaries.append(matches[0])

    return {"data": {"Get": {json_object_name: summaries}}}


def weaviate_summary_query_on_single_text(
    client: Client,
    json_object_name: str,
    summarised_property: str,
    url_hash: str,
) -> Dict:  # type: ignore[type-arg]
    """Sends one summary query to Weaviate and returns the raw response.

    The query requests the summarised property itself, the
    `_additional { summary ... }` field for that property, and the url. It
    is restricted to objects whose `urlHash` equals the given hash.

    Args:
        client: The Weaviate client used to send the query.
        json_object_name: Name of the Weaviate class to query.
        summarised_property: Name of the property that is summarised.
        url_hash: Hash of the url of the object to summarise.

    Returns:
        The response of the query as returned by the client.
    """
    summary_field: str = (
        '_additional { summary ( properties: ["'
        + summarised_property
        + '"]) { property result } }'
    )
    requested_fields = [summarised_property, summary_field, "url"]

    # url hash is used because equal behaves as contains.
    hash_filter = {
        "path": ["urlHash"],
        "operator": "Equal",
        "valueText": url_hash,
    }

    query = client.query.get(json_object_name, requested_fields)
    response: Dict = query.with_where(  # type: ignore[type-arg]
        hash_filter
    ).do()
    return response


# @typechecked
def inject_summarisation_into_website_graph(
    data: Dict,  # type: ignore[type-arg]
    website_graph: nx.DiGraph,
    max_nr_of_queries: int,
    json_object_name: str,
    summarised_property: str,
) -> None:
    """Stores the Weaviate summaries in the matching website graph nodes.

    Every summary in `data` is matched to the graph node whose identifier
    equals the url of that summary. The summary text is stored in the
    "summary" attribute of that node. Summaries whose url is not a node of
    the graph are ignored.

    Args:
        data: Summary data with the structure
            `{"data": {"Get": {<json_object_name>: [...]}}}`.
        website_graph: Directed graph of the website, modified in place.
            Matching nodes are expected to have a "text_content" attribute.
        max_nr_of_queries: Currently unused, kept for backward
            compatibility of the interface.
        json_object_name: Key under which the summaries are stored in `data`.
        summarised_property: Name of the property in a summary that contains
            the original text.

    Raises:
        ValueError: If the "text_content" of a node differs from the
            original text in the summary of that node.
    """
    page_summaries = data["data"]["Get"][json_object_name]
    print(f"Number of webpages={len(page_summaries)}")

    # Iterate over the summaries, not over the graph nodes: the two
    # collections are not guaranteed to have the same length or order, so
    # indexing the summaries with a node counter either raises an IndexError
    # or silently skips summaries.
    for single_summary in page_summaries:
        original_main_text: str = get_original_text_from_summary_response(
            single_summary=single_summary,
            summarised_property=summarised_property,
        )
        weaviate_summary: str = get_summary_response(
            single_summary=single_summary
        )
        summary_url: str = get_summary_url(
            single_summary_with_url=single_summary
        )

        # Direct membership test instead of scanning all nodes per summary.
        if summary_url not in website_graph:
            continue

        node_attributes = website_graph.nodes[summary_url]
        node_attributes["summary"] = weaviate_summary

        if node_attributes["text_content"] != original_main_text:
            print("website_graph.nodes[node]=" + f"{node_attributes}")
            raise ValueError(
                "The text_content values of summary and website"
                " graph don't match."
            )


def get_original_text_from_summary_response(
    *,
    single_summary: Dict[str, Dict[str, Union[str, List[Dict[str, str]]]]],
    summarised_property: str,
) -> str:
    """Gets the original text from a Weaviate summary response.

    Args:
        single_summary: The single summary element from a Weaviate query
            response.
        summarised_property: The name of the property in the summary
            response that contains the original text.

    Returns:
        The original text that was stored under `summarised_property`.

    Raises:
        TypeError: If `single_summary` is not a dictionary, or if the value
            stored under `summarised_property` is not a string.
        KeyError: If `summarised_property` is not a key of `single_summary`.
    """
    if not isinstance(single_summary, dict):
        raise TypeError("Expected Dict.")

    original_text = single_summary[summarised_property]
    if not isinstance(original_text, str):
        # Report the offending value and its type, not the enclosing
        # dictionary (whose type is always dict at this point).
        raise TypeError(
            "Expected summarized property to be a string,"
            + f" yet it was:{original_text} of type:{type(original_text)}."
        )
    return str(original_text)


def get_summary_response(
    *, single_summary: Dict[str, Dict[str, Union[str, List[Dict[str, str]]]]]
) -> str:
    """Returns the Weaviate summary stored in a single summary element.

    The structure of the element is validated step by step before each
    level is accessed, such that a malformed element yields the specific
    error message of the level that is wrong.

    Args:
        single_summary: The single summary element from a Weaviate query
            response. The summary is read from
            `single_summary["_additional"]["summary"][0]["result"]`.

    Returns:
        The summary text, or a fixed fallback message if the list of
        summaries is empty.

    Raises:
        TypeError: If any level of the element has an unexpected type.
        KeyError: If an expected key is missing.
    """
    if not isinstance(single_summary, dict):
        raise TypeError("Expected Dict.")

    additional = single_summary["_additional"]
    if not isinstance(additional, dict):
        raise TypeError("Expected Dict in additional.")

    summaries = additional["summary"]
    if not isinstance(summaries, list):
        raise TypeError("Expected List.")

    if len(summaries) == 0:
        return "No web page text found, so no summary available."

    first_summary = summaries[0]
    if not isinstance(first_summary, dict):
        raise TypeError("Expected Dict within List.")

    summary_text = first_summary["result"]
    if not isinstance(summary_text, str):
        raise TypeError("Expected the summary response to be a string.")
    return summary_text


def get_summary_url(
    *,
    single_summary_with_url: Dict[
        str, Dict[str, Union[str, List[Dict[str, str]]]]
    ],
) -> str:
    """Looks up the url that belongs to a Weaviate summary.

    Args:
        single_summary_with_url: A Weaviate summary element that stores its
            url under the key "url".

    Returns:
        The url of the Weaviate summary.

    Raises:
        TypeError: If the stored url is not a string.
    """
    url = single_summary_with_url["url"]
    if isinstance(url, str):
        return url
    raise TypeError("Expected the url to be a string.")


In [17]:
"""Get the website data and store it as a nx.graph."""
def get_summarised_website_tree(
    *, company_url: str, skip_weaviate_upload: bool
) -> None:
    """Runs the complete pipeline for the website of a single company.

    The steps are, in order:
    1. Create the output directory.
    2. Get the website as a networkx graph.
    3. Load the website data into Weaviate (unless skipped).
    4. Ensure the Weaviate summaries are available.
    5. Inject the summaries into the website graph.
    6. Export the url structure for the D3 frontend.
    7. Plot the url structure.

    The file names, Weaviate settings and output directory are read from
    the module level configuration of this notebook.

    Args:
        company_url: URL of the company website.
        skip_weaviate_upload: If True, the website data is not loaded into
            Weaviate.

    Returns:
        Nothing, the results are handed to the export and plot functions.
    """
    # Keyword arguments that are shared by several of the calls below.
    location = {"output_dir": output_dir, "company_url": company_url}
    weaviate_settings = {
        "weaviate_local_host_url": weaviate_local_host_url,
        "json_object_name": json_object_name,
        "summarised_property": summarised_property,
    }

    create_output_dir(**location)

    website_graph: nx.DiGraph = get_nx_graph_of_website(
        nx_json_filename=nx_json_filename, **location
    )

    if not skip_weaviate_upload:
        load_local_json_data_into_weaviate(
            json_input_path=nx_json_filename,
            **weaviate_settings,
            **location,
        )

    summarised_data = ensure_weaviate_summaries_are_available(
        summarised_json_filename=summarised_json_filename,
        **weaviate_settings,
        **location,
    )

    # Export summaries
    inject_summarisation_into_website_graph(
        data=summarised_data,
        website_graph=website_graph,
        max_nr_of_queries=max_nr_of_queries,
        json_object_name=json_object_name,
        summarised_property=summarised_property,
    )
    url_structure: Dict = get_url_dictionary(  # type: ignore[type-arg]
        G=website_graph, root_url=company_url
    )

    # For frontend.
    export_url_structure_for_d3(
        url_structure=url_structure,
        website_graph=website_graph,
        d3_json_filename=d3_json_filename,
        **location,
    )
    plot_url_structure_to_svg_pdf_png(
        graph_dict={company_url: url_structure},
        nx_graph=website_graph,
        graph_plot_filename=graph_plot_filename,
        **location,
    )


# Run the complete pipeline once per configured website. The upload into
# Weaviate is never skipped here (skip_weaviate_upload=False).
for company_url in company_urls:
    get_summarised_website_tree(
        company_url=company_url, skip_weaviate_upload=False
    )


Ensuring Weaviate summaries are available.
Loaded Weaviate summaries from file.
Number of webpages=929


            your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.

            For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
            For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
            


Ensuring Weaviate summaries are available.
Loaded Weaviate summaries from file.
Number of webpages=7
