In [1]:
import networkx as nx
import pandas as pd
from tqdm import tqdm
import json
import re

In [2]:
# Per-registry list of dependency "kind" values that are followed when building
# the graph. The entry for a registry is passed as `allowed_dep_kinds` to
# `process_package`; dependencies whose kind is not listed are ignored.
REGISTRY_TO_ALLOWED_DEP_KINDS_LUT = {
    "bioconductor": ["imports", "depends"],
    "cran": ["imports", "depends"],
    "pypi": ["runtime", "python_version"],
}

# Per-registry flag saying whether package names are run through
# `norm_package_name` before being used as node names. The entry for a registry
# is passed as `do_norm_package_name` to `process_package`.
# NOTE: "PAKCAGE" is a typo for "PACKAGE"; the identifier is kept as-is because
# other cells reference it under this spelling.
REGISTRY_TO_NORM_PAKCAGE_NAME_LUT = {
    "bioconductor": False,
    "cran": False,
    "pypi": True,
}

def norm_package_name(name: str) -> str:
    """Reduce a package / requirement string to a normalised bare name.

    The input is lower-cased and every "-" and "." is turned into "_".
    The first run of ``[a-z0-9_]`` characters is then kept, and everything
    following it on the same line (comparison operators, extras brackets,
    version numbers, ...) is discarded. Characters that precede that first
    run are left in place.

    Args:
        name: Raw package name or requirement string.

    Returns:
        The normalised name.
    """
    # Single-pass equivalent of chaining the two separator replacements
    separator_table = str.maketrans({"-": "_", ".": "_"})
    candidate = name.lower().translate(separator_table)

    # Group 1 is the bare name; the rest of the pattern swallows the remainder
    name_then_rest = re.compile(
        r"([a-z0-9\_]+)(\>|\<|\!|\=|\[){0,1}(.)*",
    )
    return name_then_rest.sub(r"\1", candidate)

def process_package(
    df: pd.DataFrame,
    package_details: pd.Series,
    graph: nx.DiGraph,
    processed_nodes: set,
    allowed_dep_kinds: list[str] | None = None,
    do_norm_package_name: bool = False,
    depth: int = 0,
    print_depth: bool = False,
) -> bool | None:
    """Add a package and, recursively, its dependencies to ``graph``.

    A node is created for the package (unless its name is already in
    ``processed_nodes``), each allowed dependency listed under
    ``latest_version["dependencies"]`` is resolved against ``df`` and processed
    recursively, and an edge ``package -> dependency`` is added for every
    dependency that ended up as a node in the graph.

    Args:
        df: Registry table; rows are looked up by their ``name`` column.
        package_details: The registry row of the package to process. The
            fields read here are ``ecosystem``, ``name``, ``czi_id``,
            ``keywords_array``, ``mentions_count`` and ``latest_version``.
        graph: Graph that nodes and edges are added to (mutated in place).
        processed_nodes: Names of nodes already added to ``graph`` (mutated
            in place).
        allowed_dep_kinds: Dependency kinds to follow; ``None`` follows all.
        do_norm_package_name: Whether names are passed through
            ``norm_package_name`` before being used as node names.
        depth: Current recursion depth, only used for indented printing.
        print_depth: Whether to print each visited package name, indented by
            ``depth`` tabs.

    Returns:
        ``None`` if the package has no ecosystem (nothing is added),
        otherwise ``True``.

    Raises:
        ValueError: If a dependency name matches more than one row of ``df``.

    NOTE(review): when ``do_norm_package_name`` is True, dependency names are
    normalised but are then compared against the raw ``df["name"]`` values, so
    registry names containing upper-case letters, "-" or "." cannot match
    unless the column is already normalised -- confirm against the data.
    """
    # Handle no ecosystem
    if package_details["ecosystem"] is None:
        return None

    # Get normed package name
    if do_norm_package_name:
        package_name = norm_package_name(package_details["name"])
    else:
        package_name = package_details["name"]

    # Handle printing
    if print_depth:
        print("\t" * depth + package_name)

    # Create node if package not in graph
    if package_name not in processed_nodes:
        graph.add_node(
            package_name,
            czi_id=package_details.czi_id,
            keywords=", ".join(package_details.keywords_array),
            mentions_count=package_details.mentions_count,
            ecosystem=package_details.ecosystem,
        )
        processed_nodes.add(package_name)

    # For each dependency in the "latest_version" column
    this_node_deps = set()
    for dep in package_details.latest_version["dependencies"]:
        if dep is None or dep["package_name"] is None:
            continue

        # Get dep name
        if do_norm_package_name:
            dep_name = norm_package_name(dep["package_name"])
        else:
            dep_name = dep["package_name"]

        # Skip dependency kinds that are not allowed
        dep_kind = str(dep["kind"])
        if allowed_dep_kinds is not None and dep_kind not in allowed_dep_kinds:
            continue

        # A package that lists itself (e.g. "pkg[extra]" normalising to "pkg")
        # must not produce a self-loop
        if dep_name == package_name:
            continue

        # Dependency already has a node (it was reached earlier, possibly via
        # another package): it still needs an edge from this package.
        # Only names of real graph nodes are ever put in `processed_nodes`.
        if dep_name in processed_nodes:
            this_node_deps.add(dep_name)
            continue

        # Handle not in graph: resolve the dependency in the registry
        dep_details = df[df["name"] == dep_name]
        if len(dep_details) > 1:
            raise ValueError(f"multiple packages with name: '{dep_name}'")
        if len(dep_details) == 0:
            # Dependency is not part of this registry table; nothing to link
            continue

        result = process_package(
            df,
            dep_details.iloc[0],
            graph,
            processed_nodes,
            depth=depth + 1,
            print_depth=print_depth,
            do_norm_package_name=do_norm_package_name,
            allowed_dep_kinds=allowed_dep_kinds,
        )

        # Only link to the dependency if a node was actually created for it
        if result is not None:
            this_node_deps.add(dep_name)

    # Add edges
    for dep_name in this_node_deps:
        graph.add_edge(package_name, dep_name)

    return True

def graph_from_registry(
    registry: pd.DataFrame,
    registry_name: str,
    outfile: str,
    print_depth: bool = False,
):
    """Build the dependency graph of a whole registry and write it as GEXF.

    Every row of ``registry`` is handed to ``process_package`` with the
    allowed dependency kinds and name-normalisation flag configured for
    ``registry_name``. The resulting graph is written to ``outfile``.

    Args:
        registry: Registry table, one package per row.
        registry_name: Key into the two ``REGISTRY_TO_*`` lookup tables.
        outfile: Path the GEXF file is written to.
        print_depth: Forwarded to ``process_package``.
    """
    # Shared state across all rows of this registry
    dependency_graph = nx.DiGraph()
    seen_packages = set()

    rows_with_progress = tqdm(registry.iterrows(), total=len(registry))
    for _row_label, package_row in rows_with_progress:
        process_package(
            df=registry,
            package_details=package_row,
            graph=dependency_graph,
            processed_nodes=seen_packages,
            allowed_dep_kinds=REGISTRY_TO_ALLOWED_DEP_KINDS_LUT[registry_name],
            do_norm_package_name=REGISTRY_TO_NORM_PAKCAGE_NAME_LUT[registry_name],
            print_depth=print_depth,
        )

    nx.write_gexf(dependency_graph, outfile)

def graph_from_czi_ids(
    registries: dict[str, pd.DataFrame],
    czi_ids: list[str],
    print_depth: bool = False,
    starting_node: str | None = None,
    starting_node_attrs: dict[str, str] | None = None,
    graph: nx.DiGraph | None = None,
    processed_nodes: set[str] | None = None,
) -> nx.DiGraph:
    """Add the packages identified by ``czi_ids`` (and their deps) to a graph.

    For every registry, each czi_id that matches exactly one row is processed
    with ``process_package``. If ``starting_node`` is given, it is added as a
    node and linked to every package that was successfully processed.

    Args:
        registries: Mapping of registry name to registry table. The names must
            be keys of the two ``REGISTRY_TO_*`` lookup tables.
        czi_ids: Identifiers compared against each registry's ``czi_id``
            column.
        print_depth: Forwarded to ``process_package``.
        starting_node: Optional name of a root node to link the packages to.
        starting_node_attrs: Node attributes for ``starting_node``; ``None``
            means no attributes.
        graph: Graph to extend in place; a new ``nx.DiGraph`` is created when
            omitted.
        processed_nodes: Names of nodes already in ``graph``; a new set is
            created when omitted.

    Returns:
        The graph that was extended, so that a graph created here is not lost.
    """
    # Init graph and storage
    if graph is None:
        graph = nx.DiGraph()
    if processed_nodes is None:
        processed_nodes = set()

    # If starting node (same `is not None` test as used for the edges below,
    # so node creation and edge creation cannot disagree)
    if starting_node is not None:
        graph.add_node(
            starting_node,
            # Attrs default to None; unpacking None would raise TypeError
            **(starting_node_attrs or {}),
        )

    # For each czi_id
    for registry_name, registry in registries.items():
        for czi_id in czi_ids:
            matching_rows = registry.loc[registry.czi_id == czi_id]

            # Ids that are absent from (or ambiguous in) this registry are skipped
            if len(matching_rows) != 1:
                continue
            package_of_interest = matching_rows.iloc[0]

            do_norm = REGISTRY_TO_NORM_PAKCAGE_NAME_LUT[registry_name]

            # Node name must match the one process_package will use
            if do_norm:
                package_name = norm_package_name(package_of_interest["name"])
            else:
                package_name = package_of_interest["name"]

            # Add this package and its dependencies to the graph
            result = process_package(
                registry,
                package_of_interest,
                graph,
                processed_nodes,
                allowed_dep_kinds=REGISTRY_TO_ALLOWED_DEP_KINDS_LUT[registry_name],
                do_norm_package_name=do_norm,
                print_depth=print_depth,
            )

            if result is not None and starting_node is not None:
                # Add edges from start node to processed
                graph.add_edge(starting_node, package_name)

    return graph

In [3]:
def load_registry(registry_path: str) -> pd.DataFrame:
    """Read a newline-delimited JSON registry file into a DataFrame.

    Args:
        registry_path: Path to the ndjson file (one JSON record per line).

    Returns:
        The loaded table with its ``czi_id`` column cast to ``str``.
    """
    registry_df = pd.read_json(registry_path, lines=True)

    # Ids are compared against string czi_ids later on, so store them as str
    czi_ids_as_str = registry_df.czi_id.astype(str)
    registry_df["czi_id"] = czi_ids_as_str

    return registry_df

# Load every registry table, keyed by registry name
registries = {
    registry_name: load_registry(registry_path)
    for registry_name, registry_path in (
        ("pypi", "../data/pypi_with_mentions.ndjson"),
        ("cran", "../data/cran_with_mention_counts.ndjson"),
        ("bioconductor", "../data/bioconductor_with_mention_counts.ndjson"),
    )
}

# Load the DOI -> czi_id lookup table
with open("../data/comm_disambiguated_cvis_count.json") as open_f:
    doi_to_czi_id_lut = json.loads(open_f.read())

In [4]:
import random

# Sampling configuration. A fixed seed makes the sampled DOIs (and therefore
# the written graph) reproducible across kernel restarts.
N_SAMPLED_DOIS = 1000
SAMPLE_SEED = 0

# Init graph
graph = nx.DiGraph()
processed_nodes = set()

# Sampled DOIs (capped at the number available so random.sample cannot raise
# ValueError when the lookup table holds fewer than N_SAMPLED_DOIS entries)
all_dois = list(doi_to_czi_id_lut.keys())
sampled_dois = random.Random(SAMPLE_SEED).sample(
    all_dois,
    min(N_SAMPLED_DOIS, len(all_dois)),
)

for doi in tqdm(
    sampled_dois,
    total=len(sampled_dois),
    desc="Processing DOIs",
):
    # Extend the shared graph with this DOI and the packages linked to it;
    # the DOI itself becomes a node tagged with ecosystem "paper"
    graph_from_czi_ids(
        registries=registries,
        czi_ids=doi_to_czi_id_lut[doi],
        graph=graph,
        processed_nodes=processed_nodes,
        starting_node=doi,
        starting_node_attrs={
            "ecosystem": "paper",
        },
    )

# Write out graph
nx.write_gexf(graph, "dois-graph.gexf")

Processing DOIs: 100%|██████████| 1000/1000 [00:20<00:00, 49.06it/s]
