In [1]:
import networkx as nx
import pandas as pd
from tqdm import tqdm

In [2]:
def _node_attributes(record: pd.Series) -> dict:
    """Build the graph-node attribute dict for one registry row.

    Both package nodes and dependency nodes go through this helper so that
    every attribute is always read from the row describing that very node.
    """
    keywords = record.keywords_array
    # A null keywords field (None, or NaN which is a float) cannot be joined;
    # write an empty string for it instead of raising.
    has_keywords = keywords is not None and not isinstance(keywords, float)
    return {
        "last_release": record.latest_release_published_at,
        "downloads": record.downloads,
        "keywords": ", ".join(keywords) if has_keywords else "",
        "mentioned": record.mentioned,
    }


def _dependencies(record: pd.Series) -> list:
    """Return the dependency entries under ``latest_version``, or ``[]``.

    A row whose ``latest_version`` is not a dict, or which has no
    "dependencies" list, is treated as having no dependencies.
    """
    latest_version = record.latest_version
    if not isinstance(latest_version, dict):
        return []
    return latest_version.get("dependencies") or []


def process_registry(registry: str) -> str:
    """Convert a registry dump into a directed dependency graph in GEXF format.

    Reads ``../data/{registry}.ndjson`` (one JSON object per line), adds one
    node per package and one edge ``package -> dependency`` for each
    dependency of the package's latest version, then writes the graph to
    ``{registry}.gexf`` (relative to the current working directory).

    Dependencies that have no row of their own in the dataset are skipped
    entirely: no node and no edge is created for them.

    Args:
        registry: Base name of the ndjson file (without extension).

    Returns:
        The path of the GEXF file that was written.
    """
    # Read registry dataset; these columns are stored as strings on the nodes
    df = pd.read_json(f"../data/{registry}.ndjson", lines=True)
    df = df.astype(
        {
            "latest_release_published_at": str,
            "downloads": str,
            "mentioned": str,
        }
    )

    # Map each package name to the position of its first row, built once so
    # dependency lookups are dict hits instead of full DataFrame scans.
    first_position_by_name = {}
    for position, name in enumerate(df["name"]):
        first_position_by_name.setdefault(name, position)

    graph = nx.DiGraph()
    for _, row in tqdm(
        df.iterrows(),
        total=len(df),
        desc=f"Processing {registry}",
    ):
        # ``row.name`` would be the index label, so use item access
        node_name = row["name"]

        # Fast handle none
        if node_name is None:
            continue

        # Create node if package not in graph
        if node_name not in graph:
            graph.add_node(node_name, **_node_attributes(row))

        # Link the package to each dependency of its latest version,
        # creating the dependency's node first when needed.
        for dep in _dependencies(row):
            # Fast handle none
            if dep is None or dep["package_name"] is None:
                continue

            dep_name = dep["package_name"]

            if dep_name not in graph:
                position = first_position_by_name.get(dep_name)
                if position is None:
                    # Dependency is not part of this dataset: skip it
                    continue

                # Attributes come from the dependency's own row
                graph.add_node(dep_name, **_node_attributes(df.iloc[position]))

            # TODO: store "dep type" on the edge

            graph.add_edge(node_name, dep_name)

    # Write out file
    outfile = f"{registry}.gexf"
    nx.write_gexf(graph, outfile)

    return outfile

In [3]:
# Export one graph file per enabled registry.
# Currently switched off: "bioconductor", "pypi".
for registry in ("cran_with_commits",):
    process_registry(registry)

Processing cran_with_commits: 100%|██████████| 7262/7262 [00:15<00:00, 476.68it/s]
