In [1]:
import networkx as nx
import pandas as pd
from tqdm import tqdm
import re

In [2]:
def norm_package_name(name: str) -> str:
    """
    Reduce a package name or requirement string to a canonical identifier.

    The input is lowercased, hyphens and periods are turned into
    underscores, and everything following the first run of
    ``[a-z0-9_]`` characters (version specifiers, extras, markers, ...)
    is discarded.

    Parameters
    ----------
    name: str
        Raw package name or requirement string, e.g. ``"Scikit-Learn>=1.0"``.

    Returns
    -------
    str
        The canonical name, e.g. ``"scikit_learn"``. Characters that precede
        the first ``[a-z0-9_]`` character are left in place.
    """
    candidate = name.lower()

    # Unify the separators so "a-b", "a.b" and "a_b" all compare equal
    for separator in ("-", "."):
        candidate = candidate.replace(separator, "_")

    # Keep only the leading identifier; the rest of the match is dropped
    return re.sub(
        pattern=r"([a-z0-9\_]+)(\>|\<|\!|\=|\[){0,1}(.)*",
        repl=r"\1",
        string=candidate,
    )

def process_package(
    df: pd.DataFrame,
    package_details: pd.Series,
    graph: nx.DiGraph,
    processed_nodes: set,
):
    """
    Add a package, and recursively its dependencies, to the dependency graph.

    A node is created for the package (with ``downloads`` and ``keywords``
    attributes) unless its normalized name is already in ``processed_nodes``.
    Each dependency listed under ``latest_version["dependencies"]`` is then
    normalized, looked up in ``df`` and processed recursively if it has not
    been handled yet. Finally one edge ``package -> dependency`` is added per
    distinct dependency name, in the order the dependencies are listed, so
    that the resulting node/edge order is the same on every run.

    Parameters
    ----------
    df: pd.DataFrame
        Table of all packages; its ``"name"`` column is used to look up the
        rows of dependencies.
    package_details: pd.Series
        One row of ``df``. The fields ``name``, ``downloads``,
        ``keywords_array`` and ``latest_version`` are read.
    graph: nx.DiGraph
        Graph to add nodes and edges to. Modified in place.
    processed_nodes: set
        Normalized names of packages that already have a node with
        attributes. Modified in place.

    Raises
    ------
    ValueError
        If more than one row of ``df`` has a ``"name"`` equal to a
        dependency's normalized name.

    Notes
    -----
    NOTE(review): the dependency lookup compares the raw ``"name"`` column
    against the *normalized* dependency name, so rows whose raw name contains
    uppercase letters, hyphens or periods are never matched here. Such
    dependencies still get a node (created implicitly by ``add_edge``) but
    only receive attributes if the caller later processes their row directly.
    """
    # Get normed package name
    package_name = norm_package_name(package_details["name"])

    # Create node if package not in graph.
    # The name is recorded *before* recursing so that dependency cycles
    # terminate instead of recursing forever.
    if package_name not in processed_nodes:
        graph.add_node(
            package_name,
            downloads=package_details.downloads,
            keywords=", ".join(package_details.keywords_array),
        )
        processed_nodes.add(package_name)

    # For each dependency in the "latest_version" column.
    # A dict is used as an insertion-ordered set: iterating a set of strings
    # depends on per-session hash randomization, which made the order of the
    # edges (and of implicitly created nodes) differ from run to run.
    this_node_deps = {}
    for dep in package_details.latest_version["dependencies"]:
        if dep is None or dep["package_name"] is None:
            continue

        # Get dep name
        dep_name = norm_package_name(dep["package_name"])
        this_node_deps[dep_name] = None

        # Nothing more to do if the dependency was already handled
        if dep_name in processed_nodes:
            continue

        # Handle not in graph
        dep_details = df[df["name"] == dep_name]
        if len(dep_details) == 1:
            process_package(
                df,
                dep_details.iloc[0],
                graph,
                processed_nodes,
            )
        elif len(dep_details) > 1:
            raise ValueError(f"multiple packages with name: '{dep_name}'")

    # Add edges (add_edge implicitly creates nodes for unknown dependencies)
    for dep_name in this_node_deps:
        graph.add_edge(package_name, dep_name)

In [3]:
# Load the package metadata (newline-delimited JSON, one record per line)
df = pd.read_json("../data/pypi.ndjson", lines=True)

# Start from an empty directed graph and an empty record of handled packages
graph = nx.DiGraph()
processed_nodes = set()

# Feed every row through process_package, with a progress bar over all rows
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    process_package(
        df=df,
        package_details=row,
        graph=graph,
        processed_nodes=processed_nodes,
    )

# Persist the finished graph in GEXF format
nx.write_gexf(graph, "pypi.gexf")

100%|██████████| 14950/14950 [00:36<00:00, 404.63it/s] 


In [4]:
# Convert the graph's edges to a DataFrame and write them out as CSV
# (the DataFrame index is not written)
edgelist = nx.to_pandas_edgelist(graph)
edgelist.to_csv(path_or_buf="pypi_edgelist.csv", index=False)