In [1]:
import pandas as pd
import networkx as nx
from tqdm import tqdm

# Load the Bioconductor package metadata (newline-delimited JSON, one record
# per line) and store the latest-release timestamp as plain text.
# NOTE(review): the string cast is presumably so the value can be attached as a
# node attribute and serialised later — confirm.
df = pd.read_json("../data/bioconductor.ndjson", lines=True).assign(
    latest_release_published_at=lambda frame: frame[
        "latest_release_published_at"
    ].astype(str)
)

# Peek at a few random rows to sanity-check the load.
df.sample(3)

Unnamed: 0,id,registry_id,name,ecosystem,description,homepage,licenses,repository_url,normalized_licenses,versions_count,...,rankings,namespace,advisories,maintainers_count,first_release_published_at,keywords,docker_dependents_count,docker_downloads_count,issue_metadata,latest_version
1408,8420236,58,Organism.dplyr,bioconductor,dplyr-based Access to Bioconductor Annotation ...,,Artistic-2.0,,[Artistic-2.0],1,...,"{'downloads': 19.847856154910097, 'dependent_r...",,[],1,2023-10-12 14:50:27.601000+00:00,"[Annotation, GenomeAnnotation, Sequencing, Sof...",,,,"{'id': 89095484, 'package_id': 8420236, 'numbe..."
920,8420313,58,PING,bioconductor,Probabilistic inference for Nucleosome Positio...,,Artistic-2.0,,[Artistic-2.0],1,...,"{'downloads': 31.800262812089358, 'dependent_r...",,[],1,2023-10-12 14:52:04.943000+00:00,"[Clustering, Sequencing, Software, Statistical...",,,,"{'id': 89095567, 'package_id': 8420313, 'numbe..."
795,8419992,58,msa,bioconductor,Multiple Sequence Alignment,http://www.bioinf.jku.at/software/msa/,GPL (>= 2),http://www.bioinf.jku.at/software/msa/,[CNRI-Python-GPL-Compatible],1,...,"{'downloads': 8.498168498168498, 'dependent_re...",,[],1,2023-10-12 14:46:53.252000+00:00,"[Alignment, MultipleComparison, MultipleSequen...",,,,"{'id': 89095108, 'package_id': 8419992, 'numbe..."


In [2]:
# List the column labels of the loaded package metadata frame.
df.columns

Index(['id', 'registry_id', 'name', 'ecosystem', 'description', 'homepage',
       'licenses', 'repository_url', 'normalized_licenses', 'versions_count',
       'latest_release_published_at', 'latest_release_number',
       'keywords_array', 'language', 'status', 'last_synced_at', 'created_at',
       'updated_at', 'metadata', 'repo_metadata', 'repo_metadata_updated_at',
       'dependent_packages_count', 'downloads', 'downloads_period',
       'dependent_repos_count', 'rankings', 'namespace', 'advisories',
       'maintainers_count', 'first_release_published_at', 'keywords',
       'docker_dependents_count', 'docker_downloads_count', 'issue_metadata',
       'latest_version'],
      dtype='object')

In [3]:
# Show the `latest_version` record of the first row to see how its
# dependency entries are laid out.
df["latest_version"].iloc[0]

{'id': 89092443,
 'package_id': 8418609,
 'number': '1.68.0',
 'published_at': '2023-10-12T14:10:28.218Z',
 'licenses': None,
 'integrity': None,
 'status': None,
 'created_at': '2023-10-12T14:10:28.218Z',
 'updated_at': '2023-10-12T14:10:28.218Z',
 'metadata': {},
 'dependencies': [{'id': 975828933,
   'package_id': None,
   'version_id': 89092443,
   'package_name': 'graphics',
   'ecosystem': 'bioconductor',
   'kind': 'imports',
   'optional': False,
   'requirements': '*'},
  {'id': 975828934,
   'package_id': None,
   'version_id': 89092443,
   'package_name': 'grDevices',
   'ecosystem': 'bioconductor',
   'kind': 'imports',
   'optional': False,
   'requirements': '*'},
  {'id': 975828935,
   'package_id': None,
   'version_id': 89092443,
   'package_name': 'methods',
   'ecosystem': 'bioconductor',
   'kind': 'imports',
   'optional': False,
   'requirements': '*'},
  {'id': 975828937,
   'package_id': None,
   'version_id': 89092443,
   'package_name': 'stats',
   'ecosystem'

In [4]:
# Build an undirected package-dependency graph from `df` and export it as GEXF.
#
# Nodes: one per named package in `df`, carrying `last_release` and `downloads`.
# Edges: one per (package, dependency) pair taken from the package's
#        `latest_version["dependencies"]`, but only when the dependency is
#        itself a package listed in `df`; other dependencies are skipped.

# Index node attributes by package name once, so a dependency lookup is a dict
# access instead of a full DataFrame scan per dependency. Keeping only the
# first row per name mirrors "take the first matching row" semantics.
package_attrs = (
    df.drop_duplicates(subset="name")
    .set_index("name")[["latest_release_published_at", "downloads"]]
    .to_dict("index")
)

processed_nodes = set()
graph = nx.Graph()
for _, row in tqdm(df.iterrows(), total=len(df)):
    # Get current node name
    node_name = row["name"]

    # Skip rows without a usable name (covers both None and NaN).
    if pd.isna(node_name):
        continue

    # Create node if package not in graph
    if node_name not in processed_nodes:
        graph.add_node(
            node_name,
            last_release=row["latest_release_published_at"],
            downloads=row["downloads"],
        )
        processed_nodes.add(node_name)

    # A package may have no latest-version record (None/NaN) or a record
    # without a dependency list; such a package simply contributes no edges.
    latest_version = row["latest_version"]
    if not isinstance(latest_version, dict):
        continue

    # For each dependency of the latest version, make sure the dependency has
    # a node (when it is a known package), then connect it to the package.
    for dep in latest_version.get("dependencies") or []:
        # Skip malformed entries and entries without a package name.
        if dep is None:
            continue
        dep_name = dep.get("package_name")
        if dep_name is None:
            continue

        # Handle not in graph
        if dep_name not in processed_nodes:
            dep_details = package_attrs.get(dep_name)
            if dep_details is None:
                # Dependency is not a package in this dataset: no node, no edge.
                continue

            graph.add_node(
                dep_name,
                last_release=dep_details["latest_release_published_at"],
                downloads=dep_details["downloads"],
            )
            processed_nodes.add(dep_name)

        # Add edge
        graph.add_edge(node_name, dep_name)

nx.write_gexf(graph, "bioconductor.gexf")

100%|██████████| 1420/1420 [00:10<00:00, 133.06it/s]
