In [2]:
import json

# Load the DOIs to analyse; they are iterated below and matched against
# article.doi. The encoding is pinned to UTF-8 so that decoding does not
# depend on the platform's locale default.
with open("shared.json", encoding="utf-8") as f:
    dois = json.load(f)

In [4]:
import sqlite3
# Source database: dataset of the study "Computational reproducibility of
# Jupyter notebooks from biomedical publications".
# Download: https://zenodo.org/records/8226725
db_path = "../../hackathon/computational-reproducibility-pmc/computational-reproducibility-pmc/analyses/db.sqlite"
con = sqlite3.connect(db_path)
cur = con.cursor()  # shared cursor used by the query cell below

In [38]:
from operator import or_
from functools import reduce

class Hashabledict(dict):
    """A ``dict`` that can be stored in a ``set`` or used as a dict key.

    The hash is derived from the (key, value) pairs, so it stays consistent
    with ``dict.__eq__`` (equal dicts hash equally) while dicts that share the
    same keys but hold different values get different hashes. Hashing only
    the keys makes every record with the same schema collide, which degrades
    set operations to pairwise equality comparisons.

    If a value is unhashable (e.g. a list), the hash falls back to the keys
    only, so such instances remain usable in sets.

    Instances must not be mutated after being added to a set or used as a
    key, because their hash would no longer match the stored one.
    """

    def __hash__(self):
        try:
            return hash(frozenset(self.items()))
        except TypeError:
            # At least one value is unhashable: fall back to a key-only hash,
            # which is still consistent with equality (equal dicts have equal
            # key sets).
            return hash(frozenset(self))

# For every DOI, collect one record per (notebook, top-level imported module).
# Each row of the query carries a comma-separated module string and the name
# of the notebook it belongs to.
imports_in_notebooks_by_doi = {}
for doi in dois:
    rows = cur.execute("""
        select m.external_any, n.name from notebook_modules m
        inner join notebooks n on (m.notebook_id = n.id and m.repository_id = n.repository_id)
        inner join repositories r on (r.id = m.repository_id)
        inner join article a on (a.id = r.article_id)
        where a.doi = ?
    """, (doi,)).fetchall()

    notebook_imports = set()
    for modules_csv, notebook_name in rows:
        for dotted_name in modules_csv.split(','):
            # Keep only the top-level package of a dotted module path.
            top_level = dotted_name.split('.')[0]
            if not top_level:
                # Empty entries (e.g. from an empty module string) are skipped.
                continue
            notebook_imports.add(Hashabledict({
                'local': -1,  # unknown
                'filename': notebook_name,
                'filetype': 'notebook',
                'name': top_level,
            }))
    imports_in_notebooks_by_doi[doi] = notebook_imports

In [33]:
from extract_packages import import_data_to_package, PackageExtractorConfig
# Configuration object passed to import_data_to_package via its `config`
# keyword.
# NOTE(review): `use_bq` presumably enables a BigQuery-backed lookup — confirm
# against the extract_packages module.
config = PackageExtractorConfig(use_bq=True)

In [39]:

# Resolve the raw import records of every DOI into package records, using
# the shared extractor configuration.
packages_in_notebooks_by_doi = {
    doi: import_data_to_package(notebook_imports, config=config)
    for doi, notebook_imports in imports_in_notebooks_by_doi.items()
}


In [46]:
from collections import defaultdict
# Split the resolved package records per DOI into two buckets:
#   * unknown_packages_by_doi    - import names whose mode is 'unknown'
#   * simplified_packages_by_doi - package names, excluding '<builtin>'
# A DOI only gets an entry in a bucket once something is added to it.
simplified_packages_by_doi = defaultdict(set)
unknown_packages_by_doi = defaultdict(set)
for doi, packages in packages_in_notebooks_by_doi.items():
    for pkg in packages:
        if pkg['mode'] == 'unknown':
            unknown_packages_by_doi[doi].add(pkg['importname'])
            continue
        resolved_name = pkg['name']
        if resolved_name != '<builtin>':
            simplified_packages_by_doi[doi].add(resolved_name)


In [49]:
# Number of distinct unresolved import names across all DOIs.
# set().union(*...) also works when the mapping is empty, where reduce()
# without an initial value would raise TypeError.
len(set().union(*unknown_packages_by_doi.values()))

467

In [50]:
# Number of distinct resolved package names across all DOIs.
# set().union(*...) also works when the mapping is empty, where reduce()
# without an initial value would raise TypeError.
len(set().union(*simplified_packages_by_doi.values()))

911

In [51]:
# Display the resolved package names collected for each DOI.
simplified_packages_by_doi

defaultdict(set,
            {'10.15252/msb.20199235': {'cobra'},
             '10.1021/acs.jcim.1c00428': {'numpy'},
             '10.1186/s40462-021-00268-4': {'Cartopy',
              'Fiona',
              'GPU-BSM',
              'Shapely',
              'geopandas',
              'geoplot',
              'hvplot',
              'matplotlib',
              'movingpandas',
              'numpy',
              'osgeo',
              'pandas',
              'requests',
              'scikit-learn',
              'tqdm',
              'viresclient'},
             '10.1093/hmg/ddaa212': {'UpSetPlot',
              'ipython',
              'matplotlib',
              'numpy',
              'pandas',
              'scipy'},
             '10.1093/bioinformatics/btaa578': {'autoreload',
              'biopython',
              'functions',
              'matplotlib',
              'numpy',
              'pandas',
              'scikit-learn',
              'scipy',
              'seaborn',

In [54]:
import pandas as pd
# Load the newline-delimited JSON table (one record per line, hence
# lines=True). Its "name" and "czi_id" columns are used to build the
# package-name lookup.
df = pd.read_json("../../hackathon/pypi_with_mentions.ndjson", lines=True)

In [76]:
# Lookup from normalised package name to czi_id. Names are lower-cased and
# '-' / '.' are replaced by '_'. When several rows normalise to the same
# name, the last row wins.
# The two columns are iterated directly rather than with iterrows(), which
# builds a Series per row and does not preserve dtypes across the row.
czi_map = {
    str(name).lower().replace('-', '_').replace('.', '_'): czi_id
    for name, czi_id in zip(df["name"], df["czi_id"])
}

In [77]:
# For every DOI, translate its package names into czi_ids via czi_map.
# Names are normalised the same way as the czi_map keys (lower-case, '-' and
# '.' replaced by '_'). Names without a truthy czi_id are collected, in
# normalised form, in unknown_czi.
enduser_doi_to_czi_map = defaultdict(set)
unknown_czi = defaultdict(set)

for doi, packages in simplified_packages_by_doi.items():
    for raw_name in packages:
        normalized_name = raw_name.lower().replace('-', '_').replace('.', '_')
        czi_id = czi_map.get(normalized_name)
        if czi_id:
            enduser_doi_to_czi_map[doi].add(czi_id)
        else:
            unknown_czi[doi].add(normalized_name)


In [84]:
import json

# Persist the DOI -> czi_id mapping as JSON. Sets are not JSON-serialisable
# and have no stable iteration order, so each one is written as a sorted
# list to keep the file identical between runs. key=str keeps the sort
# well-defined even if the ids are not all of one type.
with open("enduser_doi_to_czi.json", "w", encoding="utf-8") as f:
    json.dump(
        {doi: sorted(czi_ids, key=str) for doi, czi_ids in enduser_doi_to_czi_map.items()},
        f,
    )


In [80]:
# All normalised package names, across DOIs, that have no czi_id.
# set().union(*...) also works when the mapping is empty, where reduce()
# without an initial value would raise TypeError.
set().union(*unknown_czi.values())

{'abipy',
 'abutils',
 'actonet',
 'adjusttext',
 'afqinsight',
 'aicsimageio',
 'aicssegmentation',
 'aiida',
 'allensdk',
 'amici',
 'anndata2ri',
 'apiclient',
 'appyter',
 'asaplib',
 'assimp',
 'astroml',
 'autoencoder',
 'autograd',
 'autoreload',
 'awkward',
 'awkwardql',
 'azureml',
 'baselines',
 'bayesflow',
 'bdpy',
 'beakerx',
 'bebi103',
 'benchml',
 'bertviz',
 'biocommons',
 'bioexplorer',
 'blackcellmagic',
 'bluegraph',
 'bluepyefe',
 'bluepysnap',
 'bluesearch',
 'bond',
 'bonesis',
 'bootstrapped',
 'botometer',
 'bqplot',
 'brainiak',
 'brainsmash',
 'brainspace',
 'branca',
 'bravado',
 'brewer2mpl',
 'brian2tools',
 'cabean',
 'calour',
 'capalyzer',
 'casq',
 'celerite',
 'celluloid',
 'charcoal',
 'cimcb',
 'cirq',
 'clawpack',
 'clodsa',
 'cogent3',
 'colicoords',
 'common',
 'condacolab',
 'configspace',
 'constants',
 'convis',
 'cotengra',
 'croissance',
 'crossref',
 'ctk',
 'cuml',
 'cupy',
 'cylouvain',
 'cyrand',
 'cytopy',
 'datapackage',
 'datashader',