In [2]:
import pandas as pd
import ijson

import ast
import os


In [4]:
filename = "nde_hub.nde_all_prod_20240214_b9kgi6tc.json"
data = []

with open(filename, 'r') as file:
    objects = ijson.items(file, 'item')
    for obj in objects:
        data.append(obj)

In [3]:
prod_data = pd.DataFrame(data)

In [4]:
prod_data.rename(columns={'name': 'Name', 'description': 'Description', 'date': 'Date', 'url':'URL', 'abstract': 'Abstract'}, inplace=True)


In [5]:
prod_data.head()

Unnamed: 0,_id,Date,Description,includedInDataCatalog,Name,URL,Abstract
0,VIVLI_00082385-f73e-472b-bf66-81f800e35b11,2022-07-07,The study will evaluate the immune response an...,{'name': 'Vivli'},"Dataset from An Open, Phase IV Study on the Im...",https://doi.org/10.25934/00000403,
1,VIVLI_0021f33a-7957-4fdc-b17b-c44697a1e4a4,2023-10-23,"This is a multicenter, randomized, open-label,...",{'name': 'Vivli'},"Dataset from A Multicenter, Randomized, Open-l...",https://doi.org/10.25934/PR00008329,
2,VIVLI_002af971-d0f3-46f7-bf23-3e0ff3458224,2022-07-05,This study will assess the efficacy and safety...,{'name': 'Vivli'},Dataset from An Open Label Study to Assess the...,https://doi.org/10.25934/00005745,
3,VIVLI_002bf970-2a5b-4dee-a2ef-1e78e269f132,2023-10-19,The purpose of this study is to determine if L...,{'name': 'Vivli'},Dataset from The Impact of LY2189265 Versus Me...,https://doi.org/10.25934/00004482,
4,VIVLI_0034e4ad-87f6-4bf8-b4e6-08840cf8b5b4,2022-07-07,"GSK2982772 is a first-in-class, highly selecti...",{'name': 'Vivli'},"Dataset from A Three Part, Non-randomized, Ope...",https://doi.org/10.25934/00007333,


In [6]:
def convert_dict_to_list(x):
    if isinstance(x, list):
        return x[0]['name']
    else:
        return x['name']


prod_data['includedInDataCatalog'] = prod_data['includedInDataCatalog'].apply(convert_dict_to_list)

In [7]:
from bs4 import BeautifulSoup

prod_data['Description'] = prod_data['Description'].fillna('').apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())

  prod_data['Description'] = prod_data['Description'].fillna('').apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
  prod_data['Description'] = prod_data['Description'].fillna('').apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())


In [8]:
grouped_dataframes = prod_data.groupby('includedInDataCatalog')

dataframes = []
for group, data in grouped_dataframes:
    data['includedInDataCatalog'] = group
    dataframes.append(data)


In [9]:
for df in dataframes:
    dataset_name = df['includedInDataCatalog'].values[0]
    dataset_length = len(df)
    memory_usage_bytes = df.memory_usage(deep=True).sum()
    memory_usage_gb = memory_usage_bytes / (1024 * 1024)
    print(f"Dataset Name: {dataset_name:<35} Number of Items: {dataset_length:<20} Memory usage: {memory_usage_gb:.2f} MB")
    

print("\nTotal Number of Datasets: ", len(dataframes))
print("Total Number of Items: ", len(prod_data))

Dataset Name: AccessClinicalData@NIAID            Number of Items: 7                    Memory usage: 0.03 MB
Dataset Name: ClinEpiDB                           Number of Items: 50                   Memory usage: 0.48 MB
Dataset Name: Data Discovery Engine               Number of Items: 368                  Memory usage: 0.35 MB
Dataset Name: Dryad Digital Repository            Number of Items: 54208                Memory usage: 206.08 MB
Dataset Name: Harvard Dataverse                   Number of Items: 80975                Memory usage: 78.15 MB
Dataset Name: HuBMAP                              Number of Items: 1948                 Memory usage: 1.14 MB
Dataset Name: ImmPort                             Number of Items: 721                  Memory usage: 1.29 MB
Dataset Name: LINCS                               Number of Items: 345                  Memory usage: 0.39 MB
Dataset Name: Mendeley                            Number of Items: 53828                Memory usage: 64.80 MB
Datase

In [10]:
directory = "nde_prod"
if not os.path.exists(directory):
    os.makedirs(directory)

for i, df in enumerate(dataframes):
    dataset_name = df['includedInDataCatalog'].unique()[0]
    filename = f"{directory}/{dataset_name}.csv"
    df.to_csv(filename, index=False, lineterminator='\n')


In [11]:
## Confirming correct number of samples

directory = "nde_prod"
dfs = []

for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath, lineterminator='\n')
        dfs.append(df)

for df in dfs:
    dataset_name = df['includedInDataCatalog'].values[0]
    dataset_length = len(df)
    memory_usage_bytes = df.memory_usage(deep=True).sum()
    memory_usage_gb = memory_usage_bytes / (1024 * 1024)
    print(f"Dataset Name: {dataset_name:<35} Number of Items: {dataset_length:<20} Memory usage: {memory_usage_gb:.2f} MB")

print("\nTotal Number of Datasets: ", len(dfs))
total_items = sum(len(df) for df in dfs)
print("Total Number of Items: ", total_items)


Dataset Name: HuBMAP                              Number of Items: 1948                 Memory usage: 1.08 MB
Dataset Name: VEuPathDB                           Number of Items: 3005                 Memory usage: 1.95 MB
Dataset Name: ImmPort                             Number of Items: 721                  Memory usage: 1.28 MB
Dataset Name: NCBI SRA                            Number of Items: 339001               Memory usage: 319.13 MB
Dataset Name: Harvard Dataverse                   Number of Items: 80975                Memory usage: 75.65 MB
Dataset Name: ReframeDB                           Number of Items: 139                  Memory usage: 0.64 MB
Dataset Name: Qiita                               Number of Items: 703                  Memory usage: 1.19 MB
Dataset Name: Vivli                               Number of Items: 7210                 Memory usage: 7.30 MB
Dataset Name: VDJServer                           Number of Items: 65                   Memory usage: 0.13 MB
Dataset

In [39]:
sorted_dfs = sorted(dfs, key=lambda df: len(df))
print("------------------------SORTED------------------------")
for df in sorted_dfs:
    dataset_name = df['includedInDataCatalog'].values[0]
    dataset_length = len(df)
    memory_usage_bytes = df.memory_usage(deep=True).sum()
    memory_usage_gb = memory_usage_bytes / (1024 * 1024)
    print(f"Dataset Name: {dataset_name:<35} Number of Items: {dataset_length:<20} Memory usage: {memory_usage_gb:.2f} MB")

------------------------SORTED------------------------
Dataset Name: AccessClinicalData@NIAID            Number of Items: 7                    Memory usage: 0.03 MB
Dataset Name: ClinEpiDB                           Number of Items: 50                   Memory usage: 0.48 MB
Dataset Name: VDJServer                           Number of Items: 65                   Memory usage: 0.13 MB
Dataset Name: ReframeDB                           Number of Items: 139                  Memory usage: 0.64 MB
Dataset Name: LINCS                               Number of Items: 345                  Memory usage: 0.38 MB
Dataset Name: Data Discovery Engine               Number of Items: 368                  Memory usage: 0.34 MB
Dataset Name: Qiita                               Number of Items: 703                  Memory usage: 1.19 MB
Dataset Name: ImmPort                             Number of Items: 721                  Memory usage: 1.28 MB
Dataset Name: HuBMAP                              Number of Items

## Data Repository Samples

In [25]:
sampled_dataframes = []

for df in dataframes:
    sample_size = min(int(len(df) * 0.1), 25)
    if sample_size == 0:
        sampled_df = df
    else:
        sampled_df = df.sample(n=sample_size)
    print(df['includedInDataCatalog'].values[0], df.shape[0], sampled_df.shape[0])
    sampled_dataframes.append(sampled_df)

combined_df = pd.concat(sampled_dataframes).reset_index(drop=True)[['includedInDataCatalog', 'Name', 'Description']]
combined_df.rename(columns={'includedInDataCatalog': 'Data Repository'}, inplace=True)


AccessClinicalData@NIAID 7 7
ClinEpiDB 50 5
Data Discovery Engine 368 25
Dryad Digital Repository 54208 25
Harvard Dataverse 80975 25
HuBMAP 1948 25
ImmPort 721 25
LINCS 345 25
Mendeley 53828 25
NCBI GEO 210928 25
NCBI SRA 339001 25
Omics Discovery Index (OmicsDI) 1802736 25
Qiita 703 25
ReframeDB 139 13
VDJServer 65 6
VEuPathDB 3005 25
Vivli 7210 25
Zenodo 208043 25


In [29]:
combined_df.to_csv('sampled_data.csv', index=False, lineterminator='\n')