In [1]:
from nbiatoolkit import NBIAClient
import os
import pandas as pd
from dotenv import load_dotenv
from rich import print
from pathlib import Path
from collections import defaultdict
import re

ModuleNotFoundError: No module named 'nbiatoolkit'

# Query the NBIA API

## Environment Variables:
Create a `.env` file in your project directory and add your NBIA credentials to it as environment variables:

```
NBIA_USERNAME=<username>
NBIA_PASSWORD=<password>
```

NOTE: This notebook was developed using bhklab's account credentials.


In [None]:
# Read the .env file (if any) into the process environment, then pick up the
# NBIA credentials from it.
load_dotenv()
USERNAME = os.getenv("NBIA_USERNAME")
PASSWORD = os.getenv("NBIA_PASSWORD")

# Single client reused by the cells below; configured with
# return_type="dataframe" and DEBUG-level logging.
client = NBIAClient(
    username=USERNAME,
    password=PASSWORD,
    return_type="dataframe",
    log_level="DEBUG",
)

In [None]:
# Cache the collection / patient-count table on disk so that re-running the
# notebook does not have to query the API again.
collection_path = Path("data/nbia_collections.tsv")

if collection_path.exists():
    collections = pd.read_csv(collection_path, sep="\t")
else:
    collections = client.getCollectionPatientCount()
    # to_csv does not create missing parent directories, and "data/" is not
    # created anywhere before this cell on a fresh checkout.
    collection_path.parent.mkdir(parents=True, exist_ok=True)
    collections.to_csv(collection_path, sep="\t", index=False)

print(collections)

## Grant Table 1: Data

### BHKLAB 

- QIN-HEADNECK
- Head-Neck-PET-CT
- CPTAC-HNSCC
- HNSCC
- HEAD-NECK-RADIOMICS-HN1
- HNSCC-3DCT-RT
- OPC-Radiomics
- TCGA-HNSC
- STRUCTSEG19
- 18F-FDG PET Radiomics Risk Challenge
- HPV Prediction Challenge
- PDDCA
- RADCURE



In [None]:
# Dataset names requested for the grant table, grouped by lab.
bhklab = [
    "QIN-HEADNECK",
    "Head-Neck-PET-CT",
    "CPTAC-HNSCC",
    "HNSCC",
    "HEAD-NECK-RADIOMICS-HN1",
    "HNSCC-3DCT-RT",
    "TCGA-HNSC",
    "RADCURE",
    "OPC-Radiomics",
    "STRUCTSEG19",
    "18F-FDG PET Radiomics Risk Challenge",
    "HPV Prediction Challenge",
    "PDDCA",
]
wanglab = [
    "CPTAC-CCRCC",
    "CPTAC-PDA",
    "CPTAC-UCEC",
    "CT Lymph Nodes",
    "TCGA-BLCA",
    "TCGA-KIRC",
    "TCGA-LIHC",
    "TCGA-OV",
    "TCGA-STAD",
    "Pancreas-CT",
    "ct org",
    "KiTS",
    "Pancreatic-CT-CBCT-SEG",
    "CPTAC-SAR",
    "TCGA-KICH",
    "TCGA-KIRP",
]

# lab name -> dataset names with no matching entry in the "Collection" column
missing_from_tcia = defaultdict(list)

for lab, wanted in (("bhklab", bhklab), ("wanglab", wanglab)):
    for dataset in wanted:
        # Case-insensitive *substring* match. regex=False keeps the dataset
        # name literal, so characters such as "(" or "+" in a future name
        # cannot be misread as regex syntax; na=False treats a missing
        # collection name as "no match".
        found = collections["Collection"].str.contains(
            dataset, case=False, regex=False, na=False
        ).any()
        if not found:
            missing_from_tcia[lab].append(dataset)

# Print the datasets that are missing
print("Datasets not found in TCIA collections:")
print(missing_from_tcia)


## Notes :

### OPC-Radiomics

Note previously shown on the TCIA website:


> This collection has been deprecated.
> Data from the collection formerly called OPC-Radiomics has been updated. 
> The data are downloadable but no longer viewable in the Cancer Imaging Archive. 
> Please view the RADCURE page to obtain access to the updated data: https://doi.org/10.7937/J47W-NM11.


**Data Location**: 
`/cluster/projects/radiomics/PublicDatasets/HeadNeck/TCIA-OPC`
`/cluster/projects/radiomics/PublicDatasets/HeadNeck/TCIA_OPC-Radiomics`

**Source Link**: [https://www.cancerimagingarchive.net/collection/opc-radiomics/](https://www.cancerimagingarchive.net/collection/opc-radiomics/)

**Institution**: "TCIA"

### STRUCTSEG19

**Data Location**: `/cluster/projects/radiomics/PublicDatasets/HeadNeck/MICCAI_2019_STRUCTURESEG19`

**Source Link**: [https://structseg2019.grand-challenge.org](https://structseg2019.grand-challenge.org)

**Institution**: "MICCAI / Zhejiang Cancer Hospital"



### 18F-FDG PET Radiomics Risk Challenge

**Data Location**: `/cluster/projects/radiomics/PublicDatasets/HeadNeck/MICCAI_2018_18F-FDG_PET_Radiomics_Risk_Challenge`

**Source Link**:

**Institution**: MICCAI / multi-institution

###  HPV Prediction Challenge

`MICCAI_2016_HPV_Prediction_Challenge` 

**Data Location**: `/cluster/projects/radiomics/PublicDatasets/HeadNeck/MICCAI_2016_HPV_Prediction_Challenge`

**Source Link**:

**Institution**: "MD Anderson CC / MICCAI"


### PDDCA

`A Public Domain Database for Computational Anatomy (PDDCA)`

**Data Location**: `/cluster/projects/radiomics/PublicDatasets/HeadNeck/A Public Domain Database for Computational Anatomy(PDDCA)`

**Source Link**: [https://www.imagenglab.com/newsite/pddca/](https://www.imagenglab.com/newsite/pddca/)

**Institution**: "Harvard / MICCAI"


In [None]:
# Subset collection dataframe to only include our desired datasets
datasets = bhklab + wanglab

# Regex alternation takes the first alternative that matches at a position, so
# a short name listed before a longer one it prefixes ("HNSCC" before
# "HNSCC-3DCT-RT") would win and mislabel the longer collection. Trying the
# longest names first avoids that; re.escape keeps every name literal.
dataset_pattern = "|".join(
    re.escape(name) for name in sorted(datasets, key=len, reverse=True)
)

# na=False: rows without a collection name are simply not selected.
is_wanted = collections["Collection"].str.contains(
    dataset_pattern, case=False, na=False
)
collections = collections[is_wanted].copy()

# Record which of our dataset names each TCIA collection was matched to.
collections["mycollection_name"] = collections["Collection"].str.extract(
    f"({dataset_pattern})", flags=re.IGNORECASE, expand=False
)
collections = collections.reset_index(drop=True)
collections

In [None]:
# series_lists receives one summary record per (collection, modality);
# series_metadata_lists is initialised alongside it.
series_lists, series_metadata_lists = [], []


def clean_name(name):
    """Return ``name`` with every space and hyphen replaced by an underscore."""
    return name.translate(str.maketrans(" -", "__"))


# Directory holding one cached series table (TSV) per collection.
data_path = Path("data") / "collections"
data_path.mkdir(exist_ok=True, parents=True)


def _load_or_fetch_series(dataset):
    """Return the series table for ``dataset``, preferring the on-disk cache.

    If ``data_path`` already holds a TSV for the collection it is read back;
    otherwise the table is requested with ``client.getSeries`` and written to
    the cache before being returned.
    """
    file_path = data_path / f"{clean_name(dataset)}.tsv"
    if file_path.exists():
        return pd.read_csv(file_path, sep="\t")

    print(f"Would need to get series for {dataset}")
    series = client.getSeries(Collection=dataset)
    series.to_csv(file_path, sep="\t", index=False)
    return series


def _summarise_series(dataset, series):
    """Build one summary record (dict) per modality present in ``series``.

    ``TotalPatients`` and ``BodyPartExamined`` describe the whole collection
    and are therefore identical on every record of that collection; the
    remaining fields describe the rows of a single modality.
    """
    # Collection-wide values are computed once, before the modality loop, so
    # they cannot depend on the order in which modalities are visited.
    if "BodyPartExamined" in series.columns:
        # sorted() makes the joined label reproducible between runs.
        body_parts = sorted(
            {str(part) for part in series.BodyPartExamined.dropna().unique()}
        )
    else:
        body_parts = []
    body_part_label = ", ".join(body_parts)
    total_patients = series.PatientID.nunique(dropna=False)

    records = []
    for modality in series.Modality.unique():
        modality_series = series[series.Modality == modality]
        # skipna=False: a missing file size yields NaN for the total instead
        # of a silently under-counted number.
        size_bytes = modality_series.FileSize.sum(skipna=False)
        records.append(
            {
                "Collection": dataset,
                "TotalPatients": total_patients,
                "BodyPartExamined": body_part_label,
                "Modality": modality,
                "PatientPerModality": modality_series.PatientID.nunique(dropna=False),
                "SeriesPerModality": len(modality_series),
                "totalSizeGB": round(size_bytes / 1024 ** 3, 2),
            }
        )
    return records


for dataset in collections.Collection.unique():
    series_lists.extend(_summarise_series(dataset, _load_or_fetch_series(dataset)))

# create a dataframe from the list of dictionaries
series_metadata_df = pd.DataFrame(series_lists)
series_metadata_df.to_csv("data/series_metadata_df.tsv", index=False, header=True, sep="\t")

In [None]:
# Combine every cached per-collection series table into a single frame.
# The file list is sorted so the read order does not depend on how the
# filesystem happens to list the directory.
series_files = sorted(data_path.glob("*.tsv"))
if not series_files:
    # pd.concat on an empty list only reports "No objects to concatenate";
    # say what is actually missing instead.
    raise FileNotFoundError(f"No cached series tables (*.tsv) found in {data_path}")

dataframes = [pd.read_csv(file, sep="\t") for file in series_files]

concatenated_df = (
    pd.concat(dataframes, axis=0, ignore_index=True, sort=False)
    .fillna("N/A")
    .sort_values(by=["Collection", "PatientID"])
)
concatenated_df

In [None]:
"""
'SeriesInstanceUID', 'StudyInstanceUID', 'Modality', 'SeriesDate',
       'SeriesDescription', 'BodyPartExamined', 'SeriesNumber', 'Collection',
       'PatientID', 'ImageCount', 'TimeStamp', 'LicenseName', 'LicenseURI',
       'CollectionURI', 'FileSize', 'DateReleased', 'StudyDesc', 'StudyDate',
       'ThirdPartyAnalysis', 'SoftwareVersions', 'Manufacturer',
       'ManufacturerModelName', 'ProtocolName', 'AnnotationsFlag'
"""

# Columns carried over to the exported table, in output order. The string
# above lists a wider set of column names for reference.
columns_of_interest = [
    "Collection", "PatientID", "StudyInstanceUID",
    "Modality", "SeriesInstanceUID", "SeriesNumber", "SeriesDate",
    "BodyPartExamined", "ImageCount", "TimeStamp",
    "CollectionURI", "FileSize", "DateReleased",
]

# Select the columns, renumber the rows from 0, and write the result as TSV.
subset_df = concatenated_df.loc[:, columns_of_interest].reset_index(drop=True)
subset_df.to_csv("data/AllSeries.tsv", sep="\t", index=False)