In [91]:
from nbiatoolkit import NBIAClient
import os
import pandas as pd
from dotenv import load_dotenv
from rich import print

# Query the NBIA API

## Environment Variables:
Create a `.env` file in your project directory and add your NBIA credentials to it as environment variables:

```
NBIA_USERNAME=<username>
NBIA_PASSWORD=<password>
```

NOTE: This notebook was developed using bhklab's account credentials.


In [9]:
# Load variables from a local `.env` file into the process environment, then
# read the NBIA credentials from it (each is None when the variable is unset).
load_dotenv()
USERNAME = os.getenv("NBIA_USERNAME")
PASSWORD = os.getenv("NBIA_PASSWORD")

# Client used by every query below. `return_type="dataframe"` requests
# DataFrame results; DEBUG logging echoes each endpoint and its parameters.
client = NBIAClient(username=USERNAME, password=PASSWORD, return_type="dataframe", log_level="DEBUG")

24-11-14 14:21 | NBIAClient | DEBUG | Setting up OAuth2 client... with username bhaibeka


In [10]:
# Fetch every collection available through the NBIA API with its patient count.
collections = client.getCollectionPatientCount()
# End the cell on the frame itself so Jupyter renders it as a rich table
# instead of the plain-text repr that print() produces.
collections

24-11-14 14:21 | NBIAClient | DEBUG | Querying API endpoint: https://services.cancerimagingarchive.net/nbia-api/services/getCollectionValuesAndCounts
24-11-14 14:21 | NBIAClient | DEBUG | Query parameters: {}


                        Collection PatientCount
0                          4D-Lung           20
1                       ACRIN-6698          385
2    ACRIN-Contralateral-Breast-MR          984
3                 ACRIN-FLT-Breast           83
4              ACRIN-NSCLC-FDG-PET          242
..                             ...          ...
131                      TCGA-UCEC           65
132                      UPENN-GBM          630
133                         VICTRE         2994
134    Vestibular-Schwannoma-MC-RC          124
135      Vestibular-Schwannoma-SEG          242

[136 rows x 2 columns]


## Grant Table 1: Data

### BHKLAB 

- QIN-HEADNECK
- Head-Neck-PET-CT
- CPTAC-HNSCC
- HNSCC
- HEAD-NECK-RADIOMICS-HN1
- HNSCC-3DCT-RT
- OPC-Radiomics
- TCGA-HNSC
- STRUCTSEG19
- 18F-FDG PET Radiomics Risk Challenge
- HPV Prediction Challenge
- PDDCA
- RADCURE



In [None]:
# Head-and-neck datasets listed under "Grant Table 1: Data".
# Every entry needs its trailing comma: adjacent string literals are silently
# concatenated by Python, which previously merged "RADCURE" and
# "OPC-Radiomics" into the single bogus name "RADCUREOPC-Radiomics".
datasets = [
    "QIN-HEADNECK",
    "Head-Neck-PET-CT",
    "CPTAC-HNSCC",
    "HNSCC",
    "HEAD-NECK-RADIOMICS-HN1",
    "HNSCC-3DCT-RT",
    "TCGA-HNSC",
    "RADCURE",
    "OPC-Radiomics",
    "STRUCTSEG19",
    "18F-FDG PET Radiomics Risk Challenge",
    "HPV Prediction Challenge",
    "PDDCA",
]

# Lower-cased collection names for case-insensitive *exact* matching.
# Substring matching would report "HNSCC" as present whenever
# "CPTAC-HNSCC" or "HNSCC-3DCT-RT" exists, even if "HNSCC" itself did not.
tcia_collection_names = set(collections["Collection"].str.lower())

# Datasets from the grant table that have no collection of the same name.
missing_from_tcia = [
    dataset for dataset in datasets if dataset.lower() not in tcia_collection_names
]

# Print the datasets that are missing
print("Datasets not found in TCIA collections:")
for dataset in missing_from_tcia:
    print(dataset)



Datasets not found in TCIA collections:
OPC-Radiomics
STRUCTSEG19
18F-FDG PET Radiomics Risk Challenge
HPV Prediction Challenge
PDDCA


## Notes:

### OPC-Radiomics

Note previously posted on the TCIA website:


> This collection has been deprecated.
> Data from the collection formerly called OPC-Radiomics has been updated. 
> The data are downloadable but no longer viewable in the Cancer Imaging Archive. 
> Please view the RADCURE page to obtain access to the updated data: https://doi.org/10.7937/J47W-NM11.


**Data Location**: 
`/cluster/projects/radiomics/PublicDatasets/HeadNeck/TCIA-OPC`
`/cluster/projects/radiomics/PublicDatasets/HeadNeck/TCIA_OPC-Radiomics`

**Source Link**: [https://www.cancerimagingarchive.net/collection/opc-radiomics/](https://www.cancerimagingarchive.net/collection/opc-radiomics/)

**Institution**: "TCIA"

### STRUCTSEG19

**Data Location**: `/cluster/projects/radiomics/PublicDatasets/HeadNeck/MICCAI_2019_STRUCTURESEG19`

**Source Link**: [https://structseg2019.grand-challenge.org](https://structseg2019.grand-challenge.org)

**Institution**: "MICCAI / Zhejiang Cancer Hospital"



### 18F-FDG PET Radiomics Risk Challenge

**Data Location**: `/cluster/projects/radiomics/PublicDatasets/HeadNeck/MICCAI_2018_18F-FDG_PET_Radiomics_Risk_Challenge`

**Source Link**: TODO (not yet recorded)

**Institution**: MICCAI / multi-institution

### HPV Prediction Challenge

`MICCAI_2016_HPV_Prediction_Challenge` 

**Data Location**: `/cluster/projects/radiomics/PublicDatasets/HeadNeck/MICCAI_2016_HPV_Prediction_Challenge`

**Source Link**: TODO (not yet recorded)

**Institution**: "MD Anderson CC / MICCAI"


### PDDCA

`A Public Domain Database for Computational Anatomy (PDDCA)`

**Data Location**: `/cluster/projects/radiomics/PublicDatasets/HeadNeck/A Public Domain Database for Computational Anatomy(PDDCA)`

**Source Link**: [https://www.imagenglab.com/newsite/pddca/](https://www.imagenglab.com/newsite/pddca/)

**Institution**: "Harvard / MICCAI"


In [17]:
# Subset collection dataframe to only include our desired datasets.
# Names are compared exactly (case-insensitively) rather than through an
# unescaped "a|b|c" regex, which matches substrings and would misinterpret any
# regex metacharacter appearing in a dataset name.
wanted_names = {name.lower() for name in datasets}

# Chain reset_index instead of mutating the filtered slice in place.
collections = collections[
    collections["Collection"].str.lower().isin(wanted_names)
].reset_index(drop=True)
collections

Unnamed: 0,Collection,PatientCount
0,CPTAC-HNSCC,133
1,HEAD-NECK-RADIOMICS-HN1,137
2,HNSCC,627
3,HNSCC-3DCT-RT,31
4,Head-Neck-PET-CT,298
5,QIN-HEADNECK,279
6,RADCURE,3346
7,TCGA-HNSC,227


In [131]:
# Build one summary row per (collection, modality) pair and save it as a TSV.
series_lists = []
# NOTE(review): never populated in this cell; kept only in case a later cell
# references the name -- TODO confirm and remove.
series_metadata_lists = []

for dataset in collections.Collection.unique():
    # All series records for this collection.
    series = client.getSeries(Collection=dataset)

    # Collection-wide patient count: identical for every modality row, so
    # compute it once per collection instead of once per modality.
    total_patients = series.PatientID.nunique()

    for modality in series.Modality.unique():
        modality_series = series[series.Modality == modality]

        # The BodyPartExamined column is not assumed to exist; fall back to an
        # empty list so the joined string below is simply "".
        if "BodyPartExamined" in modality_series.columns:
            bpe = sorted(modality_series.BodyPartExamined.dropna().unique())
        else:
            bpe = []

        series_lists.append(
            {
                "Collection": dataset,
                "TotalPatients": total_patients,
                "BodyPartExamined": ",".join(bpe),
                "Modality": modality,
                "PatientPerModality": modality_series.PatientID.nunique(),
                "SeriesPerModality": len(modality_series),
                # Vectorised sum (skips missing sizes); the division by
                # 1024 ** 3 assumes FileSize is in bytes -- TODO confirm.
                "totalSizeGB": round(modality_series.FileSize.sum() / 1024 ** 3, 2),
            }
        )

# create a dataframe from the list of dictionaries
series_metadata_df = pd.DataFrame(series_lists)
series_metadata_df.to_csv("series_metadata_df.tsv", index=False, header=True, sep="\t")

24-11-14 15:38 | NBIAClient | DEBUG | Parsing params: {'self': <nbiatoolkit.nbia.NBIAClient object at 0x1355c9b50>, 'Collection': 'CPTAC-HNSCC', 'PatientID': '', 'StudyInstanceUID': '', 'Modality': '', 'SeriesInstanceUID': '', 'BodyPartExamined': '', 'ManufacturerModelName': '', 'Manufacturer': '', 'return_type': None, 'returnType': <ReturnType.DATAFRAME: 'dataframe'>}
24-11-14 15:38 | NBIAClient | DEBUG | Querying API endpoint: https://services.cancerimagingarchive.net/nbia-api/services/v2/getSeries
24-11-14 15:38 | NBIAClient | DEBUG | Query parameters: {'Collection': 'CPTAC-HNSCC', 'returnType': <ReturnType.DATAFRAME: 'dataframe'>}
24-11-14 15:38 | NBIAClient | DEBUG | Parsing params: {'self': <nbiatoolkit.nbia.NBIAClient object at 0x1355c9b50>, 'Collection': 'HEAD-NECK-RADIOMICS-HN1', 'PatientID': '', 'StudyInstanceUID': '', 'Modality': '', 'SeriesInstanceUID': '', 'BodyPartExamined': '', 'ManufacturerModelName': '', 'Manufacturer': '', 'return_type': None, 'returnType': <ReturnTyp