In [1]:
import json
import pandas as pd
import io
from random import randint
import webbrowser
import numpy as np
import matplotlib.pyplot as plt
from random import seed
from random import sample
import requests

In [2]:
# Metadata fields to request for every file record.
fields = [
    # file-level attributes
    "file_name",
    # case-level attributes
    "cases.case_id",
    "cases.primary_site",
    "cases.diagnoses.primary_diagnosis",
    "cases.project.disease_type",
    "cases.samples.is_ffpe",
    # more file-level attributes
    "md5sum",
    "file_size",
    "state",
    # diagnosis details
    "cases.diagnoses.tissue_or_organ_of_origin",
    "cases.diagnoses.morphology",
    "experimental_strategy",
]

In [3]:
# Requested metadata fields, collapsed into the single comma-separated
# string that is later sent as the "fields" request parameter.
# The names are listed in full here so the cell can be re-run safely.
fields = ",".join(
    (
        "file_name",
        "cases.case_id",
        "cases.primary_site",
        "cases.diagnoses.primary_diagnosis",
        "cases.project.disease_type",
        "cases.samples.is_ffpe",
        "md5sum",
        "file_size",
        "state",
        "cases.diagnoses.tissue_or_organ_of_origin",
        "cases.diagnoses.morphology",
        "experimental_strategy",
    )
)

# URL of the GDC API "files" endpoint that the query is sent to.
files_endpt = "https://api.gdc.cancer.gov/files"

# Two "in" conditions combined under a top-level "and": the project's primary
# site must be one of the three listed values, and the data format must be SVS.
filters = {
    "op": "and",
    "content": [
        {
            "op": "in",
            "content": {
                "field": "cases.project.primary_site",
                "value": ["Lung", "Colorectal", "Prostate"],
            },
        },
        {
            "op": "in",
            "content": {
                "field": "files.data_format",
                "value": ["SVS"],
            },
        },
    ],
}
# filters = {
#        "op": "=",
#        "content":{
#             "field": "files.data_format",
#             "value": ["SVS"]
#        }
#     }

# A POST is used, so the filter parameters can be passed directly as a Dict object.
# "size" is presumably the maximum number of records to return -- confirm against the GDC API docs.
params = {
    "filters": filters,
    "fields": fields,
    "format": "CSV",
    "size": "50000"
    }

# The parameters are passed to 'json' rather than 'params' in this case.
# The (connect, read) timeout in seconds stops the cell from hanging forever
# if the server never answers; requests applies no timeout by default.
response = requests.post(
    files_endpt,
    headers={"Content-Type": "application/json"},
    json=params,
    timeout=(10, 300),
)

# Raise an HTTPError on a 4xx/5xx reply so that an error body is never
# handed to the CSV parser further down as if it were data.
response.raise_for_status()

In [4]:
# Echo the Response object; its repr includes the HTTP status code of the POST.
response

<Response [200]>

In [5]:
# Parse the CSV body of the API response into a DataFrame.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0
# (it now raises TypeError). `on_bad_lines='warn'` is the replacement that
# keeps the earlier behaviour: malformed rows are dropped and a warning is issued.
urlData = response.content
rawData = pd.read_csv(io.StringIO(urlData.decode('utf-8')), on_bad_lines='warn')
print(rawData.shape)

(6364, 14)


In [6]:
# Sum of the file_size column scaled by 1e-12 (i.e. terabytes, presuming
# file_size is in bytes -- TODO confirm against the GDC docs).
# Vectorised multiplication replaces the per-element Python lambda; every
# element is scaled before summing, exactly as before.
(rawData['file_size'] * 1e-12).sum()

2.406884266529

In [7]:
# Distinct values found in the experimental_strategy column.
pd.unique(rawData["experimental_strategy"])

array(['Tissue Slide', 'Diagnostic Slide'], dtype=object)

In [8]:
# Display the DataFrame parsed from the API response for a visual check of its rows and columns.
rawData

Unnamed: 0,cases.0.project.disease_type,cases.0.samples.0.is_ffpe,cases.0.diagnoses.0.morphology,file_name,md5sum,cases.0.diagnoses.0.primary_diagnosis,cases.0.primary_site,cases.0.samples.1.is_ffpe,state,file_size,cases.0.diagnoses.0.tissue_or_organ_of_origin,cases.0.case_id,id,experimental_strategy
0,Lung Squamous Cell Carcinoma,False,8070/3,TCGA-77-8153-01A-01-TS1.f808f5c4-36d7-4ef7-ac8...,69f60388d4fa95ea1e09bcda8aea0375,"Squamous cell carcinoma, NOS",Bronchus and lung,,released,220927629,"Lower lobe, lung",3336ee77-a6f6-4d3a-93f9-585ebc48ffba,e29cb891-954f-4f98-8c62-e67465f67a3c,Tissue Slide
1,Lung Squamous Cell Carcinoma,False,8070/3,TCGA-56-8622-11A-01-TS1.47057b2f-3b19-4f6d-bff...,44e8644bc086c721c86557d99290b9b7,"Squamous cell carcinoma, NOS",Bronchus and lung,,released,112296195,"Lower lobe, lung",e731fed6-be23-4b19-8550-6e500613c468,b3acafb9-7abe-478a-9795-a09d31229f03,Tissue Slide
2,Lung Squamous Cell Carcinoma,False,8070/3,TCGA-37-4130-01A-01-TS1.c9786e01-c7e1-4ed3-a2b...,ec273d9f759de29ba62a7bc0af05c336,"Squamous cell carcinoma, NOS",Bronchus and lung,,released,139886571,"Upper lobe, lung",ac89e063-a288-4516-9d9e-92cc0ed2bc24,5dfcdf0d-eeb2-488f-b1d7-daf063131e62,Tissue Slide
3,Lung Squamous Cell Carcinoma,True,8070/3,TCGA-52-7810-01Z-00-DX1.54fcdfff-c1ac-4427-bb3...,8eecb830331230e7a90c41b22310f48b,"Squamous cell carcinoma, NOS",Bronchus and lung,,released,1028326939,Overlapping lesion of lung,98b86f94-9ede-45ab-ba09-19a6ec6d9494,b5cf01fd-ab75-4a4e-907d-e30d2b263258,Diagnostic Slide
4,Lung Adenocarcinoma,False,8140/3,TCGA-49-6767-01A-01-BS1.cf0fa204-1f31-4935-bea...,d4b40349c432636e80e43049eb4341bb,"Adenocarcinoma, NOS",Bronchus and lung,,released,176966653,"Middle lobe, lung",6bffe800-ec2b-4638-9333-97fe85dcd91c,b984b76a-0fad-4491-a97c-2f198b3e9833,Tissue Slide
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6359,Prostate Adenocarcinoma,False,8140/3,TCGA-EJ-8472-11A-01-TS1.b0b6f52b-282a-48a7-ba2...,95d6098bef7c74ba779e77a9f4678bf8,"Adenocarcinoma, NOS",Prostate gland,,released,283658811,Prostate gland,ab378ad0-1881-403f-9e85-d6f97e7b7c81,292a61c0-2fad-4498-8a5b-18a19434d871,Tissue Slide
6360,Prostate Adenocarcinoma,True,8140/3,TCGA-XJ-A9DI-01Z-00-DX1.0A4BD633-9872-45EC-981...,0eae4bead7e4ebd51eedf6fc526897c6,"Adenocarcinoma, NOS",Prostate gland,,released,770075443,Prostate gland,39a5aebe-acad-4141-aa5a-66228c4049f8,ae51c4cb-cc6c-4018-a97b-f3a7ae9f9696,Diagnostic Slide
6361,Prostate Adenocarcinoma,True,8140/3,TCGA-EJ-5516-01Z-00-DX1.50003728-b68b-4e22-96d...,397730c71f4209232c66d70b1c9a8052,"Adenocarcinoma, NOS",Prostate gland,,released,545394905,Prostate gland,1fd06207-54e0-4aba-819c-c7f671871f39,7cfaa5d5-a624-469e-bb89-83cca2139dc4,Diagnostic Slide
6362,Prostate Adenocarcinoma,False,8140/3,TCGA-G9-6384-01A-01-BS1.6ce38e25-bc4c-4bf9-9a2...,f59fbbe89ee6bf219fe0edacc8ee43f4,"Adenocarcinoma, NOS",Prostate gland,,released,235103809,Prostate gland,a279d5aa-51f7-4e75-9e14-b206b97ec4bd,52509164-c0f2-4cdd-b92b-4b9493b9f20f,Tissue Slide


In [10]:
# Print every column label of the parsed table, one per line.
for column_name in rawData.columns.tolist():
    print(column_name)

In [13]:
# Full (untruncated) file name of the first row in the table.
rawData["file_name"].iloc[0]

'TCGA-77-8153-01A-01-TS1.f808f5c4-36d7-4ef7-ac80-a82b1f8f2ecf.svs'

In [17]:
# Select the row(s) whose file_name exactly matches one specific slide file.
rawData.loc[
    rawData["file_name"] == "TCGA-A6-3810-01B-03-BS3.A7C6B755-37C3-4E01-9FA8-93787D9FB702.svs"
]

Unnamed: 0,cases.0.project.disease_type,cases.0.samples.0.is_ffpe,cases.0.diagnoses.0.morphology,file_name,md5sum,cases.0.diagnoses.0.primary_diagnosis,cases.0.primary_site,cases.0.samples.1.is_ffpe,state,file_size,cases.0.diagnoses.0.tissue_or_organ_of_origin,cases.0.case_id,id,experimental_strategy
3494,Colon Adenocarcinoma,True,8140/3,TCGA-A6-3810-01B-03-BS3.A7C6B755-37C3-4E01-9FA...,1aeb0f3d9f76196f218ee3c2137edf6d,"Adenocarcinoma, NOS",Colon,,released,199885975,"Colon, NOS",b728e6e8-a0e4-496b-904c-6f07c12e901f,e4b21970-ba7d-43e9-a3ea-48bd2710d605,Tissue Slide
