In [None]:
!wget https://gdc.cancer.gov/files/public/file/gdc-client_v1.6.1_Ubuntu14.04_x64.zip
!unzip gdc-client

In [None]:
!chmod +x gdc-client

In [3]:
import requests
import json
import pandas as pd

def generate_ffpe_h5_manifest_all(project_id="TCGA-COAD",
                                  output_manifest="gdc_ffpe_coad_h5_manifest.tsv",
                                  page_size=500):
    """
    Query GDC for ALL .h5 files from FFPE TCGA-COAD samples, 
    page through results, and write a two‑column manifest TSV.

    Parameters:
    - project_id:      GDC project to query (default "TCGA-COAD")
    - output_manifest: output TSV filename (file_id<TAB>file_name)
    - page_size:       number of records per API page (max 2000)
    """
    endpoint = "https://api.gdc.cancer.gov/files"
    records = []
    offset = 0

    while True:
        # build filter tree
        filters = {
            "op": "and",
            "content": [
                { "op": "in",
                  "content": {
                      "field": "cases.project.project_id",
                      "value": [project_id]
                  }
                },
                { "op": "regexp",
                  "content": {
                      "field": "file_name",
                      "value": "\\.h5$"
                  }
                },
                # { "op": "in",
                #   "content": {
                #       "field": "cases.samples.sample_preservation_method",
                #       "value": ["FFPE"]
                #   }
                # }
            ]
        }

        params = {
            "filters": json.dumps(filters),
            "fields": "file_id,file_name",
            "format": "JSON",
            "size": page_size,
            "from": offset
        }

        resp = requests.get(endpoint, params=params)
        resp.raise_for_status()
        hits = resp.json()["data"]["hits"]
        if not hits:
            break

        for f in hits:
            records.append((f["file_id"], f["file_name"]))

        offset += len(hits)
        print(f"  → fetched {len(records)} records so far…")

    # write TSV
    df = pd.DataFrame(records, columns=["id", "file_name"])
    df.to_csv(output_manifest, sep="\t", index=False)
    print(f"Done: {len(records)} entries written to {output_manifest}")

generate_ffpe_h5_manifest_all()


Done: 0 entries written to gdc_ffpe_coad_h5_manifest.tsv


In [None]:
!gdc-client download -m gdc_ffpe_coad_h5_manifest.tsv -d TCGA_COAD_FFPE_H5