#### Early Onset Colorectal Cancer

In [1]:
import os
import json
import time
from pathlib import Path
from typing import List, Dict, Any, Tuple
import requests
import pandas as pd
import numpy as np
import pickle
import tarfile, gzip, shutil
from numpy.linalg import norm
from scipy.stats import mannwhitneyu
import matplotlib.pyplot as plt
from tqdm import tqdm
import zipfile



In [2]:
# Root URL that every GDC endpoint path is appended to.
GDC_API = "https://api.gdc.cancer.gov"
# Default TCGA cohorts: colon + rectum adenocarcinoma.
PROJECTS = ["TCGA-COAD", "TCGA-READ"]
# Default number of records requested per page by paginated queries.
PAGE_SIZE = 200

# Single shared HTTP session; every request it sends declares a JSON body.
SESSION = requests.Session()
SESSION.headers["Content-Type"] = "application/json"

##### Pull data & check outputs

In [4]:
## Handle pagination in GDC's API

def _paged_post(endpoint: str, payload: dict, page_size: int = PAGE_SIZE):
    """Yield every record from a paginated GDC API POST query.

    Args:
        endpoint: Endpoint name appended to ``GDC_API`` (e.g. ``"files"``).
        payload: JSON-serialisable request body (filters, fields, ...).
        page_size: Number of records requested per page.

    Yields:
        Each entry of ``data.hits`` from every page, in the order returned.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    url = f"{GDC_API}/{endpoint}"
    body = json.dumps(payload)  # identical for every page, so serialise once
    offset = 0
    while True:
        r = SESSION.post(url, params={"size": page_size, "from": offset},
                         data=body, timeout=120)
        r.raise_for_status()
        data = r.json().get("data", {})  # parse the response body only once
        hits = data.get("hits", [])
        if not hits:
            break
        yield from hits
        # Advance by what was actually received so no record is skipped if
        # the server returns fewer hits than requested.
        offset += len(hits)
        total = data.get("pagination", {}).get("total")
        # Only stop early when the server reported a total; without one,
        # keep paging until an empty page comes back.
        if total is not None and offset >= total:
            break

In [5]:
## Query GDC for list of all open-access mutation files (MAF) for COAD+READ

def find_open_maf_files(projects=None):
    """
    Fetch open-access masked somatic mutation (MAF) file metadata.

    Queries the ``files`` endpoint for WXS "Masked Somatic Mutation" files
    with open access that belong to the given projects.

    Args:
        projects: List of project ids to query; defaults to ``PROJECTS``.

    Returns:
        DataFrame with one row per (file, case, sample) combination and the
        columns file_id, file_name, md5sum, created_datetime,
        updated_datetime, workflow_type, case_id, case_submitter_id, project,
        sample_id, sample_submitter_id, sample_type. The frame is empty (but
        still has these columns) when the query matches nothing.
    """
    projects = projects or PROJECTS

    # Every criterion is an "in" clause; all of them must match ("and").
    criteria = [
        ("cases.project.project_id", projects),
        ("files.data_category", ["Simple Nucleotide Variation"]),
        ("files.data_type", ["Masked Somatic Mutation"]),
        ("files.experimental_strategy", ["WXS"]),
        ("files.access", ["open"]),
    ]
    filters = {
        "op": "and",
        "content": [
            {"op": "in", "content": {"field": f, "value": v}}
            for f, v in criteria
        ],
    }

    fields = [
        "file_id", "file_name", "md5sum", "created_datetime", "updated_datetime",
        "analysis.workflow_type",
        "cases.case_id", "cases.submitter_id", "cases.project.project_id",
        "cases.samples.sample_id", "cases.samples.sample_type", "cases.samples.submitter_id"
    ]
    payload = {"filters": filters, "fields": ",".join(fields), "format": "JSON"}

    # Fixed column list so an empty result still has the expected schema;
    # without it drop_duplicates() below raises KeyError on a column-less frame.
    columns = [
        "file_id", "file_name", "md5sum", "created_datetime", "updated_datetime",
        "workflow_type",
        "case_id", "case_submitter_id", "project",
        "sample_id", "sample_submitter_id", "sample_type",
    ]

    # Flatten the nested file -> cases -> samples structure into flat rows.
    rows = []
    for hit in _paged_post("files", payload):
        base = {
            "file_id": hit.get("file_id"),
            "file_name": hit.get("file_name"),
            "md5sum": hit.get("md5sum"),
            "created_datetime": hit.get("created_datetime"),
            "updated_datetime": hit.get("updated_datetime"),
            # "or {}" also covers an explicit null, not just a missing key.
            "workflow_type": (hit.get("analysis") or {}).get("workflow_type"),
        }
        for case in hit.get("cases") or []:
            # Missing, null or empty sample lists still produce one row.
            for s in case.get("samples") or [{}]:
                rows.append({
                    **base,
                    "case_id": case.get("case_id"),
                    "case_submitter_id": case.get("submitter_id"),
                    "project": (case.get("project") or {}).get("project_id"),
                    "sample_id": s.get("sample_id"),
                    "sample_submitter_id": s.get("submitter_id"),
                    "sample_type": s.get("sample_type"),
                })

    return (
        pd.DataFrame(rows, columns=columns)
        .drop_duplicates(["file_id", "sample_id", "case_id"])
        .reset_index(drop=True)
    )

In [6]:
## Use the '/data' bulk endpoint to actually download MAFs by GDC file_id
## MAFs are very large, use manifest metadata first to select subset of MAFs to download

def download_files(file_ids, out_dir: str = "data/maf", batch_size: int = 20):
    """
    Download one or more files from the GDC /data endpoint into out_dir.

    File ids are requested in batches; each batch response is streamed to a
    temporary ``.part`` file and renamed into place only once fully written,
    so an interrupted download never leaves a truncated file under its final
    name.

    Args:
        file_ids: A single file id or an iterable of file ids.
        out_dir: Target directory; created if it does not exist.
        batch_size: Number of file ids requested per HTTP call.

    Returns:
        List of ``pathlib.Path`` objects, one per batch response written.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        requests.HTTPError: If a batch request returns an error status.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    # A bare string would otherwise be iterated character by character.
    if isinstance(file_ids, str):
        file_ids = [file_ids]
    file_ids = list(file_ids)

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    paths = []

    for i in range(0, len(file_ids), batch_size):  # fetch in small batches
        chunk = file_ids[i:i + batch_size]
        url = f"{GDC_API}/data/{','.join(chunk)}"

        # The context manager releases the connection even if writing fails.
        with SESSION.get(url, stream=True, timeout=600) as r:
            r.raise_for_status()

            # Use the server-provided name only when the header really
            # carries one, and keep just its final path component so the
            # file cannot be written outside out_dir.
            disposition = r.headers.get("Content-Disposition", "")
            fname = ""
            if "filename=" in disposition:
                fname = disposition.split("filename=")[-1].split(";")[0].strip().strip('"')
            fname = Path(fname).name
            if fname in ("", ".", ".."):
                fname = f"gdc_download_{i}.tar.gz" if len(chunk) > 1 else f"{chunk[0]}.maf.gz"

            fpath = out_dir / fname
            tmp_path = fpath.with_name(fpath.name + ".part")
            try:
                with open(tmp_path, "wb") as f:
                    for b in r.iter_content(1 << 20):  # 1 MB chunks
                        if b:
                            f.write(b)
                os.replace(tmp_path, fpath)
            except BaseException:
                # Do not leave a partial download behind.
                if tmp_path.exists():
                    tmp_path.unlink()
                raise

        paths.append(fpath)
        time.sleep(0.5)  # brief pause between batches

    return paths

In [7]:
## Use the 'cases' endpoint to retrieve age at diagnosis (in days, convert to years)

def fetch_case_ages(projects=None):
    """
    Fetch mean age_at_diagnosis (years) per case for given TCGA projects.

    Ages come back from the ``cases`` endpoint in days and are converted to
    years by dividing by 365.25. A case with several diagnosis records gets
    the mean of its ages.

    Args:
        projects: List of project ids to query; defaults to ``PROJECTS``.

    Returns:
        DataFrame [case_id, submitter_id, project, age_at_diagnosis_years];
        empty (with the same columns) when no case matches.
    """
    projects = projects or PROJECTS

    filters = {
        "op": "and",
        "content": [
            {"op": "in", "content": {"field": "project.project_id", "value": projects}},
            {"op": "exists", "content": {"field": "diagnoses.age_at_diagnosis"}},
        ],
    }
    fields = [
        "case_id", "submitter_id", "project.project_id", "diagnoses.age_at_diagnosis"
    ]
    payload = {"filters": filters, "fields": ",".join(fields), "format": "JSON", "expand": "diagnoses"}

    columns = ["case_id", "submitter_id", "project", "age_at_diagnosis_years"]

    # One row per (case, diagnosis record); non-numeric ages become None.
    rows = [
        {
            "case_id": h["case_id"],
            "submitter_id": h.get("submitter_id"),
            "project": h["project"]["project_id"],
            "age_at_diagnosis_years": (
                dx["age_at_diagnosis"] / 365.25 if isinstance(dx.get("age_at_diagnosis"), (int, float)) else None
            ),
        }
        for h in _paged_post("cases", payload)
        for dx in (h.get("diagnoses") or [])  # tolerate a null diagnoses list
    ]

    # groupby() on a column-less frame raises KeyError, so return the empty
    # schema directly when nothing matched.
    if not rows:
        return pd.DataFrame(columns=columns)

    ages = pd.DataFrame(rows, columns=columns)
    # Force a float column: if every age is None the column would be object
    # dtype, which "mean" cannot aggregate reliably.
    ages["age_at_diagnosis_years"] = pd.to_numeric(
        ages["age_at_diagnosis_years"], errors="coerce"
    )

    return (
        ages
        .groupby(["case_id", "submitter_id", "project"], as_index=False)
        .agg(age_at_diagnosis_years=("age_at_diagnosis_years", "mean"))
    )

In [8]:
## Categorize cases into early/late onset groups by age at diagnosis

def label_early_late(age_years: float, early_max: float = 40, late_min: float = 70) -> str:
    """
    Categorize age into early / late / unknown onset groups.

    Args:
        age_years: Age at diagnosis in years; may be None/NaN.
        early_max: Inclusive upper age bound for "early_onset".
        late_min: Inclusive lower age bound for "late_onset".

    Returns:
        "early_onset" if age_years <= early_max, "late_onset" if
        age_years >= late_min, otherwise "unknown". Missing ages and ages
        strictly between the two cut-offs both map to "unknown". The early
        check runs first, so it wins if the two ranges overlap.
    """
    if pd.isna(age_years):
        return "unknown"
    if age_years <= early_max:
        return "early_onset"
    if age_years >= late_min:
        return "late_onset"
    return "unknown"

def build_age_table():
    """
    Build the per-case age table with an onset-group label attached.

    Calls fetch_case_ages() and adds a "group" column obtained by applying
    label_early_late to each age.

    Returns DataFrame with: case_id, submitter_id, project,
    age_at_diagnosis_years, group
    """
    age_table = fetch_case_ages()
    onset_labels = age_table["age_at_diagnosis_years"].apply(label_early_late)
    age_table["group"] = onset_labels
    return age_table