In [2]:
# --- Repo + path setup ---

import os, sys

# 1) Clone repo if it doesn't exist
if not os.path.exists('/content/bdh_challenge_2025'):
    %cd /content
    !git clone https://github.com/arionandrei2000/bdh_challenge_2025.git

# 2) Move into repo root
%cd /content/bdh_challenge_2025
print("PWD:", os.getcwd())
print("Repo contents:", os.listdir())

# 3) src is under notebooks/src, so add that parent folder to sys.path
src_parent = "/content/bdh_challenge_2025/notebooks"
if src_parent not in sys.path:
    sys.path.append(src_parent)

print("sys.path updated; ready to import src.")


/content
Cloning into 'bdh_challenge_2025'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 41 (delta 4), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (41/41), 10.34 KiB | 2.58 MiB/s, done.
Resolving deltas: 100% (4/4), done.
/content/bdh_challenge_2025
PWD: /content/bdh_challenge_2025
Repo contents: ['.git', 'notebooks', 'README.md']
sys.path updated; ready to import src.


In [6]:
# 01) Dependencies

# Core Python packages used in this notebook
# - pandas / numpy: data manipulation
# - pyarrow: for saving Parquet
# - requests: HTTP requests to GDC API
# - tqdm: progress bars
# - pyranges: may be useful later for genomic ranges
!pip install pandas numpy pyarrow requests tqdm pyranges --quiet

# Download GDC client
!wget https://gdc.cancer.gov/files/public/file/gdc-client_v1.6.1_Ubuntu_x64.zip -O gdc.zip -q
!unzip -o gdc.zip > /dev/null
!chmod +x gdc-client

print("Installed Python deps and downloaded gdc-client.")


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.6/1.6 MB[0m [31m136.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sorted_nearest (pyproject.toml) ... [?25l[?25hdone


In [24]:

# 02) Mount GDrive
# This lets us store large TCGA data in Drive so it persists across sessions

from google.colab import drive
drive.mount('/content/drive')

print(" Google Drive mounted at /content/drive")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Google Drive mounted at /content/drive


In [25]:
# 03) Project Paths in Google Dribe

from pathlib import Path

# Folder Google Drive where we'll store all TCGA data for this project.
PROJECT_ROOT = Path("/content/drive/MyDrive/bdh_challenge_2025_data")
PROJECT_ROOT.mkdir(exist_ok=True)

# Generic data directory
DATA_DIR = PROJECT_ROOT / "data"
DATA_DIR.mkdir(exist_ok=True)

# Directory where raw TCGA STAR count files will be downloaded
RNA_DIR = PROJECT_ROOT / "tcga_rna"
RNA_DIR.mkdir(exist_ok=True)

print("Project root:", PROJECT_ROOT)
print("Data dir    :", DATA_DIR)
print("RNA dir     :", RNA_DIR)



Project root: /content/drive/MyDrive/bdh_challenge_2025_data
Data dir    : /content/drive/MyDrive/bdh_challenge_2025_data/data
RNA dir     : /content/drive/MyDrive/bdh_challenge_2025_data/tcga_rna


In [26]:
# 04) Imports and TCGA cohort

import os
import json
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

# We explicitly list the TCGA projects we will include. THIS CAN BE CHANGED!

BULK_RNABERT_PROJECTS = [
    "TCGA-BRCA",  # Breast invasive carcinoma
    "TCGA-BLCA",  # Bladder urothelial carcinoma
    "TCGA-GBM",   # Glioblastoma multiforme
    "TCGA-LGG",   # Lower grade glioma
    "TCGA-LUAD",  # Lung adenocarcinoma
    "TCGA-UCEC",  # Uterine corpus endometrial carcinoma
]

print("Cohorts for this run:")
for p in BULK_RNABERT_PROJECTS:
    print(" -", p)


Cohorts for this run:
 - TCGA-BRCA
 - TCGA-BLCA
 - TCGA-GBM
 - TCGA-LGG
 - TCGA-LUAD
 - TCGA-UCEC


In [27]:

# 05) QUERY GDC FOR GENE EXPRESSION (STAR - COUNTS)

# We ask GDC:
#  - data_category: Transcriptome Profiling
#  - data_type: Gene Expression Quantification
#  - projects: the 6 TCGA projects above


files_query = {
    "filters": {
        "op": "and",
        "content": [
            {
                "op": "in",
                "content": {
                    "field": "cases.project.project_id",
                    "value": BULK_RNABERT_PROJECTS,
                },
            },
            {
                "op": "in",
                "content": {
                    "field": "data_category",
                    "value": ["Transcriptome Profiling"],
                },
            },
            {
                "op": "in",
                "content": {
                    "field": "data_type",
                    "value": ["Gene Expression Quantification"],
                },
            },
        ],
    },
    "format": "JSON",
    "size": 20000,
    "fields": (
        "file_id,file_name,"
        "cases.submitter_id,cases.project.project_id,"
        "data_category,data_type,analysis.workflow_type"
    ),
}

resp = requests.post(
    "https://api.gdc.cancer.gov/files",
    headers={"Content-Type": "application/json"},
    data=json.dumps(files_query),
)

resp_json = resp.json()
files_json = resp_json.get("data", {}).get("hits", [])

print("HTTP status:", resp.status_code)
print("Number of files for selected projects:", len(files_json))

if len(files_json) == 0:
    print(" GDC returned 0 files. Pagination/debug:")
    print(json.dumps(resp_json.get("data", {}).get("pagination", {}), indent=2))
    raise RuntimeError("GDC returned 0 files for this query. Adjust filters or check connection.")

# Convert JSON list to DataFrame
files_df = pd.json_normalize(files_json)

def _get_first_case(x):
    """Helper: for 'cases' field that is a list, return the first dict or {}."""
    if isinstance(x, list) and len(x) > 0 and isinstance(x[0], dict):
        return x[0]
    return {}

# Extract submitter_id and project_id from nested 'cases' field
files_df["submitter_id"] = files_df["cases"].apply(
    lambda x: _get_first_case(x).get("submitter_id", None)
)
files_df["project_id"] = files_df["cases"].apply(
    lambda x: _get_first_case(x).get("project_id", None)
)

print("Unique workflows in these files:")
print(files_df["analysis.workflow_type"].value_counts())

files_df[
    ["file_id", "file_name", "submitter_id", "project_id",
     "data_category", "data_type", "analysis.workflow_type"]
].head()


HTTP status: 200
Number of files for selected projects: 3777
Unique workflows in these files:
analysis.workflow_type
STAR - Counts    3777
Name: count, dtype: int64


Unnamed: 0,file_id,file_name,submitter_id,project_id,data_category,data_type,analysis.workflow_type
0,9dc09c86-c728-4bd9-b2b6-2d9962dad662,d1f1743c-5fd9-4ae8-90c2-8c3e2d475d1b.rna_seq.a...,TCGA-EW-A2FS,,Transcriptome Profiling,Gene Expression Quantification,STAR - Counts
1,95668f0b-130d-44d4-94c0-ba7a4e7798e6,6365a756-2e65-42cb-be4f-1f726915ca94.rna_seq.a...,TCGA-OL-A6VR,,Transcriptome Profiling,Gene Expression Quantification,STAR - Counts
2,461fda5d-d6e6-4354-b035-c302cc43b03f,30285113-d411-475a-a8ed-1fb66be72f28.rna_seq.a...,TCGA-E9-A226,,Transcriptome Profiling,Gene Expression Quantification,STAR - Counts
3,30ff778c-844b-4140-9025-7ab1938f10a9,5167da8c-2b1c-4139-a2c4-355e9f07d0be.rna_seq.a...,TCGA-A8-A08H,,Transcriptome Profiling,Gene Expression Quantification,STAR - Counts
4,427a04c9-9b48-49de-8a47-2adc4e1dd32a,fead73ce-2e66-4647-8b6c-b8bbdaaf30fe.rna_seq.a...,TCGA-D8-A27H,,Transcriptome Profiling,Gene Expression Quantification,STAR - Counts


In [30]:
# 06) Create MANIFEST & DOWNLOAD star counts TO GDrive

# GDC client expects a manifest file with an "id" column listing file_ids.

manifest = files_df[["file_id"]].rename(columns={"file_id": "id"})
manifest_path = PROJECT_ROOT / "manifest_star_counts.txt"
manifest.to_csv(manifest_path, sep="\t", index=False)

print("Manifest written to:", manifest_path)
print("Total files listed:", manifest.shape[0])

# Download all files into RNA_DIR (in Google Drive).
# If your internet drops or you interrupt, you can re-run this cell;
# gdc-client will skip files that are already complete.
!./gdc-client download -m {manifest_path} -d {RNA_DIR}

print(" Download finished (check tcga_rna/ in Google Drive).")


Manifest written to: /content/drive/MyDrive/bdh_challenge_2025_data/manifest_star_counts.txt
Total files listed: 3777
100% [#############################################] Time:  0:00:12 339.1 KiB/s 
100% [#############################################] Time:  0:00:12 341.9 KiB/s 
100% [#############################################] Time:  0:00:06 654.6 KiB/s 
100% [#############################################] Time:  0:00:06 613.1 KiB/s 
100% [#############################################] Time:  0:00:08 490.0 KiB/s 
[31mERROR: [0m('Connection aborted.', OSError("(104, 'ECONNRESET')"))
100% [#############################################] Time:  0:00:12 331.6 KiB/s 
100% [#############################################] Time:  0:00:11 361.6 KiB/s 
N/A% [                                               ] ETA:  --:--:--   0.0 s/B [31mERROR: [0mProcess stopped by user.
Process Process-16:
Traceback (most recent call last):
  File "gdc_client-1.6.1-py3.7.egg/gdc_client/parcel/client.py", li

In [33]:
# ============================
# 07) BUILD STAR COUNTS MATRIX (ROBUST + PARTIAL DOWNLOAD OK)
# ============================
# This version:
#  - Ignores non-count files (e.g., annotations.txt)
#  - Treats '#' lines as comments so header is parsed correctly
#  - Works even if only a subset of all files has finished downloading
#  - Optional MAX_FILES to test quickly on fewer samples

# Map file_id → submitter_id (sample ID)
id_to_sample = dict(zip(files_df["file_id"], files_df["submitter_id"]))
print("Number of file_id → sample mappings:", len(id_to_sample))

# Collect all expression file paths currently under RNA_DIR
all_files = []
for root, dirs, files in os.walk(RNA_DIR):
    for f in files:
        # GDC STAR count files are usually .tsv or .txt, but we will filter further below
        if f.endswith(".tsv") or f.endswith(".txt"):
            all_files.append(os.path.join(root, f))

print("Number of .tsv/.txt files on disk:", len(all_files))

# OPTIONAL: limit to a subset for quick tests
MAX_FILES = None  # e.g., set to 50 for debugging; None = use all
if MAX_FILES is not None:
    all_files = all_files[:MAX_FILES]
    print(f"Using only the first {len(all_files)} files for this run.")

gene_index = None
matrix = {}
skipped_no_geneid = []
skipped_non_counts = []

for fpath in tqdm(all_files):
    fname = os.path.basename(fpath)

    # Skip obvious non-count files (e.g., annotations.txt)
    # STAR gene count files typically contain 'rna_seq' and 'gene_counts' in the name.
    if "rna_seq" not in fname or "gene_counts" not in fname:
        skipped_non_counts.append(fname)
        continue

    # Folder name created by gdc-client is the file_id
    file_id = os.path.basename(os.path.dirname(fpath))
    sample = id_to_sample.get(file_id, None)
    if sample is None:
        # No mapping (shouldn't happen often); skip just in case
        skipped_non_counts.append(fname)
        continue

    # Treat '#' lines as comments so the first non-comment line is the header (gene_id, unstranded, ...)
    df = pd.read_csv(fpath, sep="\t", comment="#")

    if "gene_id" not in df.columns:
        skipped_no_geneid.append((fname, df.columns.tolist()))
        continue

    # Choose main count column
    if "unstranded" in df.columns:
        counts_col = "unstranded"
    else:
        numeric_cols = [c for c in df.columns if c != "gene_id"]
        if len(numeric_cols) == 0:
            skipped_no_geneid.append((fname, df.columns.tolist()))
            continue
        counts_col = numeric_cols[0]

    # Drop summary rows like '__no_feature', '__ambiguous', etc.
    df = df[~df["gene_id"].str.startswith("__")].reset_index(drop=True)

    # Initialize or check gene ordering
    if gene_index is None:
        gene_index = df["gene_id"].values
    else:
        if not np.array_equal(gene_index, df["gene_id"].values):
            # If orders differ (rare in harmonized GDC), you could sort by gene_id,
            # but here we assume harmonized outputs and raise.
            raise ValueError(f"Gene order mismatch in file: {fpath}")

    # Store counts for this sample
    matrix[sample] = df[counts_col].values

# Build counts matrix from whatever we actually loaded
expr_counts = pd.DataFrame(matrix, index=gene_index)
print("Counts matrix shape (genes x samples):", expr_counts.shape)
expr_counts.iloc[:5, :5]

print(f"Skipped {len(skipped_non_counts)} non-count/auxiliary files (e.g., annotations).")
print(f"Skipped {len(skipped_no_geneid)} files with no gene_id column.")


Number of file_id → sample mappings: 3777
Number of .tsv/.txt files on disk: 12


100%|██████████| 12/12 [00:02<00:00,  4.94it/s]

Counts matrix shape (genes x samples): (60664, 11)
Skipped 1 non-count/auxiliary files (e.g., annotations).
Skipped 0 files with no gene_id column.



