In [2]:
# --- Repo + path setup ---

import os, sys

# 1) Clone repo if it doesn't exist
if not os.path.exists('/content/bdh_challenge_2025'):
    %cd /content
    !git clone https://github.com/arionandrei2000/bdh_challenge_2025.git

# 2) Move into repo root
%cd /content/bdh_challenge_2025
print("PWD:", os.getcwd())
print("Repo contents:", os.listdir())

# 3) src is under notebooks/src, so add that parent folder to sys.path
src_parent = "/content/bdh_challenge_2025/notebooks"
if src_parent not in sys.path:
    sys.path.append(src_parent)

print("sys.path updated; ready to import src.")


/content
Cloning into 'bdh_challenge_2025'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 41 (delta 4), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (41/41), 10.34 KiB | 2.58 MiB/s, done.
Resolving deltas: 100% (4/4), done.
/content/bdh_challenge_2025
PWD: /content/bdh_challenge_2025
Repo contents: ['.git', 'notebooks', 'README.md']
sys.path updated; ready to import src.


In [6]:
!pip install pandas numpy pyarrow requests tqdm pyranges --quiet
!wget https://gdc.cancer.gov/files/public/file/gdc-client_v1.6.1_Ubuntu_x64.zip -O gdc.zip -q
!unzip -o gdc.zip > /dev/null
!chmod +x gdc-client


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.6/1.6 MB[0m [31m136.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sorted_nearest (pyproject.toml) ... [?25l[?25hdone


In [9]:
import os
import json
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
from pathlib import Path

PROJECT_ROOT = Path(".").resolve()

DATA_DIR = PROJECT_ROOT / "data"
DATA_DIR.mkdir(exist_ok=True)

RNA_DIR = PROJECT_ROOT / "tcga_rna"
RNA_DIR.mkdir(exist_ok=True)

print("Project root:", PROJECT_ROOT)
print("Data dir    :", DATA_DIR)
print("RNA dir     :", RNA_DIR)


Project root: /content/bdh_challenge_2025
Data dir    : /content/bdh_challenge_2025/data
RNA dir     : /content/bdh_challenge_2025/tcga_rna


In [21]:
from pathlib import Path

PROJECT_ROOT = Path("/content/drive/MyDrive/bdh_challenge_2025")
PROJECT_ROOT.mkdir(exist_ok=True)

DATA_DIR = PROJECT_ROOT / "data"
DATA_DIR.mkdir(exist_ok=True)

RNA_DIR = PROJECT_ROOT / "tcga_rna"
RNA_DIR.mkdir(exist_ok=True)

print("Project root:", PROJECT_ROOT)
print("Data dir    :", DATA_DIR)
print("RNA dir     :", RNA_DIR)


Project root: /content/drive/MyDrive/bdh_challenge_2025
Data dir    : /content/drive/MyDrive/bdh_challenge_2025/data
RNA dir     : /content/drive/MyDrive/bdh_challenge_2025/tcga_rna


In [18]:
# ============================
# 04) QUERY GDC FOR GENE EXPRESSION (ALL WORKFLOWS)
# ============================
import json, requests
import pandas as pd

files_query = {
    "filters": {
        "op": "and",
        "content": [
            {
                "op": "in",
                "content": {
                    "field": "cases.project.project_id",
                    "value": BULK_RNABERT_PROJECTS,
                },
            },
            {
                "op": "in",
                "content": {
                    "field": "data_category",
                    "value": ["Transcriptome Profiling"],
                },
            },
            {
                "op": "in",
                "content": {
                    "field": "data_type",
                    "value": ["Gene Expression Quantification"],
                },
            },
        ],
    },
    "format": "JSON",
    "size": 20000,
    "fields": (
        "file_id,file_name,cases.submitter_id,cases.project.project_id,"
        "data_category,data_type,analysis.workflow_type"
    ),
}

resp = requests.post(
    "https://api.gdc.cancer.gov/files",
    headers={"Content-Type": "application/json"},
    data=json.dumps(files_query),
)

resp_json = resp.json()
files_json = resp_json.get("data", {}).get("hits", [])

print("HTTP status:", resp.status_code)
print("Number of files for selected projects:", len(files_json))

if len(files_json) == 0:
    print("⚠️ GDC returned 0 files. Pagination/debug:")
    print(json.dumps(resp_json.get("data", {}).get("pagination", {}), indent=2))
    raise RuntimeError("GDC returned 0 files for this query. Adjust filters or check connection.")

files_df = pd.json_normalize(files_json)

def _get_first_case(x):
    if isinstance(x, list) and len(x) > 0 and isinstance(x[0], dict):
        return x[0]
    return {}

files_df["submitter_id"] = files_df["cases"].apply(
    lambda x: _get_first_case(x).get("submitter_id", None)
)
files_df["project_id"] = files_df["cases"].apply(
    lambda x: _get_first_case(x).get("project_id", None)
)

print("Unique workflows:")
print(files_df["analysis.workflow_type"].value_counts())

files_df[
    ["file_id", "file_name", "submitter_id", "project_id",
     "data_category", "data_type", "analysis.workflow_type"]
].head()


HTTP status: 200
Number of files for selected projects: 3777
Unique workflows:
analysis.workflow_type
STAR - Counts    3777
Name: count, dtype: int64


Unnamed: 0,file_id,file_name,submitter_id,project_id,data_category,data_type,analysis.workflow_type
0,9dc09c86-c728-4bd9-b2b6-2d9962dad662,d1f1743c-5fd9-4ae8-90c2-8c3e2d475d1b.rna_seq.a...,TCGA-EW-A2FS,,Transcriptome Profiling,Gene Expression Quantification,STAR - Counts
1,95668f0b-130d-44d4-94c0-ba7a4e7798e6,6365a756-2e65-42cb-be4f-1f726915ca94.rna_seq.a...,TCGA-OL-A6VR,,Transcriptome Profiling,Gene Expression Quantification,STAR - Counts
2,461fda5d-d6e6-4354-b035-c302cc43b03f,30285113-d411-475a-a8ed-1fb66be72f28.rna_seq.a...,TCGA-E9-A226,,Transcriptome Profiling,Gene Expression Quantification,STAR - Counts
3,30ff778c-844b-4140-9025-7ab1938f10a9,5167da8c-2b1c-4139-a2c4-355e9f07d0be.rna_seq.a...,TCGA-A8-A08H,,Transcriptome Profiling,Gene Expression Quantification,STAR - Counts
4,427a04c9-9b48-49de-8a47-2adc4e1dd32a,fead73ce-2e66-4647-8b6c-b8bbdaaf30fe.rna_seq.a...,TCGA-D8-A27H,,Transcriptome Profiling,Gene Expression Quantification,STAR - Counts


In [19]:
# ============================
# 05) CREATE MANIFEST & DOWNLOAD STAR COUNTS
# ============================

# GDC manifest expects a column called "id"
manifest = files_df[["file_id"]].rename(columns={"file_id": "id"})
manifest_path = PROJECT_ROOT / "manifest_star_counts.txt"
manifest.to_csv(manifest_path, sep="\t", index=False)

print("Manifest written:", manifest_path)
print("Total files listed:", manifest.shape[0])

# Download all files into RNA_DIR
!./gdc-client download -m {manifest_path} -d {RNA_DIR}

print("✅ Download finished (check tcga_rna/).")


Manifest written: /content/bdh_challenge_2025/manifest_star_counts.txt
Total files listed: 3777
100% [#############################################] Time:  0:00:07 524.4 KiB/s 
100% [#############################################] Time:  0:00:07 522.6 KiB/s 
100% [#############################################] Time:  0:00:06 624.1 KiB/s 
100% [#############################################] Time:  0:00:05 701.5 KiB/s 
100% [#############################################] Time:  0:00:07 524.2 KiB/s 
100% [#############################################] Time:  0:00:10 390.8 KiB/s 
100% [#############################################] Time:  0:00:09 419.4 KiB/s 
100% [#############################################] Time:  0:00:09 421.3 KiB/s 
100% [#############################################] Time:  0:00:06 608.1 KiB/s 
100% [#############################################] Time:  0:00:11 374.6 KiB/s 
100% [#############################################] Time:  0:00:07 570.6 KiB/s 
[31mERROR: 

In [20]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!./gdc-client download -m manifest_star_counts.txt -d {RNA_DIR}


In [None]:
# ============================
# 06) BUILD STAR COUNTS MATRIX
# ============================
# Expect columns: gene_id, unstranded, stranded_first, stranded_second, etc.

# file_id → submitter_id mapping
id_to_sample = dict(zip(files_df["file_id"], files_df["submitter_id"]))
print("Number of file_id → sample mappings:", len(id_to_sample))

all_files = []
for root, dirs, files in os.walk(RNA_DIR):
    for f in files:
        if f.endswith(".tsv") or f.endswith(".txt"):
            all_files.append(os.path.join(root, f))

print("Number of expression files found:", len(all_files))

gene_index = None
matrix = {}

for fpath in tqdm(all_files):
    # Folder name is the file_id
    file_id = os.path.basename(os.path.dirname(fpath))
    sample = id_to_sample.get(file_id, None)
    if sample is None:
        continue

    df = pd.read_csv(fpath, sep="\t")

    if "gene_id" not in df.columns:
        raise ValueError(f"'gene_id' column not found in {fpath}. Got {df.columns.tolist()}")

    # pick main count column
    if "unstranded" in df.columns:
        counts_col = "unstranded"
    else:
        numeric_cols = [c for c in df.columns if c != "gene_id"]
        if len(numeric_cols) == 0:
            raise ValueError(f"No numeric count columns found in {fpath}.")
        counts_col = numeric_cols[0]

    # Drop summary rows like '__no_feature', '__ambiguous', etc.
    df = df[~df["gene_id"].str.startswith("__")].reset_index(drop=True)

    if gene_index is None:
        gene_index = df["gene_id"].values
    else:
        if not np.array_equal(gene_index, df["gene_id"].values):
            raise ValueError(f"Gene order mismatch in file: {fpath}")

    matrix[sample] = df[counts_col].values

expr_counts = pd.DataFrame(matrix, index=gene_index)
print("Counts matrix shape (genes x samples):", expr_counts.shape)
expr_counts.iloc[:5, :5]
