In [18]:
import os, re, pandas as pd
import archs4py as a4

# -------------------------------------------------------------
# CONFIGURATION
# -------------------------------------------------------------
file = "./data/archs4/human_gene_v2.5.h5"
output_dir = "./data/archs4/"
os.makedirs(output_dir, exist_ok=True)

# Define search pattern: all cancer-related samples
pattern = "cancer|tumor|carcinoma|leukemia|lymphoma|melanoma|glioma"

# -------------------------------------------------------------
# 1Ô∏è‚É£ LOAD METADATA
# -------------------------------------------------------------
print("üìÑ Loading metadata from ARCHS4...")

meta = a4.meta.meta(
    file,
    pattern,
    meta_fields=["geo_accession", "characteristics_ch1", "source_name_ch1"]
)

print(f"‚úÖ Retrieved {len(meta):,} total samples matching cancer-related keywords.")

# Create a text field to help detect cancer type keywords
meta["text"] = (meta["characteristics_ch1"].fillna("") + " " +
                meta["source_name_ch1"].fillna("")).str.lower()

# Quick preview
meta.head()


üìÑ Loading metadata from ARCHS4...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:08<00:00,  2.77s/it]


‚úÖ Retrieved 179,216 total samples matching cancer-related keywords.


Unnamed: 0,geo_accession,characteristics_ch1,source_name_ch1,text
GSM1013480,GSM1013480,"cell line: HL60,rna fraction: Total RNA",Myeloid Leukemia,"cell line: hl60,rna fraction: total rna myeloi..."
GSM1013481,GSM1013481,"cell line: HL60,rna fraction: Non-polyadenylat...",Myeloid Leukemia,"cell line: hl60,rna fraction: non-polyadenylat..."
GSM1019735,GSM1019735,"cell line: HT29,cell type: colon cancer,treatm...",HT29 treated with 0 √é¬ºM of 5-Aza,"cell line: ht29,cell type: colon cancer,treatm..."
GSM1019736,GSM1019736,"cell line: HT29,cell type: colon cancer,treatm...",HT29 treated with 0 √é¬ºM of 5-Aza,"cell line: ht29,cell type: colon cancer,treatm..."
GSM1019737,GSM1019737,"cell line: HT29,cell type: colon cancer,treatm...",HT29 treated with 0 √é¬ºM of 5-Aza,"cell line: ht29,cell type: colon cancer,treatm..."


In [19]:
# -------------------------------------------------------------
# 2Ô∏è‚É£ ASSIGN CANCER TYPE LABELS
# -------------------------------------------------------------
print("üè∑Ô∏è  Assigning cancer type labels...")

# Define regex patterns for each cancer type
# -------------------------------------------------------------
# üß¨ Unified TCGA-style regex map (cleaned + consolidated)
# -------------------------------------------------------------
tcga_map = {
    # Adrenal
    "ACC":   r"adrenal",

    # Bladder
    "BLCA":  r"bladder|urothelial",

    # Breast
    "BRCA":  r"breast|mda[-]?mb|sum149|ductal",

    # Cervix
    "CESC":  r"cervix|cervical",

    # Bile duct / cholangiocarcinoma
    "CHOL":  r"bile|cholangio",

    # Colon / rectum
    "COAD":  r"colon|colorectal|sigmoid|ht29",
    "READ":  r"rectum|rectal",

    # Lymphoid malignancies
    "DLBC":  r"lymphoma|dlbcl|b[- ]cell|t[- ]cell|cll|plasma cell|myeloma",

    # Esophagus
    "ESCA":  r"esophagus|esophageal",

    # Brain / glioma
    "GBM":   r"glioblastoma|gbm|g477",
    "LGG":   r"glioma|astrocyt|oligodendro|meningioma|brain tumor",

    # Head & neck
    "HNSC":  r"head|neck|oral|tongue|pharynx",

    # Kidney
    "KICH":  r"chromophobe",
    "KIRC":  r"clear cell",
    "KIRP":  r"papillary kidney",

    # Leukemia
    "LAML":  r"leukemia|acute myeloid|aml|cml|k562|bcr[-]?abl|hl60|mll",

    # Liver
    "LIHC":  r"liver|hcc|hepatocellular|liver tumor",

    # Lung
    "LUAD":  r"lung|a549|h1299|h1975|nsclc|adenocarcinoma lung|luad",
    "LUSC":  r"squamous lung|lusc|squamous cell carcinoma|epidermoid|a431",

    # Mesothelioma
    "MESO":  r"mesothelioma",

    # Ovary
    "OV":    r"ovary|ovarian|hgsc|aocs1|serous carcinoma",

    # Pancreas
    "PAAD":  r"pancreas|pancreatic",

    # Pheochromocytoma / Paraganglioma
    "PCPG":  r"pheochromocytoma|paraganglioma",

    # Prostate
    "PRAD":  r"prostate",

    # Sarcoma
    "SARC":  r"sarcoma|rhabdoid|schwannoma|fibro|leiomyo",

    # Skin / Melanoma
    "SKCM":  r"melanoma|skin",

    # Stomach
    "STAD":  r"stomach|gastric",

    # Testis / Germ cell
    "TGCT":  r"testicular|germ cell",

    # Thyroid
    "THCA":  r"thyroid",

    # Thymus
    "THYM":  r"thymus|thymoma",

    # Uterine / Endometrial
    "UCEC":  r"endometrial|uterine corpus",
    "UCS":   r"uterine carcinosarcoma",

    # Uveal / Ocular
    "UVM":   r"uveal|ocular|eye",
}

def assign_tcga_type(text: str) -> str:
    text = str(text).lower()
    for code, pattern in tcga_map.items():
        if re.search(pattern, text):
            return code
    return "UNKNOWN"

meta["tcga_label"] = meta["text"].apply(assign_tcga_type)

# Drop unknowns
meta = meta[meta["tcga_label"] != "UNKNOWN"].reset_index(drop=True)

print(meta["tcga_label"].value_counts())


meta.head()

meta.to_csv(os.path.join(output_dir, "archs4_cancer_metadata_labeled.csv"), index=False)
print(f"üß¨ Saved labeled metadata: {meta.shape}")



üè∑Ô∏è  Assigning cancer type labels...
tcga_label
BRCA    32327
DLBC    16410
COAD    15022
LAML    14090
LUAD    13356
PRAD    10994
SKCM    10139
LIHC     7335
LGG      5871
HNSC     5459
OV       4535
GBM      4354
PAAD     3671
STAD     2691
BLCA     2334
CESC     1956
KIRC     1507
SARC     1493
ESCA     1481
THCA     1246
CHOL      719
LUSC      534
READ      533
UCEC      425
TGCT      138
ACC        54
MESO       20
UVM        18
KICH       13
THYM       10
PCPG        2
Name: count, dtype: int64
üß¨ Saved labeled metadata: (158737, 5)


In [None]:
meta.head()

Unnamed: 0,geo_accession,characteristics_ch1,source_name_ch1,text,tcga_label
0,GSM1013480,"cell line: HL60,rna fraction: Total RNA",Myeloid Leukemia,"cell line: hl60,rna fraction: total rna myeloi...",LAML
1,GSM1013481,"cell line: HL60,rna fraction: Non-polyadenylat...",Myeloid Leukemia,"cell line: hl60,rna fraction: non-polyadenylat...",LAML
2,GSM1019735,"cell line: HT29,cell type: colon cancer,treatm...",HT29 treated with 0 √é¬ºM of 5-Aza,"cell line: ht29,cell type: colon cancer,treatm...",COAD
3,GSM1019736,"cell line: HT29,cell type: colon cancer,treatm...",HT29 treated with 0 √é¬ºM of 5-Aza,"cell line: ht29,cell type: colon cancer,treatm...",COAD
4,GSM1019737,"cell line: HT29,cell type: colon cancer,treatm...",HT29 treated with 0 √é¬ºM of 5-Aza,"cell line: ht29,cell type: colon cancer,treatm...",COAD


In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming you already have:
# meta with columns: ["geo_accession", "text", "tcga_label", ...]
# and you‚Äôve removed UNKNOWNs

print(f"üìä Total labeled samples: {len(meta):,}")
print(meta["tcga_label"].value_counts().head())

# -------------------------------------------------------------
# 1Ô∏è‚É£ TRAIN‚ÄìTEST SPLIT (80/20) stratified by cancer type
# -------------------------------------------------------------
train_df, temp_df = train_test_split(
    meta,
    test_size=0.2,
    stratify=meta["tcga_label"],
    random_state=42
)

# -------------------------------------------------------------
# 2Ô∏è‚É£ VALIDATION‚ÄìTEST SPLIT (from that 20%)
# -------------------------------------------------------------
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,             # 10% val, 10% test overall
    stratify=temp_df["tcga_label"],
    random_state=42
)

# -------------------------------------------------------------
# 3Ô∏è‚É£ VERIFY SPLIT PROPORTIONS
# -------------------------------------------------------------
def summarize_split(df, name):
    counts = df["tcga_label"].value_counts(normalize=True) * 100
    print(f"\n{name} set ({len(df):,} samples)")
    print(counts.round(2).head(10))

summarize_split(train_df, "TRAIN")
summarize_split(val_df, "VAL")
summarize_split(test_df, "TEST")

# -------------------------------------------------------------
# 4Ô∏è‚É£ SAVE SPLITS FOR LATER USE
# -------------------------------------------------------------
output_dir = "./data/archs4/splits"
os.makedirs(output_dir, exist_ok=True)

train_df.to_csv(f"{output_dir}/train_metadata.csv", index=False)
val_df.to_csv(f"{output_dir}/val_metadata.csv", index=False)
test_df.to_csv(f"{output_dir}/test_metadata.csv", index=False)

print("\n‚úÖ Saved stratified train/val/test metadata splits!")


üìä Total labeled samples: 158,737
tcga_label
BRCA    32327
DLBC    16410
COAD    15022
LAML    14090
LUAD    13356
Name: count, dtype: int64

TRAIN set (126,989 samples)
tcga_label
BRCA    20.37
DLBC    10.34
COAD     9.46
LAML     8.88
LUAD     8.41
PRAD     6.93
SKCM     6.39
LIHC     4.62
LGG      3.70
HNSC     3.44
Name: proportion, dtype: float64

VAL set (15,874 samples)
tcga_label
BRCA    20.36
DLBC    10.34
COAD     9.46
LAML     8.88
LUAD     8.42
PRAD     6.93
SKCM     6.39
LIHC     4.62
LGG      3.70
HNSC     3.44
Name: proportion, dtype: float64

TEST set (15,874 samples)
tcga_label
BRCA    20.37
DLBC    10.34
COAD     9.46
LAML     8.88
LUAD     8.41
PRAD     6.92
SKCM     6.39
LIHC     4.62
LGG      3.70
HNSC     3.44
Name: proportion, dtype: float64

‚úÖ Saved stratified train/val/test metadata splits!
