## Look at the h5 files

In [26]:
%pwd

'/orcd/data/edboyden/002/ezh/uni'

In [26]:
import h5py

with h5py.File("virchow_features/TCGA-A6-2672.h5", "r") as f:
    print(list(f.keys()))
    print(f["features"].shape)  # (N_patches, 1536)
    if "KRAS_binary" in f:
        print(f["KRAS_binary"][()]) 

['Lymphovascular invasion indicator', 'coords', 'features']
(3102, 2560)


In [6]:
import h5py
import os
from glob import glob

# Set the directory where the .h5 files are located
h5_folder = "CLAM/TCGA-COAD/"  # 🔁 Replace with your actual path
h5_paths = sorted(glob(os.path.join(h5_folder, "**/*.h5"), recursive=True))

# Track patch counts
patch_counts = []

for path in h5_paths:
    try:
        with h5py.File(path, "r") as f:
            # Try "features" key first
            data = f["features"][:]
            # Squeeze if necessary
            if data.ndim == 3:
                data = data[0]
            patch_counts.append((os.path.basename(path), data.shape[0]))
    except Exception as e:
        patch_counts.append((os.path.basename(path), f"Error: {e}"))

# Convert to DataFrame for display
import pandas as pd
import ace_tools as tools

patch_df = pd.DataFrame(patch_counts, columns=["File", "Num_Patches"])
tools.display_dataframe_to_user(name="Patch Count per Slide", dataframe=patch_df)


KeyboardInterrupt: 

Search for nonzero annotations in all the h5 files

In [10]:
import os
from glob import glob

def has_nonzero_annots(h5_path):
    with h5py.File(h5_path, "r") as f:
        if "annots" in f:
            annots = f["annots"][:]
            return np.any(annots != 0)
    return False

folder = "/UNI2-h_features/TCGA-COAD/"
paths = sorted(glob(os.path.join(folder, "**/*.h5"), recursive=True))

nonzero_annot_slides = [p for p in paths if has_nonzero_annots(p)]

print(f"Found {len(nonzero_annot_slides)} slides with non-zero annotations.")
for path in nonzero_annot_slides:
    print("found", os.path.basename(path))


Found 0 slides with non-zero annotations.


## Look at TSV file downloaded from CBioPortal for COAD-Read

In [19]:
import pandas as pd
import pprint as pp

clinical_path = "all_tcga_datasets.tsv"  # or full path if not in current dir
df = pd.read_csv(clinical_path, sep="\t")

print("Loaded TSV file with shape:", df.shape)
# print("\nColumn names:")
display(df.columns.tolist())

Loaded TSV file with shape: (640, 145)


['Study ID',
 'Patient ID',
 'Sample ID',
 'Diagnosis Age',
 'American Joint Committee on Cancer Publication Version Type',
 'Biopsy Site',
 'Cancer Type',
 'Cancer Type Detailed',
 'Disease Free (Months)',
 'Disease Free Status',
 'Disease Type',
 'Ethnicity Category',
 'Fraction Genome Altered',
 'ICD-10 Classification',
 'Is FFPE',
 'Morphology',
 'Mutation Count',
 'Oncotree Code',
 'Overall Survival (Months)',
 'Overall Survival Status',
 'Other Patient ID',
 'Other Sample ID',
 'AJCC Pathologic M-Stage',
 'AJCC Pathologic N-Stage',
 'AJCC Pathologic Stage',
 'AJCC Pathologic T-Stage',
 'Primary Diagnosis',
 'Patient Primary Tumor Site',
 'Prior Malignancy',
 'Prior Treatment',
 'Project Identifier',
 'Project Name',
 'Project State',
 'Race Category',
 'Number of Samples Per Patient',
 'Sample Type',
 'Sex',
 "Patient's Vital Status",
 'Year of Death',
 'Year of Diagnosis',
 'American Joint Committee on Cancer Metastasis Stage Code',
 'Neoplasm Disease Lymph Node Stage American J

In [26]:
display(df['KRAS Mutation'])

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
      ... 
635    NaN
636    NaN
637    NaN
638    NaN
639    YES
Name: KRAS Mutation, Length: 640, dtype: object

In [27]:
# look at KRAS mutation column
if 'KRAS Mutation' in df.columns:
    print("\nKRAS Mutation Value Counts:")
    print(df['KRAS Mutation'].value_counts(dropna=False))
else:
    print("\n'KRAS Mutation' column not found in file.")


KRAS Mutation Value Counts:
NaN    623
YES     10
NO       7
Name: KRAS Mutation, dtype: int64


## Look at data that might contain MMR status

In [7]:
import pandas as pd

# Step 1: Load the wide format TSV file
mmr_path = "Human__TCGA_COADREAD__MS__Clinical__Clinical__01_28_2016__BI__Clinical__Firehose.tsi.txt"
mmr_df = pd.read_csv(mmr_path, sep="\t", index_col=0)

# Step 2: Transpose so each row is a sample (patient), each column an attribute
mmr_df = mmr_df.T
mmr_df.index.name = "Sample_ID"

# Step 3: Preview
display(mmr_df.shape)
display(mmr_df.columns.tolist())  # column names like 'Mutation_Phenotype'
display(mmr_df.head(20))  # first 3 samples


(629, 17)

['years_to_birth',
 'Tumor_purity',
 'pathologic_stage',
 'pathology_T_stage',
 'pathology_N_stage',
 'pathology_M_stage',
 'histological_type',
 'number_of_lymph_nodes',
 'Proteomic.subtype',
 'gender',
 'radiation_therapy',
 'residual_tumor',
 'ethnicity',
 'MSI_phenotype',
 'overall_survival',
 'status',
 'overallsurvival']

attrib_name,years_to_birth,Tumor_purity,pathologic_stage,pathology_T_stage,pathology_N_stage,pathology_M_stage,histological_type,number_of_lymph_nodes,Proteomic.subtype,gender,radiation_therapy,residual_tumor,ethnicity,MSI_phenotype,overall_survival,status,overallsurvival
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
TCGA.A6.2677,68,0.8869,stageiii,t3,n2,m0,colonadenocarcinoma,5,,female,no,r0,nothispanicorlatino,MSS,740.0,1.0,7401
TCGA.A6.2681,73,0.6939,stageii,t3,n0,m0,colonadenocarcinoma,0,,female,no,r0,nothispanicorlatino,MSS,1387.0,0.0,13870
TCGA.A6.2682,70,0.7927,stageiv,t4,n1,m1,colonadenocarcinoma,2,,male,no,r0,nothispanicorlatino,MSS,424.0,1.0,4241
TCGA.A6.4105,79,0.7533,stageii,t3,n0,m0,colonadenocarcinoma,0,,male,no,,nothispanicorlatino,MSS,442.0,1.0,4421
TCGA.A6.6649,66,0.7857,stageiii,t3,n1,m0,colonadenocarcinoma,2,,male,no,,nothispanicorlatino,MSS,735.0,0.0,7350
TCGA.AA.3496,83,0.7004,stageii,t3,n0,m0,colonadenocarcinoma,0,,female,,r0,,MSS,31.0,0.0,310
TCGA.AA.3506,77,0.7979,stagei,t2,n0,m0,colonadenocarcinoma,0,,male,no,r0,,MSS,1765.0,0.0,17650
TCGA.AA.3516,74,0.6843,stageiii,t3,n2,m0,colonmucinousadenocarcinoma,7,,female,no,,,MSI-H,396.0,1.0,3961
TCGA.AA.3520,86,0.7372,stageii,t3,n0,m0,colonadenocarcinoma,0,,female,no,r0,,MSS,731.0,0.0,7310
TCGA.AA.3521,87,0.8137,stageii,t3,n0,m0,colonadenocarcinoma,0,,male,,r0,,MSS,,,"NA,NA"


In [31]:
print(mmr_df["MSI_phenotype"].value_counts(dropna=False))

# Map MSI-H to 1 (dMMR), MSS to 0 (pMMR)
mmr_df["MMR_status"] = mmr_df["MSI_phenotype"].map({
    "MSI-H": 1,          # dMMR
    "MSS": 0,            # pMMR
})


# Drop any samples with NA
mmr_df = mmr_df.dropna(subset=["MMR_status"])

# Check label distribution
print(mmr_df["MMR_status"].value_counts())



# display(mmr_df["MMR_status"])

MSS      537
MSI-H     89
Name: MSI_phenotype, dtype: int64
0    537
1     89
Name: MMR_status, dtype: int64


## Match each h5 file to label

In [14]:
import pandas as pd

# Load the file
mutation_df = pd.read_csv('lymphovascular_invasion.csv')

display(mutation_df.head(10))

Unnamed: 0,Sample_ID,Lymphovascular invasion indicator
0,TCGA-3L-AA1B,0.0
1,TCGA-4N-A93T,0.0
2,TCGA-4T-AA8H,0.0
3,TCGA-5M-AAT4,
4,TCGA-5M-AAT5,
5,TCGA-5M-AAT6,1.0
6,TCGA-5M-AATA,
7,TCGA-5M-AATE,0.0
8,TCGA-A6-2670,0.0
9,TCGA-A6-2671,1.0


In [15]:
import os
import pandas as pd
from glob import glob

In [20]:
h5_folder = "virchow_features"
mutation = "Lymphovascular invasion indicator"
h5_paths = sorted(glob(os.path.join(h5_folder, "*.h5")))
print(f"\nFound {len(h5_paths)} .h5 files")


Found 493 .h5 files


In [17]:
# Extract TCGA ID from each .h5 filename
def extract_tcga_id(path):
    basename = os.path.basename(path)
    return "-".join(basename.split("-")[:3])  # TCGA-XX-YYYY

h5_df = pd.DataFrame({
    "h5_path": h5_paths,
    "Sample_ID": [extract_tcga_id(p) for p in h5_paths]
})

h5_df['Sample_ID'] = h5_df['Sample_ID'].str.replace('.h5', '')

print("\nFirst 5 extracted TCGA IDs:")
print(h5_df.head())


First 5 extracted TCGA IDs:
                            h5_path     Sample_ID
0  virchow_features/TCGA-3L-AA1B.h5  TCGA-3L-AA1B
1  virchow_features/TCGA-4N-A93T.h5  TCGA-4N-A93T
2  virchow_features/TCGA-4T-AA8H.h5  TCGA-4T-AA8H
3  virchow_features/TCGA-5M-AAT4.h5  TCGA-5M-AAT4
4  virchow_features/TCGA-5M-AAT5.h5  TCGA-5M-AAT5


  h5_df['Sample_ID'] = h5_df['Sample_ID'].str.replace('.h5', '')


In [18]:
print("\nSample_ID format in mutation_df:")
print(mutation_df['Sample_ID'].head())

print("\nSample_ID format in h5_df:")
print(h5_df['Sample_ID'].head())


Sample_ID format in mutation_df:
0    TCGA-3L-AA1B
1    TCGA-4N-A93T
2    TCGA-4T-AA8H
3    TCGA-5M-AAT4
4    TCGA-5M-AAT5
Name: Sample_ID, dtype: object

Sample_ID format in h5_df:
0    TCGA-3L-AA1B
1    TCGA-4N-A93T
2    TCGA-4T-AA8H
3    TCGA-5M-AAT4
4    TCGA-5M-AAT5
Name: Sample_ID, dtype: object


In [21]:
merged_df = h5_df.merge(mutation_df, on="Sample_ID", how="left")

display(merged_df.head(10))

# Separate matched/unmatched again
matched = merged_df.dropna(subset=[mutation])
unmatched = merged_df[merged_df[mutation].isna()]

# Print results
print(f"\nMatched: {len(matched)}")
print(f"Unmatched: {len(unmatched)}")
print("\nMatched sample preview:")
print(matched[["Sample_ID", mutation]].head(5))

Unnamed: 0,h5_path,Sample_ID,Lymphovascular invasion indicator
0,virchow_features/TCGA-3L-AA1B.h5,TCGA-3L-AA1B,0.0
1,virchow_features/TCGA-4N-A93T.h5,TCGA-4N-A93T,0.0
2,virchow_features/TCGA-4T-AA8H.h5,TCGA-4T-AA8H,0.0
3,virchow_features/TCGA-5M-AAT4.h5,TCGA-5M-AAT4,
4,virchow_features/TCGA-5M-AAT5.h5,TCGA-5M-AAT5,
5,virchow_features/TCGA-5M-AAT6.h5,TCGA-5M-AAT6,1.0
6,virchow_features/TCGA-5M-AATE.h5,TCGA-5M-AATE,0.0
7,virchow_features/TCGA-A6-2671.h5,TCGA-A6-2671,1.0
8,virchow_features/TCGA-A6-2672.h5,TCGA-A6-2672,1.0
9,virchow_features/TCGA-A6-2674.h5,TCGA-A6-2674,0.0



Matched: 456
Unmatched: 37

Matched sample preview:
      Sample_ID  Lymphovascular invasion indicator
0  TCGA-3L-AA1B                                0.0
1  TCGA-4N-A93T                                0.0
2  TCGA-4T-AA8H                                0.0
5  TCGA-5M-AAT6                                1.0
6  TCGA-5M-AATE                                0.0


In [22]:
print(matched[mutation].value_counts())

0.0    281
1.0    175
Name: Lymphovascular invasion indicator, dtype: int64


### Save the mutation status column in the h5 file

In [23]:
import h5py

# Loop over matched rows
for idx, row in matched.iterrows():
    h5_path = row["h5_path"]
    label = int(row[mutation])  # make sure it's a simple 0 or 1
    
    try:
        with h5py.File(h5_path, "a") as f:  # 'a' = append mode
            if mutation in f:
                print(f"Overwriting existing {mutation} in {os.path.basename(h5_path)}")
                del f[mutation]
            else:
                print(f"Writing {mutation} to {os.path.basename(h5_path)}")
            
            f.create_dataset(mutation, data=label)
    
    except Exception as e:
        print(f"Error writing to {h5_path}: {e}")

Writing Lymphovascular invasion indicator to TCGA-3L-AA1B.h5
Writing Lymphovascular invasion indicator to TCGA-4N-A93T.h5
Writing Lymphovascular invasion indicator to TCGA-4T-AA8H.h5
Writing Lymphovascular invasion indicator to TCGA-5M-AAT6.h5
Writing Lymphovascular invasion indicator to TCGA-5M-AATE.h5
Writing Lymphovascular invasion indicator to TCGA-A6-2671.h5
Writing Lymphovascular invasion indicator to TCGA-A6-2672.h5
Writing Lymphovascular invasion indicator to TCGA-A6-2674.h5
Writing Lymphovascular invasion indicator to TCGA-A6-2675.h5
Writing Lymphovascular invasion indicator to TCGA-A6-2676.h5
Writing Lymphovascular invasion indicator to TCGA-A6-2677.h5
Writing Lymphovascular invasion indicator to TCGA-A6-2678.h5
Writing Lymphovascular invasion indicator to TCGA-A6-2679.h5
Writing Lymphovascular invasion indicator to TCGA-A6-2680.h5
Writing Lymphovascular invasion indicator to TCGA-A6-2681.h5
Writing Lymphovascular invasion indicator to TCGA-A6-2682.h5
Writing Lymphovascular i

In [24]:
print(matched.keys())
display(matched)

Index(['h5_path', 'Sample_ID', 'Lymphovascular invasion indicator'], dtype='object')


Unnamed: 0,h5_path,Sample_ID,Lymphovascular invasion indicator
0,virchow_features/TCGA-3L-AA1B.h5,TCGA-3L-AA1B,0.0
1,virchow_features/TCGA-4N-A93T.h5,TCGA-4N-A93T,0.0
2,virchow_features/TCGA-4T-AA8H.h5,TCGA-4T-AA8H,0.0
5,virchow_features/TCGA-5M-AAT6.h5,TCGA-5M-AAT6,1.0
6,virchow_features/TCGA-5M-AATE.h5,TCGA-5M-AATE,0.0
...,...,...,...
487,virchow_features/TCGA-QG-A5Z1.h5,TCGA-QG-A5Z1,1.0
488,virchow_features/TCGA-QG-A5Z2.h5,TCGA-QG-A5Z2,0.0
490,virchow_features/TCGA-SS-A7HO.h5,TCGA-SS-A7HO,1.0
491,virchow_features/TCGA-T9-A92H.h5,TCGA-T9-A92H,0.0


In [25]:
import h5py
import os
from glob import glob
import pandas as pd

h5_paths = sorted(glob(os.path.join(h5_folder, "*.h5")))

records = []

for path in h5_paths:
    filename = os.path.basename(path)
    sample_id = "-".join(filename.split("-")[:3])  # TCGA-XX-YYYY
    
    try:
        with h5py.File(path, "r") as f:
            if mutation in f:
                mmr = f[mutation][()]
                print(f"{sample_id}, {mmr}")
                records.append((sample_id, mmr, filename))
            else:
                print(f"{sample_id} missing mutation")
                records.append((sample_id, None, filename))
    except Exception as e:
        print(f"Error reading {filename}: {e}")
        records.append((sample_id, None, filename))

mutation_read_df = pd.DataFrame(records, columns=["Sample_ID", mutation, "Filename"])
display(mutation_read_df)

TCGA-3L-AA1B.h5, 0
TCGA-4N-A93T.h5, 0
TCGA-4T-AA8H.h5, 0
TCGA-5M-AAT4.h5 missing mutation
TCGA-5M-AAT5.h5 missing mutation
TCGA-5M-AAT6.h5, 1
TCGA-5M-AATE.h5, 0
TCGA-A6-2671.h5, 1
TCGA-A6-2672.h5, 1
TCGA-A6-2674.h5, 0
TCGA-A6-2675.h5, 0
TCGA-A6-2676.h5, 0
TCGA-A6-2677.h5, 1
TCGA-A6-2678.h5, 1
TCGA-A6-2679.h5, 0
TCGA-A6-2680.h5, 0
TCGA-A6-2681.h5, 0
TCGA-A6-2682.h5, 1
TCGA-A6-2683.h5, 0
TCGA-A6-2684.h5, 0
TCGA-A6-2685.h5, 0
TCGA-A6-2686.h5, 0
TCGA-A6-3807.h5, 1
TCGA-A6-3808.h5, 0
TCGA-A6-3809.h5, 0
TCGA-A6-3810.h5, 0
TCGA-A6-4105.h5, 0
TCGA-A6-4107.h5, 1
TCGA-A6-5656.h5, 0
TCGA-A6-5657.h5, 1
TCGA-A6-5659.h5, 0
TCGA-A6-5660.h5, 1
TCGA-A6-5661.h5, 0
TCGA-A6-5662.h5, 1
TCGA-A6-5664.h5, 1
TCGA-A6-5665.h5, 1
TCGA-A6-5666.h5, 0
TCGA-A6-5667.h5, 1
TCGA-A6-6137.h5, 0
TCGA-A6-6138.h5, 0
TCGA-A6-6140.h5, 0
TCGA-A6-6141.h5, 0
TCGA-A6-6142.h5, 1
TCGA-A6-6648.h5, 0
TCGA-A6-6649.h5, 1
TCGA-A6-6650.h5, 0
TCGA-A6-6651.h5, 1
TCGA-A6-6652.h5, 0
TCGA-A6-6653.h5, 0
TCGA-A6-6654.h5, 1
TCGA-A6-A565.h5, 1
TCG

Unnamed: 0,Sample_ID,Lymphovascular invasion indicator,Filename
0,TCGA-3L-AA1B.h5,0.0,TCGA-3L-AA1B.h5
1,TCGA-4N-A93T.h5,0.0,TCGA-4N-A93T.h5
2,TCGA-4T-AA8H.h5,0.0,TCGA-4T-AA8H.h5
3,TCGA-5M-AAT4.h5,,TCGA-5M-AAT4.h5
4,TCGA-5M-AAT5.h5,,TCGA-5M-AAT5.h5
...,...,...,...
488,TCGA-QG-A5Z2.h5,0.0,TCGA-QG-A5Z2.h5
489,TCGA-QL-A97D.h5,,TCGA-QL-A97D.h5
490,TCGA-SS-A7HO.h5,1.0,TCGA-SS-A7HO.h5
491,TCGA-T9-A92H.h5,0.0,TCGA-T9-A92H.h5
