In [1]:
import pandas as pd

# Read the two input tables; both are expected to carry a Sample_ID column.
mmr_df = pd.read_csv("msi_status.csv")   # expected columns: Sample_ID, mmr_status
braf_df = pd.read_csv("braf_clean.csv")  # expected columns: Sample_ID, BRAF

# Build the combined table in one pass:
#   1. inner-join on Sample_ID, so only IDs present in both tables survive;
#   2. keep exactly three columns, in a fixed order;
#   3. append a 0/1 flag that is 1 only where mmr_status and BRAF both equal 1.
merged = (
    mmr_df
    .merge(braf_df, how="inner", on="Sample_ID")
    .loc[:, ["Sample_ID", "mmr_status", "BRAF"]]
    .assign(
        mmr_braf_positive=lambda tbl: (
            tbl["mmr_status"].eq(1) & tbl["BRAF"].eq(1)
        ).astype(int)
    )
)

# Write the result without the positional index column.
merged.to_csv("mmr_braf_combined.csv", index=False)


In [11]:
import pandas as pd
import os

# Load the HER2 table and collect its sample IDs as strings.
# Blank Sample_ID cells are dropped before the string conversion: astype(str)
# would otherwise turn them into the literal "nan", which would then be counted
# and written out as a missing sample. Surrounding whitespace is stripped so a
# padded ID in the CSV still matches its file name.
her2_df = pd.read_csv("her2.csv")
her2_ids = set(her2_df["Sample_ID"].dropna().astype(str).str.strip())

# Directory containing files or folders (e.g., tile folders or .h5 files)
folder_path = "virchow_features"

# An entry's ID is its name with only the final extension removed
# ("a.b.h5" -> "a.b"; a name without a dot is kept as-is).
# NOTE(review): if entry names carry more than the bare Sample_ID (a slide
# suffix, a second extension, ...) they will not match -- confirm against the
# actual directory listing before trusting the "missing" count.
existing_ids = {os.path.splitext(name)[0] for name in os.listdir(folder_path)}

# Compare: IDs with a matching entry vs. IDs with none.
found = her2_ids & existing_ids
missing = her2_ids - existing_ids

print(f"✅ Found: {len(found)}")
print(f"❌ Missing: {len(missing)}")

# Save the missing IDs, one per line, in sorted order. The encoding is pinned
# so the file does not depend on the platform's default text encoding.
with open("missing_sample_ids.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{sid}\n" for sid in sorted(missing))


✅ Found: 207
❌ Missing: 169
