In [3]:
import pandas as pd

# Load the two input tables: her2.csv and magnifications.csv.
her2_df = pd.read_csv('her2.csv')
mag_df = pd.read_csv('magnifications.csv')

# Unique, non-null Sample_ID values found in her2.csv.
her2_sample_ids = her2_df['Sample_ID'].dropna().unique().tolist()
print(f"Loaded {len(her2_sample_ids)} unique sample IDs")

# Unique, non-null Sample_ID values found in magnifications.csv.
mag_sample_ids = mag_df['Sample_ID'].dropna().unique().tolist()
print(f"Loaded {len(mag_sample_ids)} unique sample IDs")

# IDs present in mag_df but missing from her2_df.
# Sorted because plain set iteration order for strings changes between kernel
# restarts (hash randomization), which would make the downstream chunking
# non-reproducible. Assumes all Sample_ID values share one comparable type
# (e.g. all strings) -- TODO confirm against the CSVs.
sample_ids = sorted(set(mag_sample_ids) - set(her2_sample_ids))
print(f"{len(sample_ids)} sample IDs in mag_df but not in her2_df")

Loaded 376 unique sample IDs
Loaded 451 unique sample IDs
177 sample IDs in mag_df but not in her2_df


In [None]:
!wget https://gdc.cancer.gov/files/public/file/gdc-client_v1.6.1_Ubuntu14.04_x64.zip
!unzip gdc-client

In [None]:
# Make the gdc-client file in the working directory executable so that later
# cells can invoke it as ./gdc-client.
!chmod +x gdc-client

In [4]:
import requests
import json
import pandas as pd

def generate_svs_manifest(sample_ids, output_manifest='gdc_manifest.tsv'):
    """
    Generate a GDC manifest file for downloading SVS files corresponding to the provided Sample_IDs.

    The GDC ``/files`` endpoint is queried for files whose ``cases.submitter_id``
    is one of ``sample_ids`` and whose data type / data format are
    "Slide Image" / "SVS". Results are fetched page by page, so a query that
    matches more files than fit in one page is not silently truncated.

    Parameters:
    - sample_ids: Iterable of TCGA Sample_IDs (e.g., ['TCGA-AG-3731', 'TCGA-CK-4947']).
      The values are matched against the ``cases.submitter_id`` field.
    - output_manifest: Filename for the output manifest TSV file

    Returns:
    - None. Writes a tab-separated file with the columns ``id`` and
      ``file_name`` (header only when nothing matched) and prints a summary.

    Raises:
    - requests.HTTPError: if the API responds with an error status.
    - requests.Timeout: if the API does not respond within the timeout.
    """
    # Define the API endpoint
    files_endpt = "https://api.gdc.cancer.gov/files"

    page_size = 1000                              # records requested per API call
    request_timeout = 60                          # seconds; avoids a cell that hangs forever
    record_columns = ['id', 'file_name', 'sample_id']

    # Accept any iterable (set, pandas Series, ...): json.dumps needs a list.
    sample_ids = list(sample_ids)

    # Construct the filters for the API query
    filters = {
        "op": "and",
        "content": [
            {
                "op": "in",
                "content": {
                    "field": "cases.submitter_id",
                    "value": sample_ids
                }
            },
            {
                "op": "=",
                "content": {
                    "field": "files.data_type",
                    "value": "Slide Image"
                }
            },
            {
                "op": "=",
                "content": {
                    "field": "files.data_format",
                    "value": "SVS"
                }
            }
        ]
    }
    encoded_filters = json.dumps(filters)

    records = []
    offset = 0
    # With no IDs there is nothing to ask the API for; go straight to writing
    # an empty (header-only) manifest.
    more_pages = bool(sample_ids)
    while more_pages:
        # Define the parameters for the API request ("from" is the 0-based
        # offset of the first record of this page).
        params = {
            "filters": encoded_filters,
            "fields": "file_id,file_name,cases.submitter_id",
            "format": "JSON",
            "size": str(page_size),
            "from": str(offset)
        }

        # Make the API request
        response = requests.get(files_endpt, params=params, timeout=request_timeout)
        response.raise_for_status()
        payload = response.json()['data']
        hits = payload['hits']

        # Extract the relevant information
        for file_entry in hits:
            records.append({
                'id': file_entry['file_id'],
                'file_name': file_entry['file_name'],
                'sample_id': file_entry['cases'][0]['submitter_id']
            })

        offset += len(hits)
        # If the response carries no pagination block, treat this page as the last.
        total = payload.get('pagination', {}).get('total', offset)
        # An empty page also ends the loop so a misreported total cannot spin forever.
        more_pages = bool(hits) and offset < total

    # Create a DataFrame and save as TSV. Passing the columns explicitly keeps
    # the column selection valid even when no file matched.
    df = pd.DataFrame(records, columns=record_columns)
    df[['id', 'file_name']].to_csv(output_manifest, sep='\t', index=False)

    print(f"Manifest file '{output_manifest}' created with {len(df)} entries.")


In [5]:
from time import sleep

# Query the GDC API in batches of at most `chunk_size` sample IDs and write
# one manifest file per batch (diff_manifest_part_1.tsv, _2.tsv, ...).
chunk_size = 200
chunk_starts = range(0, len(sample_ids), chunk_size)

for part_no, start in enumerate(chunk_starts, start=1):
    batch = sample_ids[start:start + chunk_size]
    manifest_path = f'diff_manifest_part_{part_no}.tsv'

    print(f"Processing chunk {part_no} with {len(batch)} IDs")
    generate_svs_manifest(batch, output_manifest=manifest_path)

    # Short pause after every API call to go easy on the service
    sleep(1)

Processing chunk 1 with 177 IDs
Manifest file 'her2_manifest_part_1.tsv' created with 620 entries.


In [34]:
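# NOTE: this cell is disabled and kept for reference only. It merged the
# per-chunk her2_manifest_part_*.tsv files into a single de-duplicated
# manifest. The output shown under this cell presumably stems from a run made
# before the lines were commented out. If re-enabled, be aware that the
# printed count is taken before duplicates are dropped.
# from glob import glob

# parts = sorted(glob("her2_manifest_part_*.tsv"))
# df_all = pd.concat([pd.read_csv(p, sep='\t') for p in parts])
# df_all.drop_duplicates(subset='id').to_csv("her2_manifest_combined.tsv", sep='\t', index=False)

# print(f"Final combined manifest has {len(df_all)} entries")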

Final combined manifest has 1085 entries


In [6]:
# Reduce the manifest to DX1 slides only
# (original note: remove non-FFPE, i.e. files without DX1).

import pandas as pd

# 1. Read the unfiltered manifest.
#    NOTE: only part 1 is read. If the chunked query above ever writes more
#    than one part (more sample IDs than chunk_size), the remaining parts
#    have to be combined first.
df = pd.read_csv('diff_manifest_part_1.tsv', sep='\t')

# 2. Keep only the DX1 filenames.
#    The slide token is followed by a dot in the file names
#    (e.g. "...-DX1.<uuid>.svs"), so matching the literal "-DX1." keeps DX1
#    while no longer picking up "-DX10", "-DX11", ... as a bare "-DX1" would.
#    regex=False makes the dot literal; na=False drops rows without a file name
#    instead of producing an unusable NaN mask.
df_dx1 = df[df['file_name'].str.contains('-DX1.', case=False, regex=False, na=False)]

# 3. Write out the new manifest
df_dx1.to_csv('diff_manifest_clean.tsv', sep='\t', index=False)
print(f"{len(df_dx1)} DX1 entries written.")

# 4. Then from shell (single-process variant of the parallel download below):
#    ./gdc-client download -m diff_manifest_clean.tsv -d diff_svs

177 DX1 entries written.


In [9]:
# !./gdc-client download -m her2_manifest_clean.tsv -d her2_svs

import pandas as pd
import os

# Load manifest
df = pd.read_csv('diff_manifest_clean.tsv', sep='\t')

# Ensure output directory exists
os.makedirs("manifests", exist_ok=True)

# Generate per-file manifest
for _, row in df.iterrows():
    with open(f"manifests/{row['id']}.tsv", 'w') as f:
        f.write("id\tfile_name\n")
        f.write(f"{row['id']}\t{row['file_name']}\n")

# 10 workers CPUs        
!ls manifests/*.tsv | xargs -P 20 -I {} ./gdc-client download -m {} -d diff_svs

100% [############################################] Time:  0:00:04  14.5 MiB/s 
 61% [##########################                  ] ETA:   0:00:02  26.7 MiB/s [32mSuccessfully downloaded[0m: 1
100% [############################################] Time:  0:00:05  25.0 MiB/s 
100% [############################################] Time:  0:00:05  26.1 MiB/s 
 35% [###############                             ] ETA:   0:00:09  29.1 MiB/s [32mSuccessfully downloaded[0m: 1
100% [############################################] Time:  0:00:05  17.2 MiB/s 
 64% [############################                ] ETA:   0:00:03  25.7 MiB/s [32mSuccessfully downloaded[0m: 1
 24% [##########                                  ] ETA:   0:00:19  39.0 MiB/s [32mSuccessfully downloaded[0m: 1
100% [############################################] Time:  0:00:06  27.7 MiB/s 
100% [############################################] Time:  0:00:06  28.0 MiB/s 
100% [############################################] Time:  0

In [10]:
import os
import shutil

# gdc-client download root and the flat folder the slides are collected in
input_dir = "diff_svs"
output_dir = "diff_svs_flat"

# Create the flat folder if it is missing
os.makedirs(output_dir, exist_ok=True)

# Visit every sub-folder of the download root and move each .svs file into
# the flat folder, named after the part of its file name before the first dot.
for folder_path, _subdirs, file_names in os.walk(input_dir):
    svs_names = [name for name in file_names if name.endswith(".svs")]
    for svs_name in svs_names:
        source_path = os.path.join(folder_path, svs_name)

        # Everything before the first "." (the slide barcode in the file
        # names seen in the output, e.g. TCGA-AA-3994-01Z-00-DX1)
        slide_prefix, _, _ = svs_name.partition('.')

        target_path = os.path.join(output_dir, slide_prefix + ".svs")
        shutil.move(source_path, target_path)

        print(f"moved {source_path} → {target_path}")

moved diff_svs/9b44570d-51dd-4cc4-8ec5-37ede80aca31/TCGA-AA-3994-01Z-00-DX1.ca18c0cb-88b4-4a31-be1f-cca57dfadabc.svs → diff_svs_flat/TCGA-AA-3994-01Z-00-DX1.svs
moved diff_svs/b5d1e455-5c25-4ad1-87b5-b56776d17baf/TCGA-AA-A00W-01Z-00-DX1.24770462-BD63-4881-9AE3-9198E9093AD9.svs → diff_svs_flat/TCGA-AA-A00W-01Z-00-DX1.svs
moved diff_svs/cae38e3c-38b7-40ef-9a59-2a60c5aeb1c0/TCGA-AA-3930-01Z-00-DX1.065c480c-9ac3-4d98-a351-cb320b6a5ba0.svs → diff_svs_flat/TCGA-AA-3930-01Z-00-DX1.svs
moved diff_svs/e45b588f-c7db-4cea-a78c-bfecd1a46da6/TCGA-A6-2671-01Z-00-DX1.13d1a0d9-78cd-4cfc-b670-34a79ebe52ee.svs → diff_svs_flat/TCGA-A6-2671-01Z-00-DX1.svs
moved diff_svs/ac5cf94c-5829-4da1-80be-8d24eb798058/TCGA-AA-3862-01Z-00-DX1.67a0bc0d-1fe0-4c90-bb2d-5b12224cc846.svs → diff_svs_flat/TCGA-AA-3862-01Z-00-DX1.svs
moved diff_svs/fac4903a-786f-4976-a7c4-6fce3f27e83f/TCGA-AA-A03F-01Z-00-DX1.8E1A83FE-2C85-4444-A8FC-D0691817968A.svs → diff_svs_flat/TCGA-AA-A03F-01Z-00-DX1.svs
moved diff_svs/e95e537b-3824-421d-

In [11]:
# Shorten "<a>-<b>-<c>-...DX1.svs" file names in the flat folder to
# "<a>-<b>-<c>.svs" (the first three dash-separated fields). A file is left
# untouched when the shortened name is already taken.
folder = "diff_svs_flat"
for slide_file in os.listdir(folder):
    # Only files still carrying the DX1 suffix are candidates
    if not slide_file.endswith("DX1.svs"):
        continue

    case_id = "-".join(slide_file.split('-')[:3])
    target_path = os.path.join(folder, f"{case_id}.svs")

    # Never overwrite an existing file with the shortened name
    if os.path.exists(target_path):
        print(f"⚠️  Skipped {slide_file} — already renamed")
        continue

    os.rename(os.path.join(folder, slide_file), target_path)
    print(f"🔁 Renamed {slide_file} → {case_id}.svs")

🔁 Renamed TCGA-AA-3854-01Z-00-DX1.svs → TCGA-AA-3854.svs
🔁 Renamed TCGA-AA-3534-01Z-00-DX1.svs → TCGA-AA-3534.svs
🔁 Renamed TCGA-AA-3971-01Z-00-DX1.svs → TCGA-AA-3971.svs
🔁 Renamed TCGA-AA-3516-01Z-00-DX1.svs → TCGA-AA-3516.svs
🔁 Renamed TCGA-AA-3555-01Z-00-DX1.svs → TCGA-AA-3555.svs
🔁 Renamed TCGA-AA-A00N-01Z-00-DX1.svs → TCGA-AA-A00N.svs
🔁 Renamed TCGA-AA-A02R-01Z-00-DX1.svs → TCGA-AA-A02R.svs
🔁 Renamed TCGA-AA-A02K-01Z-00-DX1.svs → TCGA-AA-A02K.svs
🔁 Renamed TCGA-AA-A00W-01Z-00-DX1.svs → TCGA-AA-A00W.svs
🔁 Renamed TCGA-AA-3680-01Z-00-DX1.svs → TCGA-AA-3680.svs
🔁 Renamed TCGA-AA-3667-01Z-00-DX1.svs → TCGA-AA-3667.svs
🔁 Renamed TCGA-AA-3679-01Z-00-DX1.svs → TCGA-AA-3679.svs
🔁 Renamed TCGA-AA-3552-01Z-00-DX1.svs → TCGA-AA-3552.svs
🔁 Renamed TCGA-AA-A01G-01Z-00-DX1.svs → TCGA-AA-A01G.svs
🔁 Renamed TCGA-A6-2681-01Z-00-DX1.svs → TCGA-A6-2681.svs
🔁 Renamed TCGA-A6-2678-01Z-00-DX1.svs → TCGA-A6-2678.svs
🔁 Renamed TCGA-AA-3976-01Z-00-DX1.svs → TCGA-AA-3976.svs
🔁 Renamed TCGA-AA-3968-01Z-00-D