In [31]:
import pandas as pd

# Read the combined HER2 table from disk
df = pd.read_csv('her2.csv')

# Collect the distinct, non-missing Sample_ID values as a plain Python list
sample_id_column = df['Sample_ID']
sample_ids = sample_id_column.dropna().unique().tolist()
print(f"Loaded {len(sample_ids)} unique sample IDs")

Loaded 376 unique sample IDs


In [None]:
# Fetch the GDC Data Transfer Tool archive (-nc: do not download again if the zip is already here)
!wget -nc https://gdc.cancer.gov/files/public/file/gdc-client_v1.6.1_Ubuntu14.04_x64.zip
# Unpack using the archive's real file name; -o overwrites without prompting so a re-run cannot hang
!unzip -o gdc-client_v1.6.1_Ubuntu14.04_x64.zip

In [None]:
# Give the gdc-client file in the working directory execute permission so it can be run as ./gdc-client
!chmod +x gdc-client

In [28]:
import requests
import json
import pandas as pd

def generate_svs_manifest(sample_ids, output_manifest='gdc_manifest.tsv'):
    """
    Generate a GDC manifest file for downloading SVS files corresponding to the provided Sample_IDs.

    The GDC ``files`` endpoint is queried for entries whose case submitter ID is
    in ``sample_ids`` and whose data type / format are "Slide Image" / "SVS".
    Results are fetched page by page so that more matches than one page holds
    are not silently dropped, and the ``id`` / ``file_name`` columns are written
    to a tab-separated file.

    Parameters:
    - sample_ids: List of TCGA Sample_IDs (e.g., ['TCGA-AG-3731', 'TCGA-CK-4947'])
    - output_manifest: Filename for the output manifest TSV file

    Returns:
    - None. The manifest is written to ``output_manifest`` and a summary line is printed.

    Raises:
    - requests.HTTPError: if the API answers with an error status.
    - requests.Timeout: if a request does not complete within the timeout.
    """
    # Define the API endpoint
    files_endpt = "https://api.gdc.cancer.gov/files"

    # Number of hits requested per API call
    page_size = 1000

    # Construct the filters for the API query
    filters = {
        "op": "and",
        "content": [
            {
                "op": "in",
                "content": {
                    "field": "cases.submitter_id",
                    "value": sample_ids
                }
            },
            {
                "op": "=",
                "content": {
                    "field": "files.data_type",
                    "value": "Slide Image"
                }
            },
            {
                "op": "=",
                "content": {
                    "field": "files.data_format",
                    "value": "SVS"
                }
            }
        ]
    }
    filters_json = json.dumps(filters)

    # Page through the result set, collecting the relevant information
    records = []
    offset = 0
    while True:
        params = {
            "filters": filters_json,
            "fields": "file_id,file_name,cases.submitter_id",
            "format": "JSON",
            "size": str(page_size),
            "from": str(offset)
        }

        # A timeout keeps a stalled connection from blocking the notebook forever
        response = requests.get(files_endpt, params=params, timeout=120)
        response.raise_for_status()
        payload = response.json()['data']

        hits = payload['hits']
        for file_entry in hits:
            records.append({
                'id': file_entry['file_id'],
                'file_name': file_entry['file_name'],
                'sample_id': file_entry['cases'][0]['submitter_id']
            })

        offset += len(hits)
        # If the response carries no pagination total, treat this page as the last one
        total = payload.get('pagination', {}).get('total', offset)
        if not hits or offset >= total:
            break

    # Create a DataFrame and save as TSV; fixing the columns up front keeps the
    # column selection valid even when the query returned no hits
    df = pd.DataFrame(records, columns=['id', 'file_name', 'sample_id'])
    df[['id', 'file_name']].to_csv(output_manifest, sep='\t', index=False)

    print(f"Manifest file '{output_manifest}' created with {len(df)} entries.")


In [33]:
from time import sleep

# Split the ID list into fixed-size batches and build one manifest per batch
chunk_size = 200
for part_no, start in enumerate(range(0, len(sample_ids), chunk_size), start=1):
    chunk = sample_ids[start:start + chunk_size]
    chunk_name = f'her2_manifest_part_{part_no}.tsv'

    # Query the API for this batch and write its manifest part
    print(f"Processing chunk {part_no} with {len(chunk)} IDs")
    generate_svs_manifest(chunk, output_manifest=chunk_name)

    # Wait one second before the next API call
    sleep(1)

Processing chunk 1 with 200 IDs
Manifest file 'her2_manifest_part_1.tsv' created with 558 entries.
Processing chunk 2 with 176 IDs
Manifest file 'her2_manifest_part_2.tsv' created with 527 entries.


In [34]:
from glob import glob

# Gather every per-chunk manifest part, in name order
parts = sorted(glob("her2_manifest_part_*.tsv"))
df_all = pd.concat([pd.read_csv(p, sep='\t') for p in parts])

# Drop repeated file ids, then write the combined manifest
df_combined = df_all.drop_duplicates(subset='id')
df_combined.to_csv("her2_manifest_combined.tsv", sep='\t', index=False)

# Report the number of rows actually written (after de-duplication)
print(f"Final combined manifest has {len(df_combined)} entries")

Final combined manifest has 1085 entries


In [35]:
# Keep only FFPE slides (file names containing DX1); remove all other entries

import pandas as pd

# 1. Read the combined (unfiltered) manifest
df = pd.read_csv('her2_manifest_combined.tsv', sep='\t')

# 2. Keep only the DX1 filenames.
#    The trailing '\.' anchors the match to the "-DX1." token so that names such
#    as "-DX10." are not picked up by accident; na=False treats a missing
#    file_name as "no match" instead of producing an invalid boolean mask.
is_dx1 = df['file_name'].str.contains(r'-DX1\.', case=False, regex=True, na=False)
df_dx1 = df[is_dx1]

# 3. Write out the new manifest
df_dx1.to_csv('her2_manifest_clean.tsv', sep='\t', index=False)
print(f"{len(df_dx1)} DX1 entries written.")

# 4. Then from shell:
#    ./gdc-client download -m her2_manifest_clean.tsv -d her2_svs

362 DX1 entries written.


In [5]:
# !./gdc-client download -m her2_manifest_clean.tsv -d her2_svs

import pandas as pd
import os

# Load manifest
df = pd.read_csv('her2_manifest_clean.tsv', sep='\t')

# Ensure output directory exists
os.makedirs("manifests", exist_ok=True)

# Generate per-file manifest
for _, row in df.iterrows():
    with open(f"manifests/{row['id']}.tsv", 'w') as f:
        f.write("id\tfile_name\n")
        f.write(f"{row['id']}\t{row['file_name']}\n")

# 10 workers CPUs        
!ls manifests/*.tsv | xargs -P 10 -I {} ./gdc-client download -m {} -d her2_svs

 43% [###################                         ] ETA:   0:00:03  29.8 MiB/s [32mSuccessfully downloaded[0m: 1
 51% [######################                      ] ETA:   0:00:02  33.3 MiB/s [32mSuccessfully downloaded[0m: 1
 19% [########                                    ] ETA:   0:00:11  32.7 MiB/s [32mSuccessfully downloaded[0m: 1
100% [############################################] Time:  0:00:04  42.6 MiB/s 
 52% [######################                      ] ETA:   0:00:03  58.2 MiB/s [32mSuccessfully downloaded[0m: 1
 38% [################                            ] ETA:   0:00:07  58.0 MiB/s [32mSuccessfully downloaded[0m: 1
100% [############################################] Time:  0:00:07  64.8 MiB/s 
100% [############################################] Time:  0:00:07  63.2 MiB/s 
 31% [#############                               ] ETA:   0:00:07  58.7 MiB/s [32mSuccessfully downloaded[0m: 1
 84% [#####################################       ] ETA:   0:00:01  76

In [6]:
import os
import shutil

# Root download directory from gdc-client, and the flat folder the slides are moved into
input_dir = "her2_svs"
output_dir = "her2_svs_flat"

# Make sure output dir exists
os.makedirs(output_dir, exist_ok=True)

# Walk through the nested folders
for root, dirs, files in os.walk(input_dir):
    for fname in files:
        if not fname.endswith(".svs"):
            continue

        full_path = os.path.join(root, fname)

        # Keep only the part of the filename before the first '.'
        sample_id = fname.split('.')[0]

        # Target name in the flat output folder
        dest_path = os.path.join(output_dir, f"{sample_id}.svs")

        # Two downloads can share the same prefix; never overwrite a slide that
        # is already in place — leave the source where it is and report it
        if os.path.exists(dest_path):
            print(f"skipped {full_path} — {dest_path} already exists")
            continue

        # Move (not copy) the slide out of its per-file download folder
        shutil.move(full_path, dest_path)

        print(f"moved {full_path} → {dest_path}")

moved her2_svs/95f9ca4b-edc6-43cd-8d05-ff92a4ddca49/TCGA-AZ-4684-01Z-00-DX1.1c29deb2-b0e2-4788-a3e8-83ecab7f9208.svs → her2_svs_flat/TCGA-AZ-4684-01Z-00-DX1.svs
moved her2_svs/b7c23d21-c9b8-4d40-b32d-01a7081e5189/TCGA-AA-3675-01Z-00-DX1.9afbbb26-2574-46af-8154-5f39bab6f01a.svs → her2_svs_flat/TCGA-AA-3675-01Z-00-DX1.svs
moved her2_svs/dbfe866c-fc72-4423-9667-9f03c6f76123/TCGA-CM-4751-01Z-00-DX1.F72E1883-5293-4351-A8DC-C4EA5D8F797C.svs → her2_svs_flat/TCGA-CM-4751-01Z-00-DX1.svs
moved her2_svs/0b9fadfc-f3ba-4af2-899c-bf804369fd55/TCGA-A6-A567-01Z-00-DX1.F941874E-9BF7-4E8B-908C-41A638D62275.svs → her2_svs_flat/TCGA-A6-A567-01Z-00-DX1.svs
moved her2_svs/21b6d67b-524b-4172-a741-32790e6d9742/TCGA-NH-A50V-01Z-00-DX1.408BA0A6-E569-4464-A8CB-D6553A4DF9E0.svs → her2_svs_flat/TCGA-NH-A50V-01Z-00-DX1.svs
moved her2_svs/023ca8d1-d8ef-492e-b542-e5e16e1ded09/TCGA-A6-6653-01Z-00-DX1.e130666d-2681-4382-9e7a-4a4d27cb77a4.svs → her2_svs_flat/TCGA-A6-6653-01Z-00-DX1.svs
moved her2_svs/67a751aa-bfd4-46b5-

In [9]:
# Shorten each "...DX1.svs" file in the flat folder to its first three '-'-separated fields
folder = "her2_svs_flat"
for fname in os.listdir(folder):
    if not fname.endswith("DX1.svs"):
        continue

    sample_id = "-".join(fname.split('-')[:3])
    old_path = os.path.join(folder, fname)
    new_path = os.path.join(folder, f"{sample_id}.svs")

    # Leave the file alone when the short name is already taken
    if os.path.exists(new_path):
        print(f"⚠️  Skipped {fname} — already renamed")
        continue

    os.rename(old_path, new_path)
    print(f"🔁 Renamed {fname} → {sample_id}.svs")

🔁 Renamed TCGA-CK-6746-01Z-00-DX1.svs → TCGA-CK-6746.svs
🔁 Renamed TCGA-A6-5667-01Z-00-DX1.svs → TCGA-A6-5667.svs
🔁 Renamed TCGA-AU-3779-01Z-00-DX1.svs → TCGA-AU-3779.svs
🔁 Renamed TCGA-D5-6920-01Z-00-DX1.svs → TCGA-D5-6920.svs
🔁 Renamed TCGA-F4-6856-01Z-00-DX1.svs → TCGA-F4-6856.svs
🔁 Renamed TCGA-EI-6884-01Z-00-DX1.svs → TCGA-EI-6884.svs
🔁 Renamed TCGA-G4-6303-01Z-00-DX1.svs → TCGA-G4-6303.svs
🔁 Renamed TCGA-A6-6653-01Z-00-DX1.svs → TCGA-A6-6653.svs
🔁 Renamed TCGA-DC-6160-01Z-00-DX1.svs → TCGA-DC-6160.svs
🔁 Renamed TCGA-DM-A0XD-01Z-00-DX1.svs → TCGA-DM-A0XD.svs
🔁 Renamed TCGA-AA-3712-01Z-00-DX1.svs → TCGA-AA-3712.svs
🔁 Renamed TCGA-AH-6547-01Z-00-DX1.svs → TCGA-AH-6547.svs
🔁 Renamed TCGA-G4-6321-01Z-00-DX1.svs → TCGA-G4-6321.svs
🔁 Renamed TCGA-DM-A28K-01Z-00-DX1.svs → TCGA-DM-A28K.svs
🔁 Renamed TCGA-DY-A1DG-01Z-00-DX1.svs → TCGA-DY-A1DG.svs
🔁 Renamed TCGA-CM-5860-01Z-00-DX1.svs → TCGA-CM-5860.svs
🔁 Renamed TCGA-F4-6704-01Z-00-DX1.svs → TCGA-F4-6704.svs
🔁 Renamed TCGA-D5-6538-01Z-00-D