In [None]:
import time
import os
import json
from pathlib import Path
import subprocess
import pandas as pd

# -------------------------------------------------------
# 1. Locate ONE parcel
# -------------------------------------------------------
# Root of the downloaded TCGA data, and the directory that receives unpacked files.
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
tcga_root = Path("/mnt/d/BioFM/data/tcga")
extract_root = tcga_root / "extracted"
extract_root.mkdir(exist_ok=True)

# rglob() yields paths in filesystem order, which is not stable across runs or
# machines. Sorting makes "the first parcel" the same file on every run.
parcel_files = sorted(tcga_root.rglob("*.parcel"))
if not parcel_files:
    raise FileNotFoundError("No .parcel files found.")

# Only a single parcel is processed in this cell.
parcel = parcel_files[0]
print(f"Parcel selected:\n{parcel}\n")


# -------------------------------------------------------
# 2. Unpack and time it
# -------------------------------------------------------
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()

# NOTE(review): the client path is relative to the kernel's working directory.
cmd = ["./utils/gdc-client", "unpack", str(parcel), "-d", str(extract_root)]
# Capture stdout/stderr as text so they can be reported below.
result = subprocess.run(cmd, capture_output=True, text=True)

elapsed = time.perf_counter() - start

# A non-zero exit status is reported but does not stop the cell.
if result.returncode != 0:
    print("Error running gdc-client unpack:\n")
    print(result.stderr)
else:
    print(result.stdout or "Unpack completed.")


print(f"\nUnpack time: {elapsed:.2f} seconds\n")


# -------------------------------------------------------
# 3. Compute total size of extracted files
# -------------------------------------------------------
def get_dir_size(path):
    """Return the total size in bytes of all files under ``path``, recursively.

    Entries whose size cannot be read (for example a dangling symlink, or a
    file removed while the walk is in progress) are skipped instead of
    aborting the whole computation.

    Args:
        path: Directory to walk (``str`` or path-like).

    Returns:
        int: Sum of ``os.path.getsize`` over every readable file. An empty
        directory gives 0.
    """
    total = 0
    for root, _dirs, files in os.walk(path):
        for name in files:
            try:
                total += os.path.getsize(os.path.join(root, name))
            except OSError:
                # Unreadable entry (e.g. broken symlink): leave it out of the total.
                continue
    return total

# Measure everything under the extraction directory, then convert the byte
# count to megabytes (1 MB = 2**20 bytes here) for display.
size_bytes = get_dir_size(extract_root)
size_mb = size_bytes / 2 ** 20

print(f"Total extracted size: {size_mb:.2f} MB\n")


# -------------------------------------------------------
# 4. List extracted files
# -------------------------------------------------------
print("Extracted files:")
# iterdir() returns entries in arbitrary order; sort so the listing is the same
# on every run. Only the top level of extract_root is listed.
for entry in sorted(extract_root.iterdir()):
    print(" -", entry.name)


# -------------------------------------------------------
# 5. Load metadata + counts (quick preview)
# -------------------------------------------------------
# Sort so that the "first" metadata / counts file is the same on every run
# (glob order is filesystem-dependent).
# NOTE(review): both globs only look at the top level of extract_root — confirm
# the unpacked files are not nested in sub-directories.
metadata_files = sorted(extract_root.glob("*.json"))
tsv_files = sorted(p for p in extract_root.glob("*.tsv") if "counts" in p.name)

if metadata_files:
    # Read JSON as UTF-8 explicitly instead of relying on the locale default.
    with open(metadata_files[0], encoding="utf-8") as f:
        meta = json.load(f)

    try:
        first_case = meta["cases"][0]
        cohort = first_case["project"]["project_id"]
        barcode = first_case["submitter_id"]
    except (KeyError, IndexError, TypeError) as exc:
        # Unexpected layout: report it, but still show the counts preview below.
        print(f"\nCould not read case info from {metadata_files[0].name}: {exc!r}")
    else:
        print("\nCohort:", cohort)
        print("Sample barcode:", barcode)

if tsv_files:
    # Tab-separated table; show its shape and the first rows only.
    df = pd.read_csv(tsv_files[0], sep="\t")
    print(f"\nTSV size: {df.shape}")
    display(df.head())


In [None]:
import os
import json
import time
from pathlib import Path
import subprocess
import pandas as pd

# -----------------------------------------
# Paths
# -----------------------------------------
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
tcga_root = Path("/mnt/d/BioFM/data/tcga")
# Unpacked files for this test run go into their own sub-directory.
extract_root = tcga_root / "extracted_test"
extract_root.mkdir(exist_ok=True)

# -----------------------------------------
# Step 1 — find ONE parcel file
# -----------------------------------------
# rglob() yields paths in filesystem order, which is not stable across runs or
# machines. Sorting makes "the first parcel" the same file on every run.
parcel_files = sorted(tcga_root.rglob("*.parcel"))
if not parcel_files:
    raise FileNotFoundError("No .parcel files found in tcga directory.")

parcel = parcel_files[0]
print(f"Selected parcel:\n{parcel}\n")

# -----------------------------------------
# Step 2 — unpack
# -----------------------------------------
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()

# NOTE(review): the client path is relative to the kernel's working directory.
cmd = ["./utils/gdc-client", "unpack", str(parcel), "-d", str(extract_root)]
# Display only: arguments are joined with spaces and not shell-quoted.
print("Running:", " ".join(cmd))

# Capture stdout/stderr as text so they can be reported below.
result = subprocess.run(cmd, capture_output=True, text=True)

elapsed = time.perf_counter() - start

# A non-zero exit status is reported but does not stop the cell.
if result.returncode != 0:
    print("\n❌ Error unpacking:")
    print(result.stderr)
else:
    print("\n✓ Unpacked successfully.")
    print(result.stdout)

print(f"\nElapsed time: {elapsed:.2f} seconds\n")

# -----------------------------------------
# Step 3 — identify extracted files
# -----------------------------------------
# Top-level entries of the extraction directory. iterdir() order is arbitrary,
# so sort to keep the listing identical between runs.
all_extracted = sorted(extract_root.iterdir())

print("Extracted files:")
for f in all_extracted:
    print("  -"
