In [62]:
import pandas as pd
import math
import requests
import pdb
import datetime
import os

# Real G0-arrest labels

In [59]:
# Directory that holds the input CSV files.
# NOTE(review): absolute, machine-specific path — has to be edited before the
# notebook can run anywhere else.
root = '/Users/awxlong/Desktop/my-studies/hpc_exps/Data/'
# Cohort and task identifiers; they are only used to build the two file names below.
cohort_name = 'COAD'
task_name = 'g0_arrest'
# df1 is read from local_cohort_<cohort_name>.csv
df1 = pd.read_csv(f'{root}local_cohort_{cohort_name}.csv')
# df2 is read from <cohort_name>_<task_name>.csv (presumably the per-patient task labels — confirm)
df2 = pd.read_csv(f'{root}{cohort_name}_{task_name}.csv')

In [60]:
# Distinct patient IDs found in each table.
df1_patients = set(df1['PatientID'].unique())
df2_patients = set(df2['PatientID'].unique())

# IDs that occur in df2 but have no row in df1.
missing_patients = df2_patients.difference(df1_patients)
# missing_patients = df1_patients - df2_patients
len(missing_patients)

# Join the two tables on PatientID (pandas' default join type, i.e. inner);
# the cell displays the shape of the joined frame.
df3 = df2.merge(df1, on='PatientID')
df3.shape

(435, 14)

In [61]:
# Scratch arithmetic on patient counts. Only the value of the last expression
# is displayed by the cell; the first two lines have no visible effect.
# NOTE(review): 165, 427, 149, 592 and 576 are hard-coded and their provenance
# is not shown in this notebook — confirm them before relying on the result.
df3['PatientID'].nunique() + 165
427 + 149
592 - (576)

16

# Get patient IDs with diagnostic slides

In [None]:
# import requests
# import requests
import json

def query_gdc(patient_ids, session=None, timeout=60):
    """Query the GDC API once per patient for slide, case and clinical-file info.

    For every patient ID three POST requests are sent: one to ``files`` for
    diagnostic slide images, one to ``cases`` for the primary site, and one to
    ``files`` for clinical files. A short report is printed per patient.

    Args:
        patient_ids: Iterable of case submitter IDs.
        session: Optional object exposing a ``post(url, json=..., timeout=...)``
            method (for example a ``requests.Session``). When omitted, the
            module-level ``requests`` functions are used.
        timeout: Timeout in seconds passed to every HTTP request.

    Returns:
        dict mapping each patient ID to a dict with the keys
        ``has_diagnostic_slide``, ``tissue_type``, ``has_clinical_xml``,
        ``clinical_xml_count`` and ``total_clinical_files``.

    Raises:
        Whatever ``raise_for_status()`` of the HTTP response raises when the
        API answers with an error status.
    """
    base_url = "https://api.gdc.cancer.gov/"
    http = requests if session is None else session

    def fetch_hits(endpoint, payload):
        # Surface HTTP failures directly instead of failing later while
        # indexing into an error payload.
        response = http.post(f"{base_url}{endpoint}", json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()['data']['hits']

    results = {}

    for patient_id in patient_ids:
        patient_filter = {"op": "in", "content": {"field": "cases.submitter_id", "value": [patient_id]}}

        # Query for diagnostic slides
        slide_hits = fetch_hits("files", {
            "filters": {
                "op": "and",
                "content": [
                    patient_filter,
                    {"op": "in", "content": {"field": "files.data_type", "value": ["Slide Image"]}},
                    {"op": "in", "content": {"field": "files.experimental_strategy", "value": ["Diagnostic Slide"]}}
                ]
            },
            "format": "JSON",
            "size": "1"
        })
        has_diagnostic_slide = len(slide_hits) > 0

        # Query for the case record. The field that is read below
        # (primary_site) is now the field that is requested.
        case_hits = fetch_hits("cases", {
            "filters": patient_filter,
            "format": "JSON",
            "fields": "primary_site",
            "size": "1"
        })
        # An unknown patient yields no hits; report that instead of raising IndexError.
        if case_hits:
            tissue_type = case_hits[0].get('primary_site', "Not available")
        else:
            tissue_type = "Not available"

        # Query for clinical data
        clinical_files = fetch_hits("files", {
            "filters": {
                "op": "and",
                "content": [
                    patient_filter,
                    {"op": "in", "content": {"field": "files.data_category", "value": ["clinical"]}},
                ]
            },
            "format": "JSON",
            "fields": "file_name,file_id,data_format,data_type",
            "size": "1000"  # Large page size so that all files of a patient are returned
        })

        total_clinical_files = len(clinical_files)
        xml_files = [file for file in clinical_files if file['file_name'].lower().endswith('.xml') or file['data_format'].lower() == 'xml']
        xml_file_count = len(xml_files)

        has_clinical_data = total_clinical_files > 0

        print(f"Patient ID: {patient_id}")
        print(f"Has diagnostic slide: {has_diagnostic_slide}")
        print(f"Tissue type: {tissue_type}")
        print(f"Has clinical data: {has_clinical_data}")
        print(f"Total number of clinical files: {total_clinical_files}")
        print(f"Number of clinical XML files: {xml_file_count}")
        print('-----')
        print()

        results[patient_id] = {
            "has_diagnostic_slide": has_diagnostic_slide,
            # Holds the primary site of the case; the key name is kept for
            # compatibility with existing result files.
            "tissue_type": tissue_type,
            # NOTE(review): despite its name this flag is True when ANY
            # clinical file exists, not only XML files. Kept as-is because
            # downstream filtering depends on it — confirm the intent.
            "has_clinical_xml": has_clinical_data,
            "clinical_xml_count": xml_file_count,
            "total_clinical_files": total_clinical_files
        }

    return results

def get_tissue_type(case_data):
    """Return the tissue type of the first sample of the first case hit.

    Args:
        case_data: Parsed response whose ``['data']['hits']`` entry is a list
            of case dicts.

    Returns:
        The ``tissue_type`` value of the first sample of the first hit, or the
        string "Not available" when there is no hit, no sample, or no
        ``tissue_type`` key on that sample.
    """
    fallback = "Not available"
    hits = case_data['data']['hits']
    if hits:
        samples = hits[0].get('samples')
        if samples:
            return samples[0].get('tissue_type', fallback)
    return fallback

def save_results(results):
    """Write the query results to two JSON files in the working directory.

    ``gdc_query_results.json`` receives every entry of ``results``;
    ``filtered_gdc_query_results.json`` receives only the entries whose
    ``has_diagnostic_slide`` and ``has_clinical_xml`` values are both truthy.
    A confirmation line is printed after each file is written.

    Args:
        results: dict mapping patient IDs to per-patient result dicts.
    """
    def _dump(payload, path):
        with open(path, 'w') as out:
            json.dump(payload, out, indent=2)

    filename = "gdc_query_results.json"
    _dump(results, filename)
    print(f"Results saved to {filename}")

    # Keep only patients that have a diagnostic slide and the clinical flag set.
    filtered_results = {}
    for patient_id, data in results.items():
        if data['has_diagnostic_slide'] and data['has_clinical_xml']:
            filtered_results[patient_id] = data

    filtered_filename = "filtered_gdc_query_results.json"
    _dump(filtered_results, filtered_filename)
    print(f"Filtered results saved to {filtered_filename}")

# Example usage
# Queries the GDC API for every ID in `missing_patients` (defined in an earlier
# cell) and writes the JSON result files via save_results().
# NOTE: this performs several HTTP requests per patient when the cell is run.
patient_ids = list(missing_patients) # Replace with your list of patient IDs
results = query_gdc(patient_ids)
save_results(results)


# Generate the GDC manifest (gdc_manifest.txt) for the remaining patients

In [None]:
import json
import requests

def read_filtered_results(filename):
    """Parse the JSON file at ``filename`` and return the resulting object."""
    with open(filename, 'r') as handle:
        parsed = json.load(handle)
    return parsed

def get_eligible_patient_ids(results):
    """Return the IDs whose entry has both flags set.

    Args:
        results: dict mapping patient IDs to dicts that contain the keys
            ``has_diagnostic_slide`` and ``has_clinical_xml``.

    Returns:
        List of patient IDs, in the iteration order of ``results``, for which
        both values are truthy.
    """
    eligible = []
    for patient_id, data in results.items():
        if not data['has_diagnostic_slide']:
            continue
        if data['has_clinical_xml']:
            eligible.append(patient_id)
    return eligible

def generate_gdc_manifest(patient_ids, session=None, timeout=60):
    """Collect manifest rows for the diagnostic slide images of the given patients.

    One POST request is sent to the GDC ``files`` endpoint per patient ID.

    Args:
        patient_ids: Iterable of case submitter IDs.
        session: Optional object exposing a ``post(url, json=..., timeout=...)``
            method (for example a ``requests.Session``). When omitted, the
            module-level ``requests`` functions are used.
        timeout: Timeout in seconds passed to every HTTP request.

    Returns:
        List of dicts with the keys ``id``, ``filename``, ``md5`` and ``size``,
        one per distinct file ID, in the order the files were first seen.

    Raises:
        Whatever ``raise_for_status()`` of the HTTP response raises when the
        API answers with an error status.
    """
    base_url = "https://api.gdc.cancer.gov/"
    http = requests if session is None else session
    manifest_data = []
    seen_file_ids = set()  # guards against duplicate rows, e.g. repeated patient IDs

    for patient_id in patient_ids:
        query = {
            "filters": {
                "op": "and",
                "content": [
                    {"op": "in", "content": {"field": "cases.submitter_id", "value": [patient_id]}},
                    {"op": "in", "content": {"field": "files.data_type", "value": ["Slide Image"]}},
                    {"op": "in", "content": {"field": "files.experimental_strategy", "value": ["Diagnostic Slide"]}}
                ]
            },
            "fields": "file_id,file_name,md5sum,file_size",
            "format": "JSON",
            "size": "1000"
        }

        response = http.post(f"{base_url}files", json=query, timeout=timeout)
        # Fail on HTTP errors instead of indexing into an error payload.
        response.raise_for_status()

        for file in response.json()['data']['hits']:
            file_id = file['file_id']
            if file_id in seen_file_ids:
                continue
            seen_file_ids.add(file_id)
            manifest_data.append({
                "id": file_id,
                "filename": file['file_name'],
                "md5": file['md5sum'],
                "size": file['file_size'],
            })

    return manifest_data

def write_gdc_manifest(manifest_data, filename="gdc_manifest.txt"):
    """Write manifest rows to a tab-separated text file and print a confirmation.

    Args:
        manifest_data: Iterable of dicts with the keys ``id``, ``filename``,
            ``md5`` and ``size``.
        filename: Destination path; an existing file is overwritten.
    """
    rows = (
        f"{item['id']}\t{item['filename']}\t{item['md5']}\t{item['size']}\n"
        for item in manifest_data
    )
    with open(filename, 'w') as out:
        out.write("id\tfilename\tmd5\tsize\n")
        out.writelines(rows)
    print(f"GDC manifest file created: {filename}")




In [None]:
# Main execution
# Reads the UNFILTERED results file written by save_results(); the filtering
# to patients with a diagnostic slide and the clinical flag happens in memory
# via get_eligible_patient_ids().
json_filename = "gdc_query_results.json"  # Replace with your actual filename
results = read_filtered_results(json_filename)
eligible_patient_ids = get_eligible_patient_ids(results)

print(f"Number of eligible patients: {len(eligible_patient_ids)}")
print("Generating GDC manifest...")

# One GDC request per eligible patient; the rows are then written to the
# default manifest path of write_gdc_manifest ("gdc_manifest.txt").
manifest_data = generate_gdc_manifest(eligible_patient_ids)
write_gdc_manifest(manifest_data)

print("Process completed.")

# Download clinical data of the remaining patients

In [66]:


def read_filtered_results(filename):
    """Read ``filename`` as text and return the decoded JSON content."""
    with open(filename, 'r') as source:
        raw_text = source.read()
    return json.loads(raw_text)

def get_patients_with_diagnostic_slides(results):
    """Return the patient IDs whose ``has_diagnostic_slide`` value is truthy.

    Args:
        results: dict mapping patient IDs to dicts containing the key
            ``has_diagnostic_slide``.

    Returns:
        List of matching patient IDs in the iteration order of ``results``.
    """
    with_slides = []
    for patient_id, data in results.items():
        if data['has_diagnostic_slide']:
            with_slides.append(patient_id)
    return with_slides

def get_clinical_data_files(patient_ids, session=None, timeout=60):
    """List the clinical files registered in GDC for each patient.

    One POST request is sent to the GDC ``files`` endpoint per patient ID.

    Args:
        patient_ids: Iterable of case submitter IDs.
        session: Optional object exposing a ``post(url, json=..., timeout=...)``
            method (for example a ``requests.Session``). When omitted, the
            module-level ``requests`` functions are used.
        timeout: Timeout in seconds passed to every HTTP request.

    Returns:
        List of dicts with the keys ``file_id``, ``file_name``,
        ``data_format``, ``data_type`` and ``patient_id``. A file that is
        returned for several patients appears once per patient.

    Raises:
        Whatever ``raise_for_status()`` of the HTTP response raises when the
        API answers with an error status.
    """
    base_url = "https://api.gdc.cancer.gov/"
    http = requests if session is None else session
    clinical_files = []

    for patient_id in patient_ids:
        query = {
            "filters": {
                "op": "and",
                "content": [
                    {"op": "in", "content": {"field": "cases.submitter_id", "value": [patient_id]}},
                    {"op": "in", "content": {"field": "files.data_category", "value": ["clinical"]}},
                ]
            },
            "fields": "file_id,file_name,data_format,data_type,cases.submitter_id",
            "format": "JSON",
            "size": "1000"
        }

        response = http.post(f"{base_url}files", json=query, timeout=timeout)
        # Fail on HTTP errors instead of indexing into an error payload.
        response.raise_for_status()

        for file in response.json()['data']['hits']:
            clinical_files.append({
                "file_id": file['file_id'],
                "file_name": file['file_name'],
                "data_format": file['data_format'],
                "data_type": file['data_type'],
                # Label with the patient that was queried. The first entry of
                # the file's `cases` list is not guaranteed to be that patient
                # when a file is linked to more than one case.
                "patient_id": patient_id
            })

    return clinical_files

def download_clinical_files(clinical_files, download_dir, session=None, timeout=60):
    """Download each listed file from the GDC ``data`` endpoint into ``download_dir``.

    Every file is stored as ``<patient_id>_<file_name>``. A non-200 response is
    reported on stdout and the file is skipped; the remaining files are still
    processed.

    Args:
        clinical_files: Iterable of dicts with the keys ``file_id``,
            ``file_name`` and ``patient_id``.
        download_dir: Target directory; created (including parents) if missing.
        session: Optional object exposing a ``get(url, headers=..., timeout=...)``
            method (for example a ``requests.Session``). When omitted, the
            module-level ``requests`` functions are used.
        timeout: Timeout in seconds passed to every HTTP request.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(download_dir, exist_ok=True)

    data_endpt = "https://api.gdc.cancer.gov/data/"
    http = requests if session is None else session

    for file in clinical_files:
        response = http.get(
            f"{data_endpt}{file['file_id']}",
            headers={"Content-Type": "application/json"},
            timeout=timeout,
        )

        if response.status_code == 200:
            file_path = os.path.join(download_dir, f"{file['patient_id']}_{file['file_name']}")
            with open(file_path, "wb") as f:
                f.write(response.content)
            print(f"Downloaded: {file_path}")
        else:
            print(f"Failed to download file for patient {file['patient_id']}: {response.status_code}")

# Main execution
# Input: the filtered results file written by save_results().
json_filename = "filtered_gdc_query_results.json"  # Replace with your actual filename
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
download_directory = "/Users/awxlong/Desktop/my-studies/temp_data/COAD/TCGA-COAD/"  # Replace with your desired download directory

results = read_filtered_results(json_filename)
patients_with_slides = get_patients_with_diagnostic_slides(results)

print(f"Number of patients with diagnostic slides: {len(patients_with_slides)}")
print("Retrieving clinical data file information...")

# One GDC request per patient to list the clinical files.
clinical_files = get_clinical_data_files(patients_with_slides)

print(f"Number of clinical data files found: {len(clinical_files)}")
print("Downloading clinical data files...")

# One HTTP download per listed file, written into download_directory.
download_clinical_files(clinical_files, download_directory)

print("Process completed.")


In [72]:
import json
import requests
import os
import subprocess


def read_filtered_results(filename):
    """Open ``filename`` for reading and return the JSON document it contains."""
    source = open(filename, 'r')
    try:
        return json.load(source)
    finally:
        source.close()

def get_patients_with_diagnostic_slides(results):
    """Select the patient IDs flagged as having a diagnostic slide.

    Args:
        results: dict mapping patient IDs to dicts containing the key
            ``has_diagnostic_slide``.

    Returns:
        List of the IDs whose ``has_diagnostic_slide`` value is truthy, in the
        iteration order of ``results``.
    """
    def _has_slide(entry):
        _, data = entry
        return bool(data['has_diagnostic_slide'])

    return [patient_id for patient_id, _ in filter(_has_slide, results.items())]

def get_clinical_data_files(patient_ids, session=None, timeout=60):
    """List the clinical files registered in GDC for each patient.

    One POST request is sent to the GDC ``files`` endpoint per patient ID.

    Args:
        patient_ids: Iterable of case submitter IDs.
        session: Optional object exposing a ``post(url, json=..., timeout=...)``
            method (for example a ``requests.Session``). When omitted, the
            module-level ``requests`` functions are used.
        timeout: Timeout in seconds passed to every HTTP request.

    Returns:
        List of dicts with the keys ``file_id``, ``file_name``,
        ``data_format``, ``data_type`` and ``patient_id``. A file that is
        returned for several patients appears once per patient.

    Raises:
        Whatever ``raise_for_status()`` of the HTTP response raises when the
        API answers with an error status.
    """
    base_url = "https://api.gdc.cancer.gov/"
    http = requests if session is None else session
    clinical_files = []

    for patient_id in patient_ids:
        query = {
            "filters": {
                "op": "and",
                "content": [
                    {"op": "in", "content": {"field": "cases.submitter_id", "value": [patient_id]}},
                    {"op": "in", "content": {"field": "files.data_category", "value": ["clinical"]}},
                ]
            },
            "fields": "file_id,file_name,data_format,data_type,cases.submitter_id",
            "format": "JSON",
            "size": "1000"
        }

        response = http.post(f"{base_url}files", json=query, timeout=timeout)
        # Fail on HTTP errors instead of indexing into an error payload.
        response.raise_for_status()

        for file in response.json()['data']['hits']:
            clinical_files.append({
                "file_id": file['file_id'],
                "file_name": file['file_name'],
                "data_format": file['data_format'],
                "data_type": file['data_type'],
                # Label with the patient that was queried. The first entry of
                # the file's `cases` list is not guaranteed to be that patient
                # when a file is linked to more than one case.
                "patient_id": patient_id
            })

    return clinical_files

def create_manifest_file(clinical_files, manifest_filename="gdc_manifest.txt"):
    """Write a tab-separated download manifest for the given files.

    Only the ``id`` and ``filename`` columns are filled; ``md5``, ``size`` and
    ``state`` are left empty. Each file ID is written once, even when the same
    file is listed for several patients, so the manifest holds no duplicate rows.

    Args:
        clinical_files: Iterable of dicts with the keys ``file_id`` and
            ``file_name``.
        manifest_filename: Destination path; an existing file is overwritten.
            NOTE(review): the default equals the default of
            ``write_gdc_manifest``, so both write to the same file unless a
            name is passed explicitly.

    Returns:
        ``manifest_filename``, unchanged.
    """
    written_ids = set()
    with open(manifest_filename, 'w') as f:
        f.write("id\tfilename\tmd5\tsize\tstate\n")
        for file in clinical_files:
            file_id = file['file_id']
            if file_id in written_ids:
                # Same file already listed (e.g. returned for another patient).
                continue
            written_ids.add(file_id)
            f.write(f"{file_id}\t{file['file_name']}\t\t\t\n")
    print(f"Manifest file created: {manifest_filename}")
    return manifest_filename

def download_files_with_gdc_client(manifest_file, download_dir,
                                   gdc_client_path="/Users/awxlong/Downloads/gdc-client"):
    """Run ``gdc-client download`` for a manifest and wait for it to finish.

    Args:
        manifest_file: Path of the manifest passed to ``-m``.
        download_dir: Target directory passed to ``-d``; created (including
            parents) if missing.
        gdc_client_path: Path of the gdc-client executable. The default is the
            previously hard-coded location.

    Raises:
        subprocess.CalledProcessError: if the client exits with a non-zero status.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(download_dir, exist_ok=True)

    # Argument list without a shell: paths containing spaces or shell
    # metacharacters are passed through verbatim instead of being re-parsed.
    command = [str(gdc_client_path), "download", "-m", str(manifest_file), "-d", str(download_dir)]

    print("Starting bulk download with gdc-client...")
    subprocess.run(command, check=True)
    print("Bulk download completed.")

# Main execution
# Input: the filtered results file written by save_results().
json_filename = "filtered_gdc_query_results.json"  # Replace with your actual filename
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
download_directory = "/Users/awxlong/Desktop/my-studies/temp_data/COAD/TCGA-COAD/"  # Replace with your desired download directory

results = read_filtered_results(json_filename)
patients_with_slides = get_patients_with_diagnostic_slides(results)

print(f"Number of patients with diagnostic slides: {len(patients_with_slides)}")
print("Retrieving clinical data file information...")

# One GDC request per patient to list the clinical files.
clinical_files = get_clinical_data_files(patients_with_slides)

print(f"Number of clinical data files found: {len(clinical_files)}")

# Written to the default path "gdc_manifest.txt" — the same default that
# write_gdc_manifest() uses, so a slide manifest at that path is overwritten.
manifest_file = create_manifest_file(clinical_files)

# Blocks until the external gdc-client process has finished.
download_files_with_gdc_client(manifest_file, download_directory)

print("Process completed.")


Number of patients with diagnostic slides: 149
Retrieving clinical data file information...
Number of clinical data files found: 1346
Manifest file created: gdc_manifest.txt
Starting bulk download with gdc-client...


100% [#######################################################################] 
100% [#######################################################################] 
100% [#######################################################################] 
100% [#######################################################################] 
100% [#######################################################################] 
100% [#######################################################################] 
100% [#######################################################################] 
100% [#######################################################################] 
100% [#######################################################################] 
100% [#######################################################################] 
100% [#######################################################################] 
100% [#######################################################################] 
100% [##################################

100% [############################################] Time:  0:00:01  32.7 KiB/s 
100% [############################################] Time:  0:00:00  18.8 KiB/s 
100% [############################################] Time:  0:00:01 272.4 KiB/s 
100% [############################################] Time:  0:00:01 179.4 KiB/s 
100% [############################################] Time:  0:00:00  51.8 KiB/s 
100% [############################################] Time:  0:00:01 113.4 KiB/s 
100% [############################################] Time:  0:00:00  11.1 KiB/s 
100% [############################################] Time:  0:00:00  46.2 KiB/s 
100% [############################################] Time:  0:00:01   1.1 KiB/s 
100% [############################################] Time:  0:00:00  11.9 KiB/s 
100% [############################################] Time:  0:00:01  30.7 KiB/s 
100% [############################################] Time:  0:00:01  43.9 KiB/s 
100% [##################################