# Fetch WGS metadata for samples from list

This notebook is intended to be run from a workspace to pull in metadata from the WGS QC workspace, given a file containing a list of samples to pull.

Please copy this notebook into your workspace and run it there.

In [1]:
# uncomment and run this if you get an error importing toolz - you'll need to restart your notebook kernel after runniing this cell
!pip install toolz
!ls -l

total 188836
-rw-rw-r-- 1 welder-user users    18391 Oct 12 19:42 'Fetch WGS metadata for samples from list.ipynb'
-rw-rw-r-- 1 welder-user users     5076 Oct 12 19:05 'Parse Delcho Sample List.ipynb'
-rw-rw-r-- 1 jupyter     users     1498 Oct 12 15:55  parse_delcho_sample_tsv.py
-rw-rw-r-- 1 jupyter     users 96701674 Oct 12 15:55  wgs_all_terra_delcho_v1_Oct042023_1420_10_tz0000.tsv
-rw-rw-r-- 1 jupyter     users 96632331 Oct 12 16:02  wgs_pass_qc_terra_filtered_except_is_missing_from_cdr_delcho_v1_Oct042023_1420_10_tz0000.tsv


In [2]:
import os
import json
import math
from firecloud import api as fapi
from pprint import pprint
from tqdm import tqdm
import pandas as pd
from toolz.itertoolz import partition_all

## Set up source and target workspace info. You don't need to edit anything in the following cell, assuming this notebook lives in the workspace to which you'd like to copy the metadata

In [3]:
# the data will be copied TO the workspace in which the notebook is run
target_ws_project = os.environ['WORKSPACE_NAMESPACE']
target_ws_name = os.environ['WORKSPACE_NAME']
target_ws_bucket = os.environ['WORKSPACE_BUCKET']

# the data will be copied FROM the WGS QC workspace
source_ws_project = "allofus-drc-wgs-prod"
source_ws_name = "AoU_DRC_WGS"
source_ws_bucket = "gs://fc-secure-089622a2-ae22-4dc8-ad01-634bd7634d5f"

print(f"will copy data from {source_ws_project}/{source_ws_name} to {target_ws_project}/{target_ws_name}")

will copy data from allofus-drc-wgs-prod/AoU_DRC_WGS to allofus-drc-wgs-dev/GVS AoU Echo RD


## Define and fetch the file that contains the list of samples you want to fetch

In [4]:
# EDIT THIS: localize the file containing requested samples
sample_list_file_path = "gs://fc-secure-4c3976f3-d84d-4243-876f-baa9f9a4256f/wgs_pass_qc_terra_filtered_except_is_missing_from_cdr_delcho_v1_Oct042023_1420_10_tz0000.tsv"

sample_list_file = sample_list_file_path.split("/")[-1]
!gsutil cp $sample_list_file_path $sample_list_file
!cat $sample_list_file | wc -l
!head $sample_list_file -n 3

Copying gs://fc-secure-4c3976f3-d84d-4243-876f-baa9f9a4256f/wgs_pass_qc_terra_filtered_except_is_missing_from_cdr_delcho_v1_Oct042023_1420_10_tz0000.tsv...
| [1 files][ 92.2 MiB/ 92.2 MiB]                                                
Operation completed over 1 objects/92.2 MiB.                                     
382517
	research_id	drc_qc_status_wgs	aw2_processing_status_wgs	blocklist_reason_wgs	blocklist_flag_wgs	sample_source_wgs	drc_qc_status_array	aw2_processing_status_array	blocklist_reason_array	blocklist_flag_array	sample_source_array	sample_source	site_id	reblocked_gvcf	sample_id	flt_is_missing_from_cdr	flt_is_rid_duped	flt_suspicious_gvcf_path	flt_suspicious_rgvcf_path	flt_no_array_in_spot	flt_no_array_in_terra	flt_is_manual_blacklist
0	1000000						PASS	PASS		FALSE	WHOLE BLOOD	Whole Blood	bi	gs://prod-genomics-data-broad/ss_vcf_research/BI_A703323229_22088001176_1233191826_1.hard-filtered.gvcf.gz.reblocked.g.vcf.gz	1000000.22088001176	False	False	False	False	False	False	

## Extract the list of sample_ids from your file. 
NOTE: depending on how your input file was formed, you may need to do some munging to get the sample_id format - for short read WGS samples, the sample_id in Terra is composed of {research_id}.{aou_sample_id}

In [5]:
# extract just the list of samples
df_samples = pd.read_csv(sample_list_file, sep='\t', dtype=str)
df_samples.head()
sample_list = list(df_samples['sample_id'])


print(len(sample_list))
print(sample_list[:3])

382516
['1000000.22088001176', '1000004.21092004350', '1000033.21075004006']


## Perform the copy, in batches. 
Feel free to edit the batch size if it helps performance.

In [6]:
# copy data in batches
batch_size = 1000
total_copied = 0

for subset_sample_list in partition_all(batch_size, sample_list): 
    fapi.copy_entities(source_ws_project, source_ws_name, target_ws_project, target_ws_name, 
                       "sample", subset_sample_list, link_existing_entities=False)
    total_copied += len(subset_sample_list)
    print(f"copied {total_copied} samples out of {len(sample_list)}")

copied 1000 samples out of 382516
copied 2000 samples out of 382516
copied 3000 samples out of 382516
copied 4000 samples out of 382516
copied 5000 samples out of 382516
copied 6000 samples out of 382516
copied 7000 samples out of 382516
copied 8000 samples out of 382516
copied 9000 samples out of 382516
copied 10000 samples out of 382516
copied 11000 samples out of 382516
copied 12000 samples out of 382516
copied 13000 samples out of 382516
copied 14000 samples out of 382516
copied 15000 samples out of 382516
copied 16000 samples out of 382516
copied 17000 samples out of 382516
copied 18000 samples out of 382516
copied 19000 samples out of 382516
copied 20000 samples out of 382516
copied 21000 samples out of 382516
copied 22000 samples out of 382516
copied 23000 samples out of 382516
copied 24000 samples out of 382516
copied 25000 samples out of 382516
copied 26000 samples out of 382516
copied 27000 samples out of 382516
copied 28000 samples out of 382516
copied 29000 samples out of 3

copied 232000 samples out of 382516
copied 233000 samples out of 382516
copied 234000 samples out of 382516
copied 235000 samples out of 382516
copied 236000 samples out of 382516
copied 237000 samples out of 382516
copied 238000 samples out of 382516
copied 239000 samples out of 382516
copied 240000 samples out of 382516
copied 241000 samples out of 382516
copied 242000 samples out of 382516
copied 243000 samples out of 382516
copied 244000 samples out of 382516
copied 245000 samples out of 382516
copied 246000 samples out of 382516
copied 247000 samples out of 382516
copied 248000 samples out of 382516
copied 249000 samples out of 382516
copied 250000 samples out of 382516
copied 251000 samples out of 382516
copied 252000 samples out of 382516
copied 253000 samples out of 382516
copied 254000 samples out of 382516
copied 255000 samples out of 382516
copied 256000 samples out of 382516
copied 257000 samples out of 382516
copied 258000 samples out of 382516
copied 259000 samples out of

# now that the data have been copied, you can make sample sets if you wish

In [9]:
# function to make new sample set
def make_new_sample_set(set_name, sample_list):
    df_sample_set_table = pd.DataFrame({"membership:sample_set_id" : [set_name]*len(sample_list),
                                    "sample" : sample_list})

    # save to tsv
    sample_set_tsv = set_name + "_sample_set_table.tsv"
    df_sample_set_table.to_csv(sample_set_tsv, sep="\t", index=False)

    # upload to Terra
    response = fapi.upload_entities_tsv(target_ws_project, target_ws_name, sample_set_tsv, model="flexible")
    
    if response.status_code != 200:
        print("ERROR")
        print(response)
        pprint(response.text)
        exit(1)
    else:
        print(f"successfully created new sample_set {set_name} containing {len(sample_list)} samples")

In [11]:
# Make one big sample set
set_name = "Echo_ScaleTest"
print(f"creating sample set {set_name}...")
make_new_sample_set(set_name, sample_list)

# create batches of samples into sets
# SUBSET_SIZE = 50000


# all_set_samples = sample_list

# subset_number = 0

# new_subset_names = []

# for subset_sample_list in partition_all(SUBSET_SIZE, all_set_samples): 
#     # name the subset
#     subset_number += 1
#     subset_name = f"{set_name}_subset_{subset_number}"
#     print(f"creating sub set {subset_name}...")
#     make_new_sample_set(subset_name, subset_sample_list)
#     new_subset_names.append(subset_name)

# print(new_subset_names)

creating sample set Echo_ScaleTest...
ERROR
<Response [502]>
('\n'
 '<html><head>\n'
 '<meta http-equiv="content-type" content="text/html;charset=utf-8">\n'
 '<title>502 Server Error</title>\n'
 '</head>\n'
 '<body text=#000000 bgcolor=#ffffff>\n'
 '<h1>Error: Server Error</h1>\n'
 '<h2>The server encountered a temporary error and could not complete your '
 'request.<p>Please try again in 30 seconds.</h2>\n'
 '<h2></h2>\n'
 '</body></html>\n')
