# Create a sample_set for each lab_batch in the sample table

Run the cell below to parse through all samples in the `sample` table and generate a `sample_set` for each group of samples that share the same `lab_batch`. Each created sample_set is named after its `lab_batch`.

This notebook can only _add_ samples to sample_sets (or create them); it cannot remove samples from existing sample_sets. This means that if the `lab_batch` of a sample is changed from `A` to `B`, the sample will be added to sample_set `B` but not removed from `A`.

In [None]:
import firecloud.api as fapi
import json
import csv
import os
from datetime import datetime, timezone
import pytz

workspace_namespace = os.environ.get('WORKSPACE_NAMESPACE')
workspace_name = os.environ.get('WORKSPACE_NAME')


# Download and unzip current sample_set table
print('Downloading and unzipping sample_set table...')
sample_set_response = fapi.get_entities_tsv(workspace_namespace, workspace_name, 'sample_set')
if not sample_set_response.ok:
    raise RuntimeError(f'ERROR: {sample_set_response.text}')

if os.path.exists('CreateSampleSets_data'):
    !rm -r CreateSampleSets_data
!mkdir -p CreateSampleSets_data
with open('CreateSampleSets_data/sample_set.zip', 'wb') as f:
    f.write(sample_set_response.content)
!cd CreateSampleSets_data && unzip sample_set.zip && cd ..

existing_sample_sets = dict()

with open('CreateSampleSets_data/sample_set_membership.tsv') as membership_file:
    for line in csv.DictReader(membership_file, delimiter='\t'):
        set_id = line['membership:sample_set_id']
        if set_id not in existing_sample_sets:
            existing_sample_sets[set_id] = set()
        existing_sample_sets[set_id].add(line['sample'])
        
# Fetch every entity of type `sample` from the workspace and decode the JSON.
print('Reading samples table...')
sample_response = fapi.get_entities(
    workspace_namespace, workspace_name, 'sample'
)
if not sample_response.ok:
    raise RuntimeError(f'ERROR: {sample_response.text}')

samples = json.loads(sample_response.text)

# Build a membership TSV holding only the (lab_batch, sample) pairs that are
# not already recorded, and remember which sample_sets those rows touch.
added_sample_sets = set()
with open('CreateSampleSets_data/new_sample_set_membership.tsv', 'w') as new_membership_file:
    # Header row of the membership TSV
    new_membership_file.write('membership:sample_set_id\tsample\n')
    for sample in samples:
        attributes = sample['attributes']
        # Samples without a lab_batch are not assigned to any sample_set.
        if 'lab_batch' not in attributes:
            continue
        sample_name = sample['name']
        lab_batch = attributes['lab_batch']

        # An unknown lab_batch yields an empty tuple, so the sample is never
        # treated as already present in a sample_set that does not exist yet.
        already_member = sample_name in existing_sample_sets.get(lab_batch, ())
        if not already_member:
            new_membership_file.write(f'{lab_batch}\t{sample_name}\n')
            added_sample_sets.add(lab_batch)
    
if len(added_sample_sets) == 0:
    print('No new sample_sets to be added.')
else:
    # Uploading new sample_set table
    print('Uploading new sample_set table... ')
    upload_response = fapi.upload_entities_tsv(workspace_namespace, workspace_name, 'CreateSampleSets_data/new_sample_set_membership.tsv')
    if not upload_response.ok:
        raise RuntimeError(f'ERROR: {upload_response.text}')

    # Add date and time created to sample_set
    print('Adding date and time to newly created sample_sets...')
    
    now = str(datetime.now(pytz.timezone('US/Eastern')))
    for i, lab_batch in enumerate(added_sample_sets):
        update_response = fapi.update_entity(workspace_namespace, workspace_name, 'sample_set', lab_batch, [{"op": "AddUpdateAttribute", "attributeName": "time_sample_set_created", "addUpdateAttribute": now }])
        if not update_response.ok:
            raise RuntimeError(f'ERROR: {update_response.text}')
        print(f'    Completed {i+1}/{len(added_sample_sets)}')

    print('SUCCESS')
    print('Printing update sample_set_membership.tsv:')
    !cat CreateSampleSets_data/new_sample_set_membership.tsv
!rm -r CreateSampleSets_data