# Create a sample_set for each lab_batch in the sample table

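Run the cell below to scan every data table that has both an `is_control_sample` and a `lab_batch` column (typically the `sample` table) and create a `sample_set` for each group of samples that share the same `lab_batch`. Each sample_set is named after its `lab_batch` value, and every sample_set that receives new samples is stamped with a `time_sample_set_created` attribute (US/Eastern time).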

This notebook can only _add_ samples to sample_sets (or create new sample_sets); it cannot remove samples from existing sample_sets. This means that if the `lab_batch` of a sample is changed from `A` to `B`, the sample will be added to sample_set `B` but not removed from sample_set `A`.

In [None]:
import csv
import json
import os
import shutil
from datetime import datetime, timezone

import firecloud.api as fapi
import pytz

# Workspace coordinates are taken from the notebook's environment variables;
# each value is None when its variable is not set.
workspace_namespace = os.getenv('WORKSPACE_NAMESPACE')
workspace_name = os.getenv('WORKSPACE_NAME')


def _raise_unless_ok(response):
    """Raise ``RuntimeError`` with the response body if the API call failed."""
    if not response.ok:
        raise RuntimeError(f'ERROR: {response.text}')


def _find_new_memberships(samples, existing_sample_sets):
    """Return the ``(lab_batch, sample_name)`` pairs not yet recorded in a set.

    Args:
        samples: Entity dicts as returned by the entities endpoint; each has a
            ``name`` and an ``attributes`` mapping.
        existing_sample_sets: Mapping of set name to the collection of member
            names already in that set.

    Returns:
        A list of ``(lab_batch, sample_name)`` tuples in input order. Samples
        without a ``lab_batch`` attribute are skipped.
    """
    new_memberships = []
    for sample in samples:
        attributes = sample['attributes']
        if 'lab_batch' not in attributes:
            continue
        # Set names are strings; normalise so that a lab_batch that was
        # stored as a number still matches the set it was written to earlier.
        lab_batch = str(attributes['lab_batch'])
        sample_name = sample['name']
        if sample_name in existing_sample_sets.get(lab_batch, ()):
            continue
        new_memberships.append((lab_batch, sample_name))
    return new_memberships


def group_samples_into_batches(table_name, available_tables):
    """Add every entity of ``table_name`` to the ``<table_name>_set`` named after its lab_batch.

    Reads the ``workspace_namespace`` and ``workspace_name`` module globals and
    talks to the workspace through ``fapi``. Memberships that already exist
    are left alone; nothing is ever removed from a set.

    Args:
        table_name: Name of the entity table to group (e.g. ``sample``).
        available_tables: Container of the table names currently present in
            the workspace; used to decide whether ``<table_name>_set`` must be
            created first.

    Raises:
        RuntimeError: If any API call returns a non-OK response.
    """
    set_table = f'{table_name}_set'
    work_dir = 'CreateSampleSets_data'
    membership_tsv = os.path.join(work_dir, f'new_{table_name}_set_membership.tsv')
    new_set_tsv = os.path.join(work_dir, f'new_{table_name}_set.tsv')

    existing_sample_sets = {}
    if set_table in available_tables:
        # Download current sample_set table
        print(f'Downloading {table_name}_set table...')
        sample_set_response = fapi.get_entities(workspace_namespace, workspace_name, set_table)
        _raise_unless_ok(sample_set_response)
        for sample_set in json.loads(sample_set_response.text):
            # A set may exist without a membership attribute (e.g. it was
            # created but never populated); treat it as empty.
            members = sample_set['attributes'].get(f'{table_name}s', {}).get('items', [])
            existing_sample_sets[sample_set['name']] = {m['entityName'] for m in members}

    # Read samples from samples table
    print(f'Reading {table_name} table...')
    sample_response = fapi.get_entities(workspace_namespace, workspace_name, table_name)
    _raise_unless_ok(sample_response)
    new_memberships = _find_new_memberships(json.loads(sample_response.text), existing_sample_sets)

    if not new_memberships:
        print(f'No new {table_name}_sets to be added.')
        return

    # Sorted so uploads and progress output are in a reproducible order.
    added_sample_sets = sorted({lab_batch for lab_batch, _ in new_memberships})

    # The upload helper is given file paths, so stage the TSVs in a scratch
    # directory that is removed again even when an upload fails.
    shutil.rmtree(work_dir, ignore_errors=True)
    os.makedirs(work_dir, exist_ok=True)
    try:
        with open(membership_tsv, 'w') as membership_file:
            membership_file.write(f'membership:{table_name}_set_id\t{table_name}\n')
            membership_file.writelines(
                f'{lab_batch}\t{sample_name}\n' for lab_batch, sample_name in new_memberships
            )

        if set_table not in available_tables:
            print(f'Creating new table {table_name}_set')
            # Need to upload a tsv to create the new table
            with open(new_set_tsv, 'w') as new_set_table:
                new_set_table.write(f'entity:{table_name}_set_id\n')
                new_set_table.writelines(f'{lab_batch}\n' for lab_batch in added_sample_sets)
            _raise_unless_ok(
                fapi.upload_entities_tsv(workspace_namespace, workspace_name, new_set_tsv, "flexible")
            )

        # NOTE(review): when the set table already exists, sets for previously
        # unseen lab_batches are not created explicitly; this relies on the
        # membership upload creating them - confirm against the API.
        print(f'Uploading new {table_name}_set table... ')
        _raise_unless_ok(
            fapi.upload_entities_tsv(workspace_namespace, workspace_name, membership_tsv, "flexible")
        )

        # Add date and time created to sample_set
        # NOTE(review): this also overwrites the timestamp of pre-existing
        # sets that merely received new members - confirm that is intended.
        print(f'Adding date and time to newly created {table_name}_sets...')
        now = str(datetime.now(pytz.timezone('US/Eastern')))
        total = len(added_sample_sets)
        for position, lab_batch in enumerate(added_sample_sets, start=1):
            update_response = fapi.update_entity(
                workspace_namespace,
                workspace_name,
                set_table,
                lab_batch,
                [{"op": "AddUpdateAttribute", "attributeName": "time_sample_set_created", "addUpdateAttribute": now}],
            )
            _raise_unless_ok(update_response)
            print(f'    Completed {position}/{total}')

        print('SUCCESS')
        print(f'Printing update {table_name}_set_membership.tsv:')
        # Print from Python rather than shelling out to `cat`, so the listing
        # goes through the same stdout as the other messages.
        with open(membership_tsv) as membership_file:
            print(membership_file.read(), end='')
    finally:
        shutil.rmtree(work_dir, ignore_errors=True)

    
# Discover every entity table in the workspace, then batch each table that
# carries both of the columns the grouping relies on.
print('Finding tables to group by lab_batch')
entity_types_response = fapi.list_entity_types(workspace_namespace, workspace_name)
if not entity_types_response.ok:
    raise RuntimeError(f'ERROR: {entity_types_response.text}')

entity_types_dict = json.loads(entity_types_response.text)
available_tables = entity_types_dict.keys()
for table_name, description in entity_types_dict.items():
    # Skip tables that are missing either required column.
    if not {'is_control_sample', 'lab_batch'}.issubset(description['attributeNames']):
        continue
    group_samples_into_batches(table_name, available_tables)
