## Download `sa-bac` runs from Terra workspace

In [1]:
import numpy as np
import pandas as pd
import os
import subprocess

In [5]:
# root directory under which one sub-directory per sample is created
data_root = '/home/jupyter/mb-ml-data/MightyCodes/sa_bac'

# sample sheet listing the runs to download
sample_sheet_path = '/home/jupyter/mb-ml-data/MightyCodes/sa_bac/sample-03-07-2021.csv'

In [6]:
# read the sample sheet into a DataFrame; the file is tab-separated
# despite its `.csv` extension
sample_pd = pd.read_csv(sample_sheet_path, sep='\t')

In [7]:
# show the column names available in the sample sheet
sample_pd.columns

Index(['entity:sample_id', 'base_yaml_file', 'channel_model',
       'checkpoint_interval_seconds', 'code_length', 'convergence_abs_tol',
       'convergence_countdown', 'eval_split_size', 'experiment_prefix',
       'final_state', 'max_hamming_weight', 'metric_type',
       'mighty_codes_tar_gz', 'min_hamming_weight', 'n_types',
       'quality_factor', 'source_nonuniformity'],
      dtype='object')

In [8]:
def is_already_processed(output_path: str) -> bool:
    """Return whether all expected run artifacts exist in ``output_path``.

    A sample counts as processed only if every one of the expected files is
    present directly under ``output_path``.

    Args:
        output_path: local directory that holds the files of a single sample.

    Returns:
        ``True`` if all expected files exist, ``False`` otherwise (including
        when ``output_path`` itself does not exist).
    """
    expected_file_names = (
        'latest_state.pkl',
        'params.yaml',
        'latest_codebook.pdf',
        'latest_trajectory.pdf',
        'resampling_buffer.pdf',
    )
    # Explicit checks rather than `assert`: assertions are stripped when
    # Python runs with -O, which would make every sample look processed.
    return all(
        os.path.exists(os.path.join(output_path, file_name))
        for file_name in expected_file_names)

# For every run in the sample sheet: copy its final-state archive from the
# bucket with `gsutil`, unpack it into `<data_root>/<sample_id>` and delete
# the archive. Runs whose output files are already present are skipped.
for _, row_data in sample_pd.iterrows():
    sample_id = row_data['entity:sample_id']
    file_state_gs_path = row_data['final_state']

    print(f'Processing {sample_id} ...')
    output_path = os.path.join(data_root, sample_id)

    if is_already_processed(output_path):
        print(f'{sample_id} is already downloaded and decompressed -- skipping.')
        continue

    # download
    print(f'Processing {file_state_gs_path} ...')
    os.makedirs(output_path, exist_ok=True)
    download_result = subprocess.run(
        ['gsutil', 'cp', file_state_gs_path, output_path],
        stderr=subprocess.PIPE,
        shell=False)
    if download_result.returncode != os.EX_OK:
        # surface the captured stderr so that the failure can be diagnosed
        download_stderr = download_result.stderr.decode(errors='replace')
        raise RuntimeError(
            f'Failed to localize {file_state_gs_path} -- please check the remote path '
            f'and/or your Google Cloud SDK configuration.\n{download_stderr}')

    # check if download was successful; the archive is expected to be
    # named `final_state.tar.gz`
    local_final_state_tar_gz_path = os.path.join(output_path, 'final_state.tar.gz')
    if not os.path.exists(local_final_state_tar_gz_path):
        raise RuntimeError(
            f'Failed to localize {file_state_gs_path} -- please check the remote path '
            f'and/or your Google Cloud SDK configuration.')

    # decompress; `--strip=1` drops the leading path component of every
    # archive member so that the files land directly in `output_path`
    print(f'Decompressing {local_final_state_tar_gz_path} ...')
    extract_result = subprocess.run(
        ['tar', 'xvzf', local_final_state_tar_gz_path, '--strip=1', '-C', output_path],
        stderr=subprocess.PIPE,
        shell=False)
    if extract_result.returncode != os.EX_OK:
        extract_stderr = extract_result.stderr.decode(errors='replace')
        raise RuntimeError(
            f'Failed to decompress {local_final_state_tar_gz_path}!\n{extract_stderr}')

    # the archive is no longer needed once its contents are extracted
    os.remove(local_final_state_tar_gz_path)
    print(f'Downloading and decompressing {sample_id} complete!')

Processing channel_bac_merfish__10__1__9__128__1000__20__fdr ...
channel_bac_merfish__10__1__9__128__1000__20__fdr is already downloaded and decompressed -- skipping.
Processing channel_bac_merfish__10__1__9__128__100__20__fdr ...
channel_bac_merfish__10__1__9__128__100__20__fdr is already downloaded and decompressed -- skipping.
Processing channel_bac_merfish__10__1__9__128__10__20__fdr ...
channel_bac_merfish__10__1__9__128__10__20__fdr is already downloaded and decompressed -- skipping.
Processing channel_bac_merfish__10__1__9__16__1000__20__fdr ...
channel_bac_merfish__10__1__9__16__1000__20__fdr is already downloaded and decompressed -- skipping.
Processing channel_bac_merfish__10__1__9__16__100__20__fdr ...
channel_bac_merfish__10__1__9__16__100__20__fdr is already downloaded and decompressed -- skipping.
Processing channel_bac_merfish__10__1__9__16__10__20__fdr ...
channel_bac_merfish__10__1__9__16__10__20__fdr is already downloaded and decompressed -- skipping.
Processing chann