In [14]:
import re, subprocess, boto3, json, shlex, mysql, os, urllib, logging
import pandas as pd
import numpy as np
from s3path import S3Path
from pathlib import Path
from tqdm.notebook import tqdm
from packaging import version
pd.set_option("display.max_colwidth", 40)

In [15]:
from utils.utils import *

In [16]:
import glob

# Functions

## List FASTQs

In [17]:
# FASTQ reads/indices required for each workflow
fastq_map = {
    'MitoTracing': ['R1', 'R2'],
}

# Get fastq file paths on S3 for each file id
# Returns dictionary from id to s3 path
# Throws exception if FASTQs don't exist for any id
def get_fastqs(
    path: str, # path to directory containing FASTQ files
    fastq_file_ids: list, # FASTQ file ids needed for this run type (e.g. I1, R1, R2, etc.)

):
    fastq_map = dict()
    _, bucket, key, _, _ = urllib.parse.urlsplit(path)
    for fid in fastq_file_ids:
        files = get_s3_objects(
            bucket, key.lstrip("/"),
            re.compile(f"_{fid}_\d{{3}}.fastq.gz$")
        )
        try:
            assert files, f"AssertionError: Missing `{fid}` archives!"
            fastq_map[fid] = [os.path.join("s3://", bucket, str(f)) for f in files]
        except AssertionError as err:
            logging.warning("%s\n\t %s", err, path)
            return
    return fastq_map

In [18]:
def get_barcode_genomic_fastqs(samples):
    
    samples['fastq'] = np.empty((len(samples), 0)).tolist()
    fastqs = []
    for sample, row in samples.iterrows():
        S3_path = row['S3_path']
        
        # List FASTQ files
        fastq_file_ids = fastq_map['MitoTracing']        
        path_fastq = f"{S3_path}/FASTQ/"
        fastq = get_fastqs(path_fastq, fastq_file_ids)
        
        fastqs.append(fastq)
        
    samples['fastq'] = fastqs
    return samples

In [19]:
# Extract FASTQ sample name from list of files
# Note: FASTQ name is file name up to lane id (e.g. L001, L002, etc.)
def get_fastqs_name(fastqs):
    fastq_name_re = r".*/(.*)_S\d+_L\d{3}_[A-Za-z]\d_\d{3}.fastq.gz$"
    fastq_names = [re.match(fastq_name_re, x)[1] for x in fastqs]
    assert len(set(fastq_names)) == 1 # make sure all names are same
    return fastq_names[0]

## DB queries

In [20]:
# Common query col: id, request_id, Sample
def get_sample_name(query, query_col, creds):
    
    user = creds['user']
    password = creds['password']
    
    try:
        table_sample_data = "peer_lab_db.sample_data"
        query = f"""
        SELECT {table_sample_data}.Sample
        FROM {table_sample_data}
        WHERE {table_sample_data}.{query_col}="{query}"
        """
        sample_names = []
        results = execute_query(query, user, password)
        for result in results:
            sample_names.append(result[0])
        return sample_names
    except Error as e:
        print(f"Error: {e}")
    
    
def get_aws_path(query, query_col, creds):
    
    user = creds['user']
    password = creds['password']
    
    try:
        table_sample_data = "peer_lab_db.sample_data"
        query = f"""
        SELECT {table_sample_data}.AWS_storage
        FROM {table_sample_data}
        WHERE {table_sample_data}.{query_col}="{query}"
        """
        sample_paths = []
        results = execute_query(query, user, password)
        for result in results:
            sample_paths.append(result[0])
        return sample_paths
    except Error as e:
        print(f"Error: {e}")
        
            
def get_sample_id(query, query_col, creds):
    
    user = creds['user']
    password = creds['password']
    
    try:
        table_sample_data = "peer_lab_db.sample_data"
        query = f"""
        SELECT {table_sample_data}.id
        FROM {table_sample_data}
        WHERE {table_sample_data}.{query_col}="{query}"
        """
        sample_ids = []
        results = execute_query(query, user, password)
        for result in results:
            sample_ids.append(result[0])
        return sample_ids
    except Error as e:
        print(f"Error: {e}")
        
def format_sample_aws(querys, query_col, creds):
    sample_names = []
    sample_paths = []
    sample_ids = []
    
    for query in querys:
        sample_names += get_sample_name(query, query_col, creds)
        sample_paths += get_aws_path(query, query_col, creds)
        sample_ids += get_sample_id(query, query_col, creds)
        
    sample_paths = [s.strip('/') for s in sample_paths] # remove trailing slash if exists
    
    samples = pd.DataFrame(
        dict(S3_path=sample_paths, Sample_ID=sample_ids),
        index=sample_names,
        dtype=str,
    )
    return samples

In [21]:
def get_genomeIndex_id(query, query_col, creds):
    user = creds['user']
    password = creds['password']
    
    try:
        table_sample_data = "peer_lab_db.sample_data"
        query = f"""
        SELECT {table_sample_data}.genomeIndex_id
        FROM {table_sample_data}
        WHERE {table_sample_data}.{query_col}="{query}"
        """
        
        results = execute_query(query, user, password)
        for result in results:
            genomeIndex_id = result[0]
            
        return genomeIndex_id
            
    except Error as e:
        print(f"Error: {e}")
        
def get_scTech_id(query, query_col, creds):
    user = creds['user']
    password = creds['password']
    
    try:
        table_sample_data = "peer_lab_db.genome_index"
        query = f"""
        SELECT {table_sample_data}.scTech_id
        FROM {table_sample_data}
        WHERE {table_sample_data}.{query_col}="{query}"
        """
        
        results = execute_query(query, user, password)
        for result in results:
            scTech_id = result[0]
            
        return scTech_id
            
    except Error as e:
        print(f"Error: {e}")
        
def get_index(query, query_col, creds):
    user = creds['user']
    password = creds['password']
    
    try:
        table_sample_data = "peer_lab_db.genome_index"
        query = f"""
        SELECT {table_sample_data}.gIndex
        FROM {table_sample_data}
        WHERE {table_sample_data}.{query_col}="{query}"
        """
        
        results = execute_query(query, user, password)
        for result in results:
            index = result[0]
            
        return index
            
    except Error as e:
        print(f"Error: {e}")
        
def get_assay(query, query_col, creds):
    user = creds['user']
    password = creds['password']
    
    try:
        table_sample_data = "peer_lab_db.sc_tech"
        query = f"""
        SELECT {table_sample_data}.Run_name
        FROM {table_sample_data}
        WHERE {table_sample_data}.{query_col}="{query}"
        """
        
        results = execute_query(query, user, password)
        for result in results:
            assay = result[0]
            
        return assay
            
    except Error as e:
        print(f"Error: {e}")
        
def get_barcode(query, query_col, creds):
    user = creds['user']
    password = creds['password']
    
    try:
        table_sample_data = "peer_lab_db.sc_tech"
        query = f"""
        SELECT {table_sample_data}.barcodes
        FROM {table_sample_data}
        WHERE {table_sample_data}.{query_col}="{query}"
        """
        
        results = execute_query(query, user, password)
        for result in results:
            barcode = result[0]
            
        return barcode
            
    except Error as e:
        print(f"Error: {e}")
        
        
def format_assay_barcode(samples, creds):
    
    # samples['genomeIndex_id'] = np.nan
    samples['index'] = np.nan
    # samples['scTech_id'] = np.nan
    samples['assay'] = np.nan
    samples['barcode'] = np.nan
    
    for sample, row in samples.iterrows():
        sample_id = row['Sample_ID']
        
        genomeIndex_id  = get_genomeIndex_id(sample_id, 'id', creds)
        index = get_index(genomeIndex_id, 'id', creds)
        scTech_id = get_scTech_id(genomeIndex_id, 'id', creds)
        assay = get_assay(scTech_id, 'id', creds)
        barcode = get_barcode(scTech_id, 'id', creds)
        
        # samples.loc[sample, 'genomeIndex_id'] = genomeIndex_id
        samples.loc[sample, 'index'] = index
        # samples.loc[sample, 'scTech_id'] = scTech_id
        samples.loc[sample, 'assay'] = assay
        samples.loc[sample, 'barcode'] = barcode

    return samples

In [22]:
def get_project_id(sample_id, creds):
    
    user = creds['user']
    password = creds['password']
    
    try:
        table_sample_data = "peer_lab_db.sample_data"
        table_project_data = "peer_lab_db.project_data"
        query = f"""
        SELECT {table_project_data}.projectName
        FROM {table_project_data}
        LEFT JOIN {table_sample_data}
        ON {table_project_data}.id = {table_sample_data}.projectData_id
        WHERE {table_sample_data}.id = {sample_id}
        """
        result = execute_query(query, user, password)[0][0]
        return result
    except Error as e:
        print(f"Error: {e}")

# Process Samples

## Setup

In [23]:
# Maps from .wdl name (prefix) to results dirname
results_dirs = {
    "MitoTracing": "mito-tracing-outs",
}

# Maps from .wdl name (prefix) to shell script
sh_files = {
    "MitoTracing": "submit-processing.sh",
}

In [24]:
# Location of docker files
common_docker_registry = "quay.io/hisplan"

prefix = "MitoTracing" # Workflow to run; also .wdl filename prefix
pipeline_type = prefix # field in *.labels.json
output_dirname = results_dirs[prefix]
template_prefix = 'template'

# If need to add comment, put here
comment = "sohailn"

In [25]:
# Locations of workflow-related directories and files
path_to_cromwell_secrets = f"{Path.home()}/.cromwell/cromwell-secrets.json" # CHANGE THIS
workflow_dir = glob.glob(f"{Path.home()}/scing/bin/*mito*")[0]
path_to_exec = sh_files[prefix]
config_dir = f"{workflow_dir}/configs"
path_to_options = glob.glob(f"{workflow_dir}/*.options.aws.json")[0]

# Other file locations
db_credentials_path = f"{Path.home()}/.config.json" # CHANGE THIS

In [26]:
# Set credentials based on SCRIdb CLI config file
with open(db_credentials_path) as f:
    creds = json.load(f)

In [27]:
%env AWS_ACCESS_KEY_ID=ASIAYPY66CWMPLW5T6NP
%env AWS_SECRET_ACCESS_KEY=1SX9bRl+eDSWhPQZoE3veOnjWMS1q4pOi+H2h4CF
%env AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEKf//////////wEaCXVzLWVhc3QtMSJGMEQCIG8pw61xL0cjlnxgAmlhtJuGvjKrly9zppW9jmxWKct1AiBSCMeORMMiw34vMpf2Dy7lfNTer4K0ZYHjiIDezosCPSr4AQjw//////////8BEAMaDDU4MzY0MzU2NzUxMiIMcwOotqu75F46WP4sKswBtpwzuDdeano9gSn3VxI01yB4h7fxYVnQQiREf7FBGcmfYrMJ7B9DnL62fHTzdFdNRNsW9BETcVnuQrW5rIEtUJ50XQbsoWsRDBdF9DSXJL26qMr0l7ZOs4IkMn/MYfuGbN9OwRolOvPAh8ewB1N5NqOR+8QO29kRrshV11UBx5CfZwAL00os+sPJgweB+pWsniVheoPRaJLonnsTisSfQYRzoidAVL4ZkZv48eFzaodcAGD5M8VbZQpZ3njSQrmqKQ0jYP7fH0PytXxOMK3FipcGOpkB+HPz9ENB0ACPcyAAz9L3z3uWF/IqP8Q+UbU8Mov3FXhSQuTLEzdZQUxMtBiTT8MAFxyQKabxCrUKpctEWTFL6TeXIoexFp8BmqetPNFP0xwCNGEN/E6KaXs9PKQi5hak1EFX2TVvg3w8DJhsy9ApMktj/wNiU8+ZQvGXX2xIW/pakOBI4H9qoOwC9IIXofUbSFpino3BKg6w
!aws s3 ls

env: AWS_ACCESS_KEY_ID=ASIAYPY66CWMPLW5T6NP
env: AWS_SECRET_ACCESS_KEY=1SX9bRl+eDSWhPQZoE3veOnjWMS1q4pOi+H2h4CF
env: AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEKf//////////wEaCXVzLWVhc3QtMSJGMEQCIG8pw61xL0cjlnxgAmlhtJuGvjKrly9zppW9jmxWKct1AiBSCMeORMMiw34vMpf2Dy7lfNTer4K0ZYHjiIDezosCPSr4AQjw//////////8BEAMaDDU4MzY0MzU2NzUxMiIMcwOotqu75F46WP4sKswBtpwzuDdeano9gSn3VxI01yB4h7fxYVnQQiREf7FBGcmfYrMJ7B9DnL62fHTzdFdNRNsW9BETcVnuQrW5rIEtUJ50XQbsoWsRDBdF9DSXJL26qMr0l7ZOs4IkMn/MYfuGbN9OwRolOvPAh8ewB1N5NqOR+8QO29kRrshV11UBx5CfZwAL00os+sPJgweB+pWsniVheoPRaJLonnsTisSfQYRzoidAVL4ZkZv48eFzaodcAGD5M8VbZQpZ3njSQrmqKQ0jYP7fH0PytXxOMK3FipcGOpkB+HPz9ENB0ACPcyAAz9L3z3uWF/IqP8Q+UbU8Mov3FXhSQuTLEzdZQUxMtBiTT8MAFxyQKabxCrUKpctEWTFL6TeXIoexFp8BmqetPNFP0xwCNGEN/E6KaXs9PKQi5hak1EFX2TVvg3w8DJhsy9ApMktj/wNiU8+ZQvGXX2xIW/pakOBI4H9qoOwC9IIXofUbSFpino3BKg6w
2021-10-07 15:31:32 agc-583643567512-us-east-1
2021-10-07 15:28:07 cdktoolkit-stagingbucket-d49u1xfb0sc4
2019-10-09 13:04:06 cf-templates-umiwbnq3566w-us-east-1
2019-10-10 

# Excution

## Sample information

In [28]:
# Common query col: id, request_id, Sample
sample_id = list(range(3924, 3934))

samples = format_sample_aws(sample_id, 'id', creds)
samples

Unnamed: 0,S3_path,Sample_ID
AV-1759_Ru1083_MITO,s3://dp-lab-data/SCRI_Projects/HTA/M...,3924
AV-1760_MSK_LX_1083c_T_2_MITO,s3://dp-lab-data/SCRI_Projects/HTA/M...,3925
AV-1761_POSIE_101920_T_1_MITO,s3://dp-lab-data/SCRI_Projects/HTA/M...,3926
AV-1762_Ru1083d_MITO,s3://dp-lab-data/SCRI_Projects/HTA/M...,3927
AV-1763_Ru1250C_T_1_MITO,s3://dp-lab-data/SCRI_Projects/HTA/M...,3928
AV-1764_MSK_LX_1250b_PM_1_MITO,s3://dp-lab-data/SCRI_Projects/HTA/M...,3929
AV-1764_Ru1250D_T_1_MITO,s3://dp-lab-data/SCRI_Projects/HTA/M...,3930
AV-1765_Ru1250e_MITO,s3://dp-lab-data/SCRI_Projects/HTA/M...,3931
AV-1766_MSK_LX_1250f_MITO,s3://dp-lab-data/SCRI_Projects/HTA/M...,3932
AV-1760_Ru263_MITO,s3://dp-lab-data/SCRI_Projects/HTA/M...,3933


In [16]:
# # Modification for Joe's samples
# samples.loc['Ru581b_T1_MITO', 'S3_path'] = 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581b_T1_MITO'
# samples.loc['Ru581c-LN1_MITO', 'S3_path'] = 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581c-LN1_MITO'

# samples

In [29]:
# Re-running some of the samples
# samples = samples.loc[samples.index.isin(['AV-1760_Ru263_MITO', 'AV-1762_Ru1083d_MITO'])]

# barcodes_in_RNA.non-epithelial.RU1083_ST.txt
# samples = samples.loc[samples.index.isin(['AV-1762_Ru1083d_MITO'])]


# barcodes_in_RNA.epithelial.RU1250_ASC1.txt
samples = samples.loc[samples.index.isin(['AV-1765_Ru1250e_MITO'])]
samples

Unnamed: 0,S3_path,Sample_ID
AV-1765_Ru1250e_MITO,s3://dp-lab-data/SCRI_Projects/HTA/M...,3931


In [30]:
samples = format_assay_barcode(samples, creds)
samples

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples['index'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples['assay'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples['barcode'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

S

Unnamed: 0,S3_path,Sample_ID,index,assay,barcode
AV-1765_Ru1250e_MITO,s3://dp-lab-data/SCRI_Projects/HTA/M...,3931,GRCh38-3.0.0,CR,


In [31]:
samples = get_barcode_genomic_fastqs(samples)
samples

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples['fastq'] = np.empty((len(samples), 0)).tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples['fastq'] = fastqs


Unnamed: 0,S3_path,Sample_ID,index,assay,barcode,fastq
AV-1765_Ru1250e_MITO,s3://dp-lab-data/SCRI_Projects/HTA/M...,3931,GRCh38-3.0.0,CR,,{'R1': ['s3://dp-lab-data/SCRI_Proje...


In [20]:
# samples = pd.DataFrame(samples.loc['Ru581b_T1_MITO']).T
# samples

## Make input file

In [32]:
# Load minimum inputs and labels fields from input template
with open(f"{config_dir}/{template_prefix}.inputs.json") as f:
    std_inputs_fields = list(json.load(f).keys())
    
# Annotate all samples with workflow inputs and labels
inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)
inputs

Unnamed: 0,MitoTracing.sampleName,MitoTracing.fastqName,MitoTracing.fastqR1,MitoTracing.fastqR2,MitoTracing.reference,MitoTracing.includeIntrons,MitoTracing.expectCells,MitoTracing.numCores,MitoTracing.memory,MitoTracing.whitelist,MitoTracing.dockerRegistry
AV-1765_Ru1250e_MITO,,,,,,,,,,,


In [33]:
# Default inputs
inputs[f"{prefix}.includeIntrons"] = False
inputs[f"{prefix}.expectCells"] = 5000
inputs[f"{prefix}.numCores"] = 16
# inputs[f"{prefix}.memory"] = 128
inputs[f"{prefix}.memory"] = 256 

inputs[f"{prefix}.dockerRegistry"] = common_docker_registry

inputs

Unnamed: 0,MitoTracing.sampleName,MitoTracing.fastqName,MitoTracing.fastqR1,MitoTracing.fastqR2,MitoTracing.reference,MitoTracing.includeIntrons,MitoTracing.expectCells,MitoTracing.numCores,MitoTracing.memory,MitoTracing.whitelist,MitoTracing.dockerRegistry
AV-1765_Ru1250e_MITO,,,,,,False,5000,16,256,,quay.io/hisplan


In [34]:
# Sample information
inputs[f"{prefix}.sampleName"] = samples.index.tolist()

inputs[f"{prefix}.fastqR1"] = samples['fastq'].str['R1']
inputs[f"{prefix}.fastqR2"] = samples['fastq'].str['R2']

for sample in inputs.index:
    fastqR1_name = get_fastqs_name(inputs.loc[sample, 'MitoTracing.fastqR1'])
    fastqR2_name = get_fastqs_name(inputs.loc[sample, 'MitoTracing.fastqR2'])
    
    assert(fastqR1_name == fastqR1_name)
    inputs.loc[sample, f"{prefix}.fastqName"] = fastqR1_name

inputs

Unnamed: 0,MitoTracing.sampleName,MitoTracing.fastqName,MitoTracing.fastqR1,MitoTracing.fastqR2,MitoTracing.reference,MitoTracing.includeIntrons,MitoTracing.expectCells,MitoTracing.numCores,MitoTracing.memory,MitoTracing.whitelist,MitoTracing.dockerRegistry
AV-1765_Ru1250e_MITO,AV-1765_Ru1250e_MITO,3931_AV-1765_Ru1250e_MITO_IGO_13388_9,[s3://dp-lab-data/SCRI_Projects/HTA/...,[s3://dp-lab-data/SCRI_Projects/HTA/...,,False,5000,16,256,,quay.io/hisplan


In [35]:
# Reference package created by Jaeyoung

cellRangerRefPkg="s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/reference-packages/refdata-gex-mito-GRCh38-ensemble98.tar.gz"
mitoFastaRefPkg="s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/reference-packages/mito-fasta-GRCh38.tar.gz"

reference = [dict(cellRangerRefPkg=cellRangerRefPkg,
                  mitoFastaRefPkg=mitoFastaRefPkg)] * len(inputs)
inputs[f"{prefix}.reference"] = reference

inputs

Unnamed: 0,MitoTracing.sampleName,MitoTracing.fastqName,MitoTracing.fastqR1,MitoTracing.fastqR2,MitoTracing.reference,MitoTracing.includeIntrons,MitoTracing.expectCells,MitoTracing.numCores,MitoTracing.memory,MitoTracing.whitelist,MitoTracing.dockerRegistry
AV-1765_Ru1250e_MITO,AV-1765_Ru1250e_MITO,3931_AV-1765_Ru1250e_MITO_IGO_13388_9,[s3://dp-lab-data/SCRI_Projects/HTA/...,[s3://dp-lab-data/SCRI_Projects/HTA/...,{'cellRangerRefPkg': 's3://dp-lab-da...,False,5000,16,256,,quay.io/hisplan


## Check whitelist

In [38]:
epithelial = True
for sample in inputs.index:
    for w in whitelist[sample]:
        
        if epithelial:
            if '.epithelial.' in w:
                sample_whitelist = w
        else:
            if '.non-epithelial.' in w:
                sample_whitelist = w
          
    inputs.loc[sample, f"{prefix}.whitelist"] = sample_whitelist

In [39]:
inputs

Unnamed: 0,MitoTracing.sampleName,MitoTracing.fastqName,MitoTracing.fastqR1,MitoTracing.fastqR2,MitoTracing.reference,MitoTracing.includeIntrons,MitoTracing.expectCells,MitoTracing.numCores,MitoTracing.memory,MitoTracing.whitelist,MitoTracing.dockerRegistry
AV-1765_Ru1250e_MITO,AV-1765_Ru1250e_MITO,3931_AV-1765_Ru1250e_MITO_IGO_13388_9,[s3://dp-lab-data/SCRI_Projects/HTA/...,[s3://dp-lab-data/SCRI_Projects/HTA/...,{'cellRangerRefPkg': 's3://dp-lab-da...,False,5000,16,256,s3://dp-lab-data/SCRI_Projects/HTA/M...,quay.io/hisplan


In [38]:
# non_epithelial = ['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581b_T1_MITO/barcodes_in_RNA.non-epithelial.RU581_Ta.txt',
#                   's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581c-LN1_MITO/barcodes_in_RNA.non-epithelial.RU581_LNa.txt',
#                   's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581D_MITO/barcodes_in_RNA.non-epithelial.RU581_LIV.txt']

# epithelial = ['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581b_T1_MITO/barcodes_in_RNA.RU581_Ta.txt',
#               's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581c-LN1_MITO/barcodes_in_RNA.RU581_LNa.txt',
#               's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581D_MITO/barcodes_in_RNA.RU581_LIV.txt']

In [39]:
# # Whitelist needs to be in SEQC barcodes (nucleotide) format rather than CR format
# # Path to download the whitelist file
# path_whitelist = '/Users/sohailn/scing/bin/wdl-mito-tracing/whitelist/'

# # for path_s3 in epithelial:
# for path_s3 in non_epithelial:
#     path_s3_dirname = os.path.dirname(path_s3)
#     sample_name = os.path.basename(path_s3_dirname)

#     filename_in = os.path.basename(path_s3)
#     filename_out = filename_in.replace('.txt', '.seqc.txt')

#     # Download barcode file
#     cmd = f'aws s3 cp {path_s3} {path_whitelist}{filename_in}'
#     os.system(cmd)

#     barcodes = pd.read_csv(f'{path_whitelist}{filename_in}', header=None)[0].tolist()

#     from seqc.sequence.encodings import DNA3Bit
#     dna3bit = DNA3Bit()
#     barcodes = [dna3bit.decode(x) for x in barcodes]
    
#     with open(f'{path_whitelist}{filename_out}', "w") as output:
#         for barcode in barcodes:
#             output.write(f'{barcode.decode("utf-8")}-1\n')

#     cmd = f'aws s3 cp {path_whitelist}{filename_out} {path_s3_dirname}/{filename_out}'
#     inputs.loc[sample_name, f"{prefix}.whitelist"] = f'{path_s3_dirname}/{filename_out}'
    
#     os.system(cmd)
#     print()

In [40]:
# inputs.loc['Ru581b_T1_MITO', f"{prefix}.whitelist"] = 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581b_T1_MITO/barcodes_in_RNA.RU581_Ta.seqc.txt'

In [40]:
inputs[f"{prefix}.whitelist"].values

array(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1765_Ru1250e_MITO/barcodes_in_RNA.epithelial.RU1250_ASC1.txt'],
      dtype=object)

In [41]:
if epithelial:
    inputs[f"{prefix}.sampleName"] = 'epi_' + inputs[f"{prefix}.sampleName"] 
else:
    inputs[f"{prefix}.sampleName"] = 'non_epi_' + inputs[f"{prefix}.sampleName"]
inputs[f"{prefix}.sampleName"].values

array(['epi_AV-1765_Ru1250e_MITO'], dtype=object)

## Make label file

In [42]:
# Load minimum inputs and labels fields from labels template
with open(f"{config_dir}/{template_prefix}.labels.json") as f:
    std_labels_fields = list(json.load(f).keys())
    
# Annotate all samples with workflow labels
labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)
labels

Unnamed: 0,pipelineType,project,sample,owner,destination,transfer,comment
AV-1765_Ru1250e_MITO,,,,,,,


In [43]:
# Annotate labels
labels["pipelineType"] = pipeline_type
labels["project"] = samples["Sample_ID"].apply(lambda x: get_project_id(x, creds))
labels["sample"] = labels.index
labels["owner"] = creds["user"]
# labels["destination"] = samples['S3_path'] + "/" + output_dirname

if epithelial:
    labels["destination"] = samples['S3_path'] + "/" + 'epi_' + output_dirname
else:
    labels["destination"] = samples['S3_path'] + "/" + 'non_epi_' + output_dirname

labels["transfer"] = "-"
labels["comment"] = creds["user"]

labels

Unnamed: 0,pipelineType,project,sample,owner,destination,transfer,comment
AV-1765_Ru1250e_MITO,MitoTracing,Lung Tumor Atlas,AV-1765_Ru1250e_MITO,sohailn,s3://dp-lab-data/SCRI_Projects/HTA/M...,-,sohailn


In [44]:
# labels.loc['Ru581D_MITO', 'destination'] = 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581D_MITO/non_epi_mito-tracing-outs'
# labels.loc['Ru581D_MITO', 'destination'] = 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/barcodes_rna/Ru581D_MITO/epi_mito-tracing-outs'

labels['destination'].values

array(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1765_Ru1250e_MITO/epi_mito-tracing-outs'],
      dtype=object)

# Submit job

In [95]:
inputs_all = inputs.copy()
labels_all = labels.copy()

In [96]:
sample_names = inputs.index.tolist()
sample_names.sort()
sample_names

['AV-1759_Ru1083_MITO',
 'AV-1760_MSK_LX_1083c_T_2_MITO',
 'AV-1760_Ru263_MITO',
 'AV-1761_POSIE_101920_T_1_MITO',
 'AV-1762_Ru1083d_MITO',
 'AV-1763_Ru1250C_T_1_MITO',
 'AV-1764_MSK_LX_1250b_PM_1_MITO',
 'AV-1764_Ru1250D_T_1_MITO',
 'AV-1765_Ru1250e_MITO',
 'AV-1766_MSK_LX_1250f_MITO']

In [97]:
inputs = pd.DataFrame(inputs_all.loc[sample_names[0:]])
inputs

Unnamed: 0,MitoTracing.sampleName,MitoTracing.fastqName,MitoTracing.fastqR1,MitoTracing.fastqR2,MitoTracing.reference,MitoTracing.includeIntrons,MitoTracing.expectCells,MitoTracing.numCores,MitoTracing.memory,MitoTracing.whitelist,MitoTracing.dockerRegistry
AV-1759_Ru1083_MITO,non_epi_AV-1759_Ru1083_MITO,3924_AV-1759_Ru1083_MITO_IGO_13388_1,[s3://dp-lab-data/SCRI_Projects/HTA/...,[s3://dp-lab-data/SCRI_Projects/HTA/...,{'cellRangerRefPkg': 's3://dp-lab-da...,False,5000,16,128,s3://dp-lab-data/SCRI_Projects/HTA/M...,quay.io/hisplan
AV-1760_MSK_LX_1083c_T_2_MITO,non_epi_AV-1760_MSK_LX_1083c_T_2_MITO,3925_AV-1760_MSK_LX_1083c_T_2_MITO_I...,[s3://dp-lab-data/SCRI_Projects/HTA/...,[s3://dp-lab-data/SCRI_Projects/HTA/...,{'cellRangerRefPkg': 's3://dp-lab-da...,False,5000,16,128,s3://dp-lab-data/SCRI_Projects/HTA/M...,quay.io/hisplan
AV-1760_Ru263_MITO,non_epi_AV-1760_Ru263_MITO,3933_AV-1760_Ru263_MITO_IGO_13388_7,[s3://dp-lab-data/SCRI_Projects/HTA/...,[s3://dp-lab-data/SCRI_Projects/HTA/...,{'cellRangerRefPkg': 's3://dp-lab-da...,False,5000,16,128,s3://dp-lab-data/SCRI_Projects/HTA/M...,quay.io/hisplan
AV-1761_POSIE_101920_T_1_MITO,non_epi_AV-1761_POSIE_101920_T_1_MITO,3926_AV-1761_POSIE_101920_T_1_MITO_I...,[s3://dp-lab-data/SCRI_Projects/HTA/...,[s3://dp-lab-data/SCRI_Projects/HTA/...,{'cellRangerRefPkg': 's3://dp-lab-da...,False,5000,16,128,s3://dp-lab-data/SCRI_Projects/HTA/M...,quay.io/hisplan
AV-1762_Ru1083d_MITO,non_epi_AV-1762_Ru1083d_MITO,3927_AV-1762_Ru1083d_MITO_IGO_13388_4,[s3://dp-lab-data/SCRI_Projects/HTA/...,[s3://dp-lab-data/SCRI_Projects/HTA/...,{'cellRangerRefPkg': 's3://dp-lab-da...,False,5000,16,128,s3://dp-lab-data/SCRI_Projects/HTA/M...,quay.io/hisplan
AV-1763_Ru1250C_T_1_MITO,non_epi_AV-1763_Ru1250C_T_1_MITO,3928_AV-1763_Ru1250C_T_1_MITO_IGO_13...,[s3://dp-lab-data/SCRI_Projects/HTA/...,[s3://dp-lab-data/SCRI_Projects/HTA/...,{'cellRangerRefPkg': 's3://dp-lab-da...,False,5000,16,128,s3://dp-lab-data/SCRI_Projects/HTA/M...,quay.io/hisplan
AV-1764_MSK_LX_1250b_PM_1_MITO,non_epi_AV-1764_MSK_LX_1250b_PM_1_MITO,3929_AV-1764_MSK_LX_1250b_PM_1_MITO_...,[s3://dp-lab-data/SCRI_Projects/HTA/...,[s3://dp-lab-data/SCRI_Projects/HTA/...,{'cellRangerRefPkg': 's3://dp-lab-da...,False,5000,16,128,s3://dp-lab-data/SCRI_Projects/HTA/M...,quay.io/hisplan
AV-1764_Ru1250D_T_1_MITO,non_epi_AV-1764_Ru1250D_T_1_MITO,3930_AV-1764_Ru1250D_T_1_MITO_IGO_13...,[s3://dp-lab-data/SCRI_Projects/HTA/...,[s3://dp-lab-data/SCRI_Projects/HTA/...,{'cellRangerRefPkg': 's3://dp-lab-da...,False,5000,16,128,s3://dp-lab-data/SCRI_Projects/HTA/M...,quay.io/hisplan
AV-1765_Ru1250e_MITO,non_epi_AV-1765_Ru1250e_MITO,3931_AV-1765_Ru1250e_MITO_IGO_13388_9,[s3://dp-lab-data/SCRI_Projects/HTA/...,[s3://dp-lab-data/SCRI_Projects/HTA/...,{'cellRangerRefPkg': 's3://dp-lab-da...,False,5000,16,128,s3://dp-lab-data/SCRI_Projects/HTA/M...,quay.io/hisplan
AV-1766_MSK_LX_1250f_MITO,non_epi_AV-1766_MSK_LX_1250f_MITO,3932_AV-1766_MSK_LX_1250f_MITO_IGO_1...,[s3://dp-lab-data/SCRI_Projects/HTA/...,[s3://dp-lab-data/SCRI_Projects/HTA/...,{'cellRangerRefPkg': 's3://dp-lab-da...,False,5000,16,128,s3://dp-lab-data/SCRI_Projects/HTA/M...,quay.io/hisplan


In [98]:
inputs[f'{prefix}.fastqR1'].values

array([list(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1759_Ru1083_MITO/FASTQ/3924_AV-1759_Ru1083_MITO_IGO_13388_1_S1_L001_R1_001.fastq.gz', 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1759_Ru1083_MITO/FASTQ/3924_AV-1759_Ru1083_MITO_IGO_13388_1_S1_L002_R1_001.fastq.gz']),
       list(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_MSK_LX_1083c_T_2_MITO/FASTQ/3925_AV-1760_MSK_LX_1083c_T_2_MITO_IGO_13388_2_S2_L001_R1_001.fastq.gz', 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_MSK_LX_1083c_T_2_MITO/FASTQ/3925_AV-1760_MSK_LX_1083c_T_2_MITO_IGO_13388_2_S2_L002_R1_001.fastq.gz']),
       list(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO/FASTQ/3933_AV-1760_Ru263_MITO_IGO_13388_7_S6_L001_R1_001.fastq.gz', 's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO/FASTQ/3933_AV-1760_Ru263_MITO_IGO_13388_7_S6_L002_R1_001.fastq.gz']),
       list(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1761_POSIE_101920_T_1_MITO

In [99]:
inputs[f'{prefix}.whitelist'].values

array(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1759_Ru1083_MITO/barcodes_in_RNA.non-epithelial.RU1083_LIV.txt',
       's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_MSK_LX_1083c_T_2_MITO/barcodes_in_RNA.non-epithelial.RU1083_T2.txt',
       's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_Ru263_MITO/barcodes_in_RNA.non-epithelial.RU263_PDX.txt',
       's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1761_POSIE_101920_T_1_MITO/barcodes_in_RNA.non-epithelial.RU1083_T1.txt',
       's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1762_Ru1083d_MITO/barcodes_in_RNA.non-epithelial.RU1083_ST.txt',
       's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1763_Ru1250C_T_1_MITO/barcodes_in_RNA.non-epithelial.RU1250_T1.txt',
       's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1764_MSK_LX_1250b_PM_1_MITO/barcodes_in_RNA.non-epithelial.RU1250_PL.txt',
       's3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1764_Ru1250D_T_1_MITO/barcodes_in_RNA.non-e

In [33]:
labels = pd.DataFrame(labels_all.loc[sample_names[0:]])
labels

NameError: name 'labels_all' is not defined

In [45]:
inputs

Unnamed: 0,MitoTracing.sampleName,MitoTracing.fastqName,MitoTracing.fastqR1,MitoTracing.fastqR2,MitoTracing.reference,MitoTracing.includeIntrons,MitoTracing.expectCells,MitoTracing.numCores,MitoTracing.memory,MitoTracing.whitelist,MitoTracing.dockerRegistry
AV-1765_Ru1250e_MITO,epi_AV-1765_Ru1250e_MITO,3931_AV-1765_Ru1250e_MITO_IGO_13388_9,[s3://dp-lab-data/SCRI_Projects/HTA/...,[s3://dp-lab-data/SCRI_Projects/HTA/...,{'cellRangerRefPkg': 's3://dp-lab-da...,False,5000,16,256,s3://dp-lab-data/SCRI_Projects/HTA/M...,quay.io/hisplan


In [46]:
labels

Unnamed: 0,pipelineType,project,sample,owner,destination,transfer,comment
AV-1765_Ru1250e_MITO,MitoTracing,Lung Tumor Atlas,AV-1765_Ru1250e_MITO,sohailn,s3://dp-lab-data/SCRI_Projects/HTA/M...,-,sohailn


In [47]:
labels['destination'].values

array(['s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1765_Ru1250e_MITO/epi_mito-tracing-outs'],
      dtype=object)

In [48]:
# Setting cache to true to hopefully save some time
path_to_options

'/Users/sohailn/scing/bin/wdl-mito-tracing/MitoTracing.options.aws.json'

In [49]:
import time

assert (std_inputs_fields == list(inputs.columns)) & (inputs.notna().values.all())
assert (std_labels_fields == list(labels.columns)) & (labels.notna().values.all())

stdouts = [] # to store all outputs
process = True

with tqdm(inputs.index) as t:

    for sample_name in t:

        # Write inputs and labels to file
        path_to_inputs = f"{config_dir}/{sample_name}_{prefix}.inputs.json"
        with open(path_to_inputs, "w") as f_inputs:
            json.dump(inputs.loc[sample_name].to_dict(), f_inputs, indent=4, cls=NpEncoder)

        path_to_labels = f"{config_dir}/{sample_name}_{prefix}.labels.json"
        with open(path_to_labels, "w") as f_labels:
            json.dump(labels.loc[sample_name].to_dict(), f_labels, indent=4, cls=NpEncoder)

        if process:
            stdouts.append(run(
                workflow_path = workflow_dir,
                execp = path_to_exec,
                secrets = path_to_cromwell_secrets,
                inputs = path_to_inputs,
                labels = path_to_labels,
                options = path_to_options,
            ))
        # Sometimes causes problems if too many samples are run at once
        time.sleep(20) # Delay for 20 seconds.

  0%|          | 0/1 [00:00<?, ?it/s]

In [50]:
print(path_to_inputs)
print(path_to_labels)

/Users/sohailn/scing/bin/wdl-mito-tracing/configs/AV-1765_Ru1250e_MITO_MitoTracing.inputs.json
/Users/sohailn/scing/bin/wdl-mito-tracing/configs/AV-1765_Ru1250e_MITO_MitoTracing.labels.json


In [51]:
stdouts

[{'args': ['/Users/sohailn/scing/bin/wdl-mito-tracing/submit-processing.sh',
   '-k',
   '/Users/sohailn/.cromwell/cromwell-secrets.json',
   '-i',
   '/Users/sohailn/scing/bin/wdl-mito-tracing/configs/AV-1765_Ru1250e_MITO_MitoTracing.inputs.json',
   '-l',
   '/Users/sohailn/scing/bin/wdl-mito-tracing/configs/AV-1765_Ru1250e_MITO_MitoTracing.labels.json',
   '-o',
   '/Users/sohailn/scing/bin/wdl-mito-tracing/MitoTracing.options.aws.json'],
  'returncode': 0,
  'stdout': '{"id":"91bee57e-4aaa-40ce-9723-e2526cc16fbf","status":"Submitted"}\n',
  'stderr': ''}]

In [85]:
# Making sample.csv
# Common query col: id, request_id, Sample
sample_id = list(range(3924, 3934))
samples = format_sample_aws(sample_id, 'id', creds)

for sample in samples.index:
    s3_path = samples[samples.index == sample]['S3_path'].item()
    sample = sample[8:]
    
    print(f'epi_{sample}, {s3_path}/epi_{output_dirname}')
    print(f'non_epi_{sample}, {s3_path}/non_epi_{output_dirname}')

epi_Ru1083_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1759_Ru1083_MITO/epi_mito-tracing-outs
non_epi_Ru1083_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1759_Ru1083_MITO/non_epi_mito-tracing-outs
epi_MSK_LX_1083c_T_2_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_MSK_LX_1083c_T_2_MITO/epi_mito-tracing-outs
non_epi_MSK_LX_1083c_T_2_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1760_MSK_LX_1083c_T_2_MITO/non_epi_mito-tracing-outs
epi_POSIE_101920_T_1_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1761_POSIE_101920_T_1_MITO/epi_mito-tracing-outs
non_epi_POSIE_101920_T_1_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1761_POSIE_101920_T_1_MITO/non_epi_mito-tracing-outs
epi_Ru1083d_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1762_Ru1083d_MITO/epi_mito-tracing-outs
non_epi_Ru1083d_MITO, s3://dp-lab-data/SCRI_Projects/HTA/Mito_tracing/AV-1762_Ru1083d_MITO/non_epi_mito-tracing-outs
epi_Ru1250C_T_1_MITO, s3://d