In [44]:
import re, subprocess, boto3, json, shlex, mysql, os, urllib, logging
import pandas as pd
import numpy as np
from s3path import S3Path
from pathlib import Path
from tqdm.notebook import tqdm
from packaging import version
pd.set_option("display.max_colwidth", 40)

# Define Helper Functions

In [45]:
# Priority of GEX data if multiple outputs are found in db
sharp_wl_priority_map = {
        m: ["SEQC", "CR_GEX"] for m in ["Hashtag", "CiteSeq"]
    }
# File patterns to search for in S3 for each accompanying pipeline
sharp_wl_pattern_map = {
    "SEQC": "_dense.csv$",
    "CR_GEX": "/filtered_feature_bc_matrix/barcodes.tsv.gz$",
    "CR_ATAC": "/filtered_peak_bc_matrix/barcodes.tsv"
}
sharp_wl_method_map = {
    "SEQC": "SeqcDenseCountsMatrixCsv",
    "CR_GEX": "10x",
    "CR_ATAC": "10x",
}
# Names of FASTQ inputs in WDL; order is same as fastq_file_ids
# TODO: Ask to change all inputs to "fastq{file_id}" or "uriFastq{file_id}"
sharp_fastq_inputs_map = {
    m: ["uriFastqR1", "uriFastqR2"] for m in ["Hashtag", "CiteSeq"]
}

In [46]:
# FASTQ reads/indices required for each workflow
fastq_map = {
    'Hashtag': ['R1', 'R2'],
    'CiteSeq': ['R1', 'R2'],
    'AsapSeq': ['R1', 'R2', 'R3'],
}

In [47]:
# Get fastq file paths on S3 for each file id
# Returns dictionary from id to s3 path
# Throws exception if FASTQs don't exist for any id
def get_fastqs(
    path: str, # path to directory containing FASTQ files
    fastq_file_ids: list, # FASTQ file ids needed for this run type (e.g. I1, R1, R2, etc.)
):
    fastq_map = dict()
    _, bucket, key, _, _ = urllib.parse.urlsplit(path)
    for fid in fastq_file_ids:
        files = get_s3_objects(
            bucket, key.lstrip("/"),
            re.compile(f"_{fid}_\d{{3}}.fastq.gz$")
        )
        try:
            assert files, f"AssertionError: Missing `{fid}` archives!"
            fastq_map[fid] = [os.path.join("s3://", bucket, str(f)) for f in files]
        except AssertionError as err:
            logging.warning("%s\n\t %s", err, path)
            return
    return fastq_map

In [48]:
# Get s3 path of existing GEX analysis files
from mysql.connector import connect, Error

def get_wl_dir(sample_id, user, password):
    try:
        table_sample_data = "peer_lab_db.sample_data"
        table_stats_data = "peer_lab_db.stats_data"
        table_stats_data = "peer_lab_db.stats_data"
        table_hashtag_lib = "peer_lab_db.hashtag_lib"
        table_genome_index = "peer_lab_db.genome_index"
        table_sc_tech = "peer_lab_db.sc_tech"
        query = f"""
        SELECT {table_stats_data}.analysis_storage
        FROM {table_sample_data}
        LEFT JOIN {table_stats_data} 
        ON {table_stats_data}.sampleData_id = {table_sample_data}.id
        LEFT JOIN {table_hashtag_lib}
        ON {table_hashtag_lib}.sampleData_id = {table_sample_data}.id
        LEFT JOIN {table_genome_index}
        ON {table_genome_index}.id = {table_hashtag_lib}.genomeIndex_id
        LEFT JOIN {table_sc_tech}
        ON {table_sc_tech}.id = {table_genome_index}.scTech_id
        WHERE {table_sample_data}.id = {sample_id}
        """
        result = execute_query(query, user, password)[0][0]
        if result: 
            return result
        # As backup, get AWS storage location directly from sample_data
        else:
            query = f"""
            SELECT AWS_storage
            FROM {table_sample_data}
            WHERE {table_sample_data}.id = {sample_id}
            """
            result = execute_query(query, user, password)[0][0]
            return result
    except Error as e:
        print(f"Error: {e}")

In [49]:
# Get white list method and associated file
# Throws exception if no white list exists
def get_wl_params(
    sample_id: str,
    user: str,
    password: str,
):
    wl_params = dict()

    wl_dir = get_wl_dir(sample_id, user, password)
    wl_patterns = [sharp_wl_pattern_map[p] for p in sharp_wl_priority_map[prefix]]

    try:
        # Check white list file exists before loading info from database
        assert wl_dir, f"Empty analysis storage for sample id {sample_id}"
        _, bucket, key, _, _ = urllib.parse.urlsplit(wl_dir)
        # White list file and method is first entry found on S3 
        wl = pd.DataFrame(
            [get_s3_objects(bucket, key.strip("/"), re.compile(p)) for p in wl_patterns],
            index = sharp_wl_priority_map[prefix],
        ).dropna(how="all")
        try:
            wl_key = wl.iloc[0,0] # if empty, missing white list file
            wl_params["uri"] = os.path.join("s3://", bucket, wl_key)
            wl_params["method"] = sharp_wl_method_map[wl.index[0]]
        except IndexError:
            logging.error(
                "Path to barcodes or counts matrix of GEX data is missing!"
            )
            return

    except AssertionError:
        logging.warning(f"Path to GEX output results is missing for {sample_id}!")
        return

    return wl_params

In [50]:
def get_bc_json_manual(
    prefix,
    platform,
    user, 
    password,
):
    try:
        table_sc_tech = "peer_lab_db.sc_tech"
        query = f"""
        SELECT barcodes
        FROM {table_sc_tech}
        WHERE {table_sc_tech}.sc_Tech = "{platform.upper()}_{prefix}"
        """
        result = execute_query(query, user, password)[0][0]
        return result
    except Error as e:
        print(f"Error: {e}")

In [51]:
# Reformat barcodes from collaborator excel file
def get_bcs_manual(
    path_to_excel,
):
    # Replace unrecognized characters with text description
    barcodes = pd.read_excel(path_to_excel) 
    replace = lambda x: x.encode('ascii', 'namereplace').decode().replace("\\N", "")
    barcodes["Description"] = barcodes["Description"].apply(replace)
    barcodes["BP Shift"] = 0
    return barcodes[["Barcode", "DNA_ID", "Description", "BP Shift"]]

In [52]:
def get_bc_params_manual(
    barcodes,
    prefix, 
    platform,
    creds,
):
    bc_params = dict()
    
    # Add bp shift and sequence length based on conjugation
    bp_shift_map = {
        "A": 0,
        "B": 10,
        "C": 10,
        "M": 1, # Methanol
    }
    conjugation = barcodes["DNA_ID"].str.get(0)
    if conjugation.nunique() != 1:
        logging.warning(
            f"Sample has multiple hashtag barcode categories and will not be processed!"
        )
        return
    else:
        bc_params["conjugation"] = conjugation.values[0]
        bc_params["bp_shift"] = bp_shift_map[bc_params["conjugation"]]
        bc_params["seq_length"] = bc_params["bp_shift"] + barcodes["Barcode"].apply(len).max()
    
    # JSON of bc and UMI positions are stored in database
    bc_json = get_bc_json_manual(prefix, platform, creds['user'], creds['password'])
    bc_pos = json.loads(bc_json)
    bc_params["cb"] = bc_pos["cellbarcode"]
    bc_params["umi"] = bc_params["cb"] + bc_pos["UMIs"]
    
    return bc_params

In [53]:
def get_template(
    prefix: str, 
    conjugation: str = None,
):
    if prefix=="Hashtag":
        return {
            "A": "hashtag-10x-v3-tsa",
            "B": "hashtag-10x-v3-tsb",
            "C": "hashtag-10x-tsc",
        }[conjugation]
    elif prefix=="CiteSeq":
        return "citeseq"

In [54]:
# Numpy encoder for JSON from pandas series
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        else:
            return super(NpEncoder, self).default(obj)

In [55]:
# from SCRIdb
def get_s3_objects(bucket, key, pattern, full_uri=False):
    
    s3r = boto3.resource("s3")
    bucket_s3 = s3r.Bucket(bucket)
    objects = []
    for obj in bucket_s3.objects.filter(Prefix=key):
        hit = pattern.search(obj.key)
        if hit:
            objects.append(obj.key)
    if full_uri:
        objects = [f"s3://{bucket}/{o}" for o in objects]
    return objects

In [56]:
def execute_query(query, user, password):
    with connect(
        host="peer-lab-db.cggxmlwgzzpw.us-east-1.rds.amazonaws.com",
        database="peer_lab_db",
        user=user,
        password=password,
    ) as connection:
        with connection.cursor(buffered=True) as cursor:
            cursor.execute(query)
            result = cursor.fetchall()
    return result

In [57]:
# Get species from database for given sample
from mysql.connector import connect, Error

def get_species(sample_id, user, password):
    try:
        table_sample_data = "peer_lab_db.sample_data"
        table_species = "peer_lab_db.species"
        table_genome_idx = "peer_lab_db.genome_index"
        query = f"""
        SELECT {table_species}.Species
        FROM {table_species}
        LEFT JOIN {table_genome_idx}
        ON {table_species}.id = {table_genome_idx}.species_id
        LEFT JOIN {table_sample_data}
        ON {table_genome_idx}.id = {table_sample_data}.genomeIndex_id
        WHERE {table_sample_data}.id = {sample_id}
        """
        result = execute_query(query, user, password)[0][0]
        return result
    except Error as e:
        print(f"Error: {e}")

In [58]:
# Get species from database for given sample
from mysql.connector import connect, Error

def get_sc_tech(sample_id, user, password):
    try:
        table_sample_data = "peer_lab_db.sample_data"
        table_sc_tech = "peer_lab_db.sc_tech"
        table_genome_idx = "peer_lab_db.genome_index"
        query = f"""
        SELECT {table_sc_tech}.sc_Tech
        FROM {table_sc_tech}
        LEFT JOIN {table_genome_idx}
        ON {table_sc_tech}.id = {table_genome_idx}.scTech_id
        LEFT JOIN {table_sample_data}
        ON {table_genome_idx}.id = {table_sample_data}.genomeIndex_id
        WHERE {table_sample_data}.id = {sample_id}
        """
        result = execute_query(query, user, password)[0][0]
        return result
    except Error as e:
        print(f"Error: {e}")

In [59]:
# Get species from database for given sample
from mysql.connector import connect, Error

def get_sample_id(sample_name, user, password):
    try:
        table_sample_data = "peer_lab_db.sample_data"
        query = f"""
        SELECT {table_sample_data}.id
        FROM {table_sample_data}
        WHERE {table_sample_data}.Sample="{sample_name}"
        """
        result = execute_query(query, user, password)[0][0]
        return result
    except Error as e:
        print(f"Error: {e}")

In [60]:
# Get species from database for given sample
from mysql.connector import connect, Error

def get_project_id(sample_id, user, password):
    try:
        table_sample_data = "peer_lab_db.sample_data"
        table_project_data = "peer_lab_db.project_data"
        query = f"""
        SELECT {table_project_data}.projectName
        FROM {table_project_data}
        LEFT JOIN {table_sample_data}
        ON {table_project_data}.id = {table_sample_data}.projectData_id
        WHERE {table_sample_data}.id = {sample_id}
        """
        result = execute_query(query, user, password)[0][0]
        return result
    except Error as e:
        print(f"Error: {e}")

In [61]:
def get_SEQC_version(loc):
    try:
        cmd = f"aws s3 cp {loc}/seqc-results/seqc_log.txt -"
        out = subprocess.run(shlex.split(cmd), universal_newlines=True, capture_output=True).__dict__["stdout"]
        version = re.match(r".*SEQC=v(\d+\.\d+\.\d+).*", out)[1]
        return version
    except:
        return "N/A"

In [62]:
def get_file_prefix(loc):
    try:
        cmd = f"aws s3 ls {loc}/seqc-results/"
        out = subprocess.run(shlex.split(cmd), universal_newlines=True, capture_output=True).__dict__["stdout"]
        
        # Note: I'm expecting the aligned bam file to be in loc
        bam_pattern = re.compile(r"(.*)_Aligned\.out\.bam$")
        filename = list(filter(bam_pattern.match, out.split()))[0]
        file_prefix = re.match(bam_pattern, filename)[1]
        return file_prefix
    except:
        raise ValueError(f"BAM file not found in {loc}")
        return ""

In [63]:
def get_reference(sample_id):
    # Get species from database to decide reference
    species = get_species(sample_id, creds["user"], creds["password"])
    
    # Map to reference locations
    if "Human" in species:
        return "s3://seqc-public/genomes/hg38_long_polya/annotations.gtf"
    elif "Mouse" in species:
        return "s3://seqc-public/genomes/mm38_long_polya/annotations.gtf"
    else:
        raise ValueError(f"Unknown Species: {species}")

In [64]:
def get_bc_whitelist(sample_id):
    # Get version from database to decide whitelist
    sc_tech = get_sc_tech(sample_id, creds["user"], creds["password"])
    
    # Map to reference locations
    if "V3" in sc_tech:
        return "s3://seqc-public/barcodes/ten_x_v3/flat/3M-february-2018.txt"
    elif "V2" in sc_tech:
        return "s3://seqc-public/barcodes/ten_x_v2/flat/737K-august-2016.txt"
    else:
        raise ValueError(f"Unknown Technology: {sc_tech}")

In [65]:
def run(
    workflow_path: str,
    execp: str,
    secrets: str,
    inputs: str,
    labels: str,
    options: str,
):
    # change working directory to the pipeline package
    oldwd = os.getcwd()
    os.chdir(workflow_path)
    
    # execute the pipeline command
    cmd = f"{workflow_path}/{execp} -k {secrets} -i {inputs} -l {labels} -o {options}"
    var = subprocess.run(shlex.split(cmd), universal_newlines=True, capture_output=True)
    out = var.__dict__
    
    # change working directory back
    os.chdir(oldwd)
    
    return out

In [66]:
# Get bc and UMI positions from database stored in JSON format
def get_bc_json(sample_id, user, password):
    try:
        table_sample_data = "peer_lab_db.sample_data"
        table_stats_data = "peer_lab_db.stats_data"
        table_stats_data = "peer_lab_db.stats_data"
        table_hashtag_lib = "peer_lab_db.hashtag_lib"
        table_genome_index = "peer_lab_db.genome_index"
        table_sc_tech = "peer_lab_db.sc_tech"
        query = f"""
        SELECT barcodes
        FROM {table_sample_data}
        LEFT JOIN {table_stats_data} 
        ON {table_stats_data}.sampleData_id = {table_sample_data}.id
        LEFT JOIN {table_hashtag_lib}
        ON {table_hashtag_lib}.sampleData_id = {table_sample_data}.id
        LEFT JOIN {table_genome_index}
        ON {table_genome_index}.id = {table_hashtag_lib}.genomeIndex_id
        LEFT JOIN {table_sc_tech}
        ON {table_sc_tech}.id = {table_genome_index}.scTech_id
        WHERE {table_sample_data}.id = {sample_id}
        """
        result = execute_query(query, user, password)[0][0]
        return result
    except Error as e:
        print(f"Error: {e}")

In [67]:
# Get bc sequence data from database
def get_bcs(sample_id, user, password):
    try:
        table_sample_data = "peer_lab_db.sample_data"
        table_hashtag_barcodes = "peer_lab_db.hashtag_barcodes"
        table_hashtags = "peer_lab_db.hashtags"
        query = f"""
        SELECT barcode_sequence, concat(substring(category, -1), barcode), 
        demultiplex_label, bp_shift FROM {table_hashtags} 
        LEFT JOIN {table_hashtag_barcodes} 
        ON {table_hashtag_barcodes}.id = {table_hashtags}.hashtagBarcodes_id 
        WHERE {table_hashtags}.sampleData_id = {sample_id}
        """
        result = execute_query(query, user, password)
        return result
    except Error as e:
        print(f"Error: {e}")

In [68]:
def get_bc_params(
    sample_id,
    user,
    password,
):
    bc_params = dict()

    # JSON of bc and UMI positions are stored in database
    # First check dense matrix exists before loading JSON from database
    bc_json = get_bc_json(sample_id, user, password)
    bc_pos = json.loads(bc_json)
    bc_params["cb"] = bc_pos["cellbarcode"]
    bc_params["umi"] = bc_params["cb"] + bc_pos["UMIs"]

    # Get bc sequence data from database
    bcs = get_bcs(sample_id, user, password)
    if not bcs:
        logging.warning(f"Barcodes data Empty:\n\t {db_connect.cur.statement}")
        return
    for bc in bcs:
        try:
            assert bc[0], "AssertionError: Missing sequence barcodes!"
            assert bc[1], "AssertionError: Missing barcode IDs"
        except AssertionError as err:
            logging.warning(f"{err}:\n\t {db_connect.cur.statement}")
            return

    barcodes = pd.DataFrame(bcs, columns=["sequence", "code", "label", "bp_shift"])
    conjugation = barcodes["code"].str.get(0)
    if conjugation.nunique() != 1:
        logging.warning(
            f"Sample has multiple hashtag barcode categories and will not be processed!"
        )
        return
    else:
        bc_params["conjugation"] = conjugation.values[0]

    if barcodes["bp_shift"].nunique() != 1:
        logging.warning(
            f"Sample {sample_id} has hashtag barcode categories, with bp-shift length/s "
            f"{barcodes['bp_shift'].unique()}, and will not be processed!"
        )
        return
    else: 
        bc_params["bp_shift"] = int(barcodes["bp_shift"][0])
        bc_params["seq_length"] = bc_params["bp_shift"] + barcodes["sequence"].apply(len).max()

    return bc_params

# Process Samples

## Setup

In [69]:
# Maps from .wdl name (prefix) to results dirname
results_dirs = {
    "Hashtag": "Hashtag-results",
    "CiteSeq": "CiteSeq-results",
}

# Maps from .wdl name (prefix) to shell script
sh_files = {
    "Hashtag": "submit-hashtag.sh",
    "CiteSeq": "submit-citeseq.sh",
}

# Maps from .wdl name (prefix) to pipeline name
pipeline_types = {
    "Hashtag": "Hashtag",
    "CiteSeq": "CITE-seq",
}

In [70]:
# Location of docker files
common_docker_registry = "quay.io/hisplan"

prefix = "CiteSeq" # Workflow(s) to run; also .wdl filename prefix(es)
pipeline_type = pipeline_types[prefix] # field in *.labels.json
output_dirname = results_dirs[prefix]

# If need to add comment, put here
comment = ""

In [71]:
# Sharp-specific parameters
options_prefix="Sharp"

In [72]:
# Locations of workflow-related directories and files
path_to_cromwell_secrets = f"{Path.home()}/.cromwell/cromwell-secrets.json" # CHANGE THIS
workflow_dir = f"{Path.home()}/scing/bin/sharp-0.1.1" # CHANGE THIS
path_to_exec = sh_files[prefix]
config_dir = f"{workflow_dir}/configs"
path_to_options = f"{workflow_dir}/{options_prefix}.options.aws.json"

# Other file locations
db_credentials_path = f"{Path.home()}/.config.json" # CHANGE THIS
barcodes_path = f"{Path.home()}/scing/data/barcodes" # CHANGE THIS

In [73]:
# Set credentials based on SCRIdb CLI config file
with open(db_credentials_path) as f:
    creds = json.load(f)

In [74]:
# Samples on which to run Sharp
# Notes: 
# - Assumes data is transferred to AWS s3 (this should be an s3 location)
# - Assumes directory name is also name of sample
# - Workflows above will be run on all samples below
# sample_paths = [
#     "s3://dp-lab-data/SCRI_Projects/HTAN_CITEseq/CI210127_CD45pos_citeseq_CITE",
# ]

In [75]:
%env AWS_ACCESS_KEY_ID=ASIAYPY66CWMKHJQGBB4
%env AWS_SECRET_ACCESS_KEY=0nbHN00aFVulHp4+YCSy0RvhgCGM727gYP3RGqSZ
%env AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEGgaCXVzLWVhc3QtMSJIMEYCIQDxy4jpyfEB/jM9OAPA8MaqUXoBwb85rfdmM2jnCMsbLQIhAK1wUjV40LfkU8YVMww7NEXIwHxZ0EgOHYguj2eMFfJxKvgBCMD//////////wEQAxoMNTgzNjQzNTY3NTEyIgyXMJrVvZpPwzJN68EqzAEvkprlhoEsf/wIOuRrDT2dxOc8apURXE7FxyGu21YIeQ1uWx70qKA6oWYjMzLvihBS5hc7h+LeDbfKH6tnA93f2L/X5gzbDkFjGZAETOMhvbkwuco3Ly7120Maf7BzxFhI2icfYAaMUUTMcazjA/Pvg6nGsAckCvDAjJYHCYqlCtsWBZp4h/6qKdoCUttuW1zBzMYhMLkaehJHVn0XJRT4km41FSjc+tUMwP/n6qLg9Wm8201qRdjQo2Cn8mUHzLaJ4doyXKlibIflvWgwq+a0lwY6lwG2EC5bGDs0YEb0+XMzFdKcct+fiPnnqgxZd/i3lUhLxv1imL6GP8Hu3toPJgkazFpoPN5+iCBOTavcSTnlKWSzX6+TGtJta85foMoS7+N9qTwem3MJBKZoW8I5kLcmBtMcqig6x3x+NZOHox6xrG/ddwngg1sEc5tzkozICNL9RqzW8HH8ZJqqQ6BQySfbKFoj3u5F0EBm
!aws s3 ls

env: AWS_ACCESS_KEY_ID=ASIAYPY66CWMKHJQGBB4
env: AWS_SECRET_ACCESS_KEY=0nbHN00aFVulHp4+YCSy0RvhgCGM727gYP3RGqSZ
env: AWS_SESSION_TOKEN=IQoJb3JpZ2luX2VjEGgaCXVzLWVhc3QtMSJIMEYCIQDxy4jpyfEB/jM9OAPA8MaqUXoBwb85rfdmM2jnCMsbLQIhAK1wUjV40LfkU8YVMww7NEXIwHxZ0EgOHYguj2eMFfJxKvgBCMD//////////wEQAxoMNTgzNjQzNTY3NTEyIgyXMJrVvZpPwzJN68EqzAEvkprlhoEsf/wIOuRrDT2dxOc8apURXE7FxyGu21YIeQ1uWx70qKA6oWYjMzLvihBS5hc7h+LeDbfKH6tnA93f2L/X5gzbDkFjGZAETOMhvbkwuco3Ly7120Maf7BzxFhI2icfYAaMUUTMcazjA/Pvg6nGsAckCvDAjJYHCYqlCtsWBZp4h/6qKdoCUttuW1zBzMYhMLkaehJHVn0XJRT4km41FSjc+tUMwP/n6qLg9Wm8201qRdjQo2Cn8mUHzLaJ4doyXKlibIflvWgwq+a0lwY6lwG2EC5bGDs0YEb0+XMzFdKcct+fiPnnqgxZd/i3lUhLxv1imL6GP8Hu3toPJgkazFpoPN5+iCBOTavcSTnlKWSzX6+TGtJta85foMoS7+N9qTwem3MJBKZoW8I5kLcmBtMcqig6x3x+NZOHox6xrG/ddwngg1sEc5tzkozICNL9RqzW8HH8ZJqqQ6BQySfbKFoj3u5F0EBm
2021-10-07 15:31:32 agc-583643567512-us-east-1
2021-10-07 15:28:07 cdktoolkit-stagingbucket-d49u1xfb0sc4
2019-10-09 13:04:06 cf-templates-umiwbnq3566w-us-east-1
2019-10-10 12:46:54 dp-

## Execution

### Method 1: From SCRIdb

In [78]:
# Common query col: id, request_id, Sample
def get_sample_name(query, query_col, creds):
    
    user = creds['user']
    password = creds['password']
    
    try:
        table_sample_data = "peer_lab_db.sample_data"
        query = f"""
        SELECT {table_sample_data}.Sample
        FROM {table_sample_data}
        WHERE {table_sample_data}.{query_col}="{query}"
        """
        sample_names = []
        results = execute_query(query, user, password)
        for result in results:
            sample_names.append(result[0])
        return sample_names
    except Error as e:
        print(f"Error: {e}")
    
    
def get_aws_path(query, query_col, creds):
    
    user = creds['user']
    password = creds['password']
    
    try:
        table_sample_data = "peer_lab_db.sample_data"
        query = f"""
        SELECT {table_sample_data}.AWS_storage
        FROM {table_sample_data}
        WHERE {table_sample_data}.{query_col}="{query}"
        """
        sample_paths = []
        results = execute_query(query, user, password)
        for result in results:
            sample_paths.append(result[0])
        return sample_paths
    except Error as e:
        print(f"Error: {e}")
        
            
def get_sample_id(query, query_col, creds):
    
    user = creds['user']
    password = creds['password']
    
    try:
        table_sample_data = "peer_lab_db.sample_data"
        query = f"""
        SELECT {table_sample_data}.id
        FROM {table_sample_data}
        WHERE {table_sample_data}.{query_col}="{query}"
        """
        sample_ids = []
        results = execute_query(query, user, password)
        for result in results:
            sample_ids.append(result[0])
        return sample_ids
    except Error as e:
        print(f"Error: {e}")
        
def format_sample_aws(querys, query_col, creds):
    sample_names = []
    sample_paths = []
    sample_ids = []
    
    for query in querys:
        sample_names += get_sample_name(query, query_col, creds)
        sample_paths += get_aws_path(query, query_col, creds)
        sample_ids += get_sample_id(query, query_col, creds)
        
    sample_paths = [s.strip('/') for s in sample_paths] # remove trailing slash if exists
    
    samples = pd.DataFrame(
        dict(S3_path=sample_paths, Sample_ID=sample_ids),
        index=sample_names,
        dtype=str,
    )
    return samples

In [34]:
# Common query col: id, request_id, Sample
sample_id = list(range(3947, 3952))

samples = format_sample_aws(sample_id, 'id', creds)
samples

Unnamed: 0,S3_path,Sample_ID
SV-1721_SV_LN11,s3://dp-lab-data/collaborators/vardh...,3947
SV-1721_SV_LN12,s3://dp-lab-data/collaborators/vardh...,3948
SV-1723_SV_LN1,s3://dp-lab-data/collaborators/vardh...,3949
SV-1723_SV_LN2,s3://dp-lab-data/collaborators/vardh...,3950
SV-1723_SV_LN3,s3://dp-lab-data/collaborators/vardh...,3951


In [35]:
samples = samples.rename(columns={'S3_path': 'S3_Path'})
if prefix == 'Hashtag':
    samples['S3_Path'] += '_HTO'

In [36]:
# # Get information for all samples
# sample_paths = [s.strip('/') for s in sample_paths] # remove trailing slash if exists
# sample_names = [os.path.basename(s) for s in sample_paths]
# sample_names = [re.match(r'(.*)_.+$', s)[1] for s in sample_names] # remove library suffix (e.g. _CITE, _HTO, etc.)
# # TODO: assert basename is in peer_lab_db.sample_data.Sample
# # assert(all(check_sample_name(s) for s in sample_names))
# samples = pd.DataFrame(
#     sample_paths,
#     index=sample_names,
#     columns=["S3_Path"],
#     dtype=str,
# )
# Get FASTQ paths from S3
# Note: Uses same FASTQ file ids for all samples
fastq_file_ids = fastq_map[prefix]
samples["FASTQs"] = samples["S3_Path"].apply(lambda x: get_fastqs(x, fastq_file_ids))

# samples["Sample_ID"] = pd.Series(samples.index).apply(
#     lambda x: get_sample_id(x, creds['user'], creds['password'])
# ).values
samples["Whitelist_Params"] = samples['Sample_ID'].apply(
    lambda x: get_wl_params(x, creds['user'], creds['password'])
)
samples["Barcode_Params"] = samples['Sample_ID'].apply(
    lambda x: get_bc_params(x, creds['user'], creds['password'])
)
samples["Barcodes"] = samples['Sample_ID'].apply(
    lambda x: get_bcs(x, creds['user'], creds['password'])
)

samples

Unnamed: 0,S3_Path,Sample_ID,FASTQs,Whitelist_Params,Barcode_Params,Barcodes
SV-1721_SV_LN11,s3://dp-lab-data/collaborators/vardh...,3947,{'R1': ['s3://dp-lab-data/collaborat...,{'uri': 's3://dp-lab-data/collaborat...,"{'cb': 16, 'umi': 28, 'conjugation':...","[(GTCAACTCTTTAGCG, C0251, Pt19_Tumor..."
SV-1721_SV_LN12,s3://dp-lab-data/collaborators/vardh...,3948,{'R1': ['s3://dp-lab-data/collaborat...,{'uri': 's3://dp-lab-data/collaborat...,"{'cb': 16, 'umi': 28, 'conjugation':...","[(GTCAACTCTTTAGCG, C0251, Pt60_LN, 1..."
SV-1723_SV_LN1,s3://dp-lab-data/collaborators/vardh...,3949,{'R1': ['s3://dp-lab-data/collaborat...,{'uri': 's3://dp-lab-data/collaborat...,"{'cb': 16, 'umi': 28, 'conjugation':...","[(GTCAACTCTTTAGCG, C0251, Pt37_A, 10..."
SV-1723_SV_LN2,s3://dp-lab-data/collaborators/vardh...,3950,{'R1': ['s3://dp-lab-data/collaborat...,{'uri': 's3://dp-lab-data/collaborat...,"{'cb': 16, 'umi': 28, 'conjugation':...","[(GTCAACTCTTTAGCG, C0251, Pt33_B, 10..."
SV-1723_SV_LN3,s3://dp-lab-data/collaborators/vardh...,3951,{'R1': ['s3://dp-lab-data/collaborat...,{'uri': 's3://dp-lab-data/collaborat...,"{'cb': 16, 'umi': 28, 'conjugation':...","[(GTCAACTCTTTAGCG, C0251, Pt42_A, 10..."


In [37]:
# Function to reformat barcode labels for Sharp
def reformat_bc_label(label):
    label = label.encode('ascii', 'namereplace').decode()
    label = label.replace("\\N", "").replace(" ", "_")
    return label

for name, sample in tqdm(samples.iterrows(), total=len(samples)):
    
    # Reformat barcodes
    barcodes = pd.DataFrame(
        sample["Barcodes"],
        columns=["sequence", "code", "label", "bp_shift"]
    )
    barcodes["label"] = barcodes["label"].apply(reformat_bc_label)
    
    # Save to CSV
    path_to_csv = f"{barcodes_path}/{sample['Sample_ID']}_tag-list.csv"
    barcodes.to_csv(path_to_csv, header=False, index=False)
    
    # Upload tag-list to AWS
    cmd = f"aws s3 cp {path_to_csv} {sample['S3_Path']}/{output_dirname}/tag-list.csv"
    var = subprocess.run(shlex.split(cmd), universal_newlines=True, capture_output=True)

  0%|          | 0/5 [00:00<?, ?it/s]

In [38]:
# Load minimum inputs and labels fields from templates
conjugation = samples["Barcode_Params"].apply(lambda x: x["conjugation"]).values[0] # conjugation must be same for all samples
template_prefix = get_template(prefix, conjugation)
with open(f"{config_dir}/{template_prefix}.inputs.json") as f:
    std_inputs_fields = list(json.load(f).keys())
    
with open(f"{config_dir}/{template_prefix}.labels.json") as f:
    std_labels_fields = list(json.load(f).keys())
    
# Annotate all samples with workflow inputs and labels
inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)
labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)

platform = "10x"

In [39]:
# Annotate inputs
inputs[f"{prefix}.sampleName"] = inputs.index
inputs[f"{prefix}.scRnaSeqPlatform"] = platform # may need to change
inputs[f"{prefix}.lengthR1"] = samples["Barcode_Params"].apply(lambda x: x["umi"])
inputs[f"{prefix}.lengthR2"] = samples["Barcode_Params"].apply(lambda x: x["seq_length"])
inputs[f"{prefix}.cellBarcodeWhitelistUri"] = samples["Whitelist_Params"].apply(lambda x: x["uri"])
inputs[f"{prefix}.cellBarcodeWhiteListMethod"] = samples["Whitelist_Params"].apply(lambda x: x["method"])
inputs[f"{prefix}.{'tagList' if (prefix=='CiteSeq') else 'hashTagList'}"] = \
    samples["S3_Path"] + f"/{output_dirname}/tag-list.csv" #TODO: Ask about changing all to 'tagList'
inputs[f"{prefix}.cbStartPos"] = 1
inputs[f"{prefix}.cbEndPos"] = samples["Barcode_Params"].apply(lambda x: x["cb"])
inputs[f"{prefix}.umiStartPos"] = inputs[f"{prefix}.cbEndPos"]+1
inputs[f"{prefix}.umiEndPos"] = samples["Barcode_Params"].apply(lambda x: x["umi"])
inputs[f"{prefix}.trimPos"] = samples["Barcode_Params"].apply(lambda x: x["bp_shift"])
inputs[f"{prefix}.translate10XBarcodes"] = \
    (inputs[f"{prefix}.scRnaSeqPlatform"] == "10x_v3") & \
    (samples["Barcode_Params"].apply(lambda x: x["conjugation"] == "B"))
inputs[f"{prefix}.dockerRegistry"] = common_docker_registry
for file_id in fastq_file_ids: # Set FASTQs
    inputs[f"{prefix}.uriFastq{file_id}"] = samples["FASTQs"].apply(lambda x: x[file_id])

# ********************
# Defaults
# Note: These may need to be changed on a per-sample or per-execution basis

inputs[f"{prefix}.slidingWindowSearch"] = False
inputs[f"{prefix}.cbCollapsingDistance"] = 1
inputs[f"{prefix}.umiCollapsingDistance"] = 1
inputs[f"{prefix}.numExpectedCells"] = 0
# Need trick to set dictionary for each row
common_resource_spec = {
    "cpu": 32,
    "memory": -1,
}
inputs[f"{prefix}.resourceSpec"] = inputs.iloc[:, 0].apply(lambda x: common_resource_spec)
if prefix == "Hashtag":
    inputs[f"{prefix}.minCount"] = 10

# ********************

# Annotate labels
labels["pipelineType"] = pipeline_type
labels["project"] = samples["Sample_ID"].apply(lambda x: get_project_id(x, creds["user"], creds["password"]))
labels["sample"] = labels.index
labels["owner"] = creds["user"]
labels["destination"] = samples['S3_Path'] + "/" + output_dirname
labels["transfer"] = "-"
labels["comment"] = creds["user"]

assert (std_inputs_fields == list(inputs.columns)) & (inputs.notna().values.all())
assert (std_labels_fields == list(labels.columns)) & (labels.notna().values.all())

In [40]:
inputs

Unnamed: 0,Hashtag.uriFastqR1,Hashtag.uriFastqR2,Hashtag.sampleName,Hashtag.scRnaSeqPlatform,Hashtag.lengthR1,Hashtag.lengthR2,Hashtag.cellBarcodeWhitelistUri,Hashtag.cellBarcodeWhiteListMethod,Hashtag.hashTagList,Hashtag.cbStartPos,...,Hashtag.umiEndPos,Hashtag.trimPos,Hashtag.slidingWindowSearch,Hashtag.translate10XBarcodes,Hashtag.cbCollapsingDistance,Hashtag.umiCollapsingDistance,Hashtag.numExpectedCells,Hashtag.minCount,Hashtag.resourceSpec,Hashtag.dockerRegistry
SV-1721_SV_LN11,[s3://dp-lab-data/collaborators/vard...,[s3://dp-lab-data/collaborators/vard...,SV-1721_SV_LN11,10x,28,25,s3://dp-lab-data/collaborators/vardh...,10x,s3://dp-lab-data/collaborators/vardh...,1,...,28,10,False,False,1,1,0,10,"{'cpu': 32, 'memory': -1}",quay.io/hisplan
SV-1721_SV_LN12,[s3://dp-lab-data/collaborators/vard...,[s3://dp-lab-data/collaborators/vard...,SV-1721_SV_LN12,10x,28,25,s3://dp-lab-data/collaborators/vardh...,10x,s3://dp-lab-data/collaborators/vardh...,1,...,28,10,False,False,1,1,0,10,"{'cpu': 32, 'memory': -1}",quay.io/hisplan
SV-1723_SV_LN1,[s3://dp-lab-data/collaborators/vard...,[s3://dp-lab-data/collaborators/vard...,SV-1723_SV_LN1,10x,28,25,s3://dp-lab-data/collaborators/vardh...,10x,s3://dp-lab-data/collaborators/vardh...,1,...,28,10,False,False,1,1,0,10,"{'cpu': 32, 'memory': -1}",quay.io/hisplan
SV-1723_SV_LN2,[s3://dp-lab-data/collaborators/vard...,[s3://dp-lab-data/collaborators/vard...,SV-1723_SV_LN2,10x,28,25,s3://dp-lab-data/collaborators/vardh...,10x,s3://dp-lab-data/collaborators/vardh...,1,...,28,10,False,False,1,1,0,10,"{'cpu': 32, 'memory': -1}",quay.io/hisplan
SV-1723_SV_LN3,[s3://dp-lab-data/collaborators/vard...,[s3://dp-lab-data/collaborators/vard...,SV-1723_SV_LN3,10x,28,25,s3://dp-lab-data/collaborators/vardh...,10x,s3://dp-lab-data/collaborators/vardh...,1,...,28,10,False,False,1,1,0,10,"{'cpu': 32, 'memory': -1}",quay.io/hisplan


In [41]:
labels

Unnamed: 0,pipelineType,project,sample,owner,destination,transfer,comment
SV-1721_SV_LN11,Hashtag,Locally advanced gastric cancer,SV-1721_SV_LN11,sohailn,s3://dp-lab-data/collaborators/vardh...,-,sohailn
SV-1721_SV_LN12,Hashtag,Locally advanced gastric cancer,SV-1721_SV_LN12,sohailn,s3://dp-lab-data/collaborators/vardh...,-,sohailn
SV-1723_SV_LN1,Hashtag,single cell immune profiling of PBMC...,SV-1723_SV_LN1,sohailn,s3://dp-lab-data/collaborators/vardh...,-,sohailn
SV-1723_SV_LN2,Hashtag,single cell immune profiling of PBMC...,SV-1723_SV_LN2,sohailn,s3://dp-lab-data/collaborators/vardh...,-,sohailn
SV-1723_SV_LN3,Hashtag,single cell immune profiling of PBMC...,SV-1723_SV_LN3,sohailn,s3://dp-lab-data/collaborators/vardh...,-,sohailn


In [43]:
labels['destination'].tolist()

['s3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/Hashtag-results',
 's3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN12_HTO/Hashtag-results',
 's3://dp-lab-data/collaborators/vardhans/scImmuneProfilingPbmcsNAcTreatedCovidPts/SV-1723_SV_LN1_HTO/Hashtag-results',
 's3://dp-lab-data/collaborators/vardhans/scImmuneProfilingPbmcsNAcTreatedCovidPts/SV-1723_SV_LN2_HTO/Hashtag-results',
 's3://dp-lab-data/collaborators/vardhans/scImmuneProfilingPbmcsNAcTreatedCovidPts/SV-1723_SV_LN3_HTO/Hashtag-results']

In [58]:
stdouts = [] # to store all outputs
process = True
import time

with tqdm(samples.index) as t:

    for sample_name in t:

        # Write inputs and labels to file
        path_to_inputs = f"{config_dir}/{sample_name}_{prefix}.inputs.json"
        with open(path_to_inputs, "w") as f_inputs:
            json.dump(inputs.loc[sample_name].to_dict(), f_inputs, indent=4, cls=NpEncoder)

        path_to_labels = f"{config_dir}/{sample_name}_{prefix}.labels.json"
        with open(path_to_labels, "w") as f_labels:
            json.dump(labels.loc[sample_name].to_dict(), f_labels, indent=4, cls=NpEncoder)

        if process:
            stdouts.append(run(
                workflow_path = workflow_dir,
                execp = path_to_exec,
                secrets = path_to_cromwell_secrets,
                inputs = path_to_inputs,
                labels = path_to_labels,
                options = path_to_options,
            ))
        time.sleep(20)

  0%|          | 0/5 [00:00<?, ?it/s]

In [59]:
stdouts

[{'args': ['/Users/sohailn/scing/bin/sharp-0.1.1/submit-hashtag.sh',
   '-k',
   '/Users/sohailn/.cromwell/cromwell-secrets.json',
   '-i',
   '/Users/sohailn/scing/bin/sharp-0.1.1/configs/SV-1721_SV_LN11_Hashtag.inputs.json',
   '-l',
   '/Users/sohailn/scing/bin/sharp-0.1.1/configs/SV-1721_SV_LN11_Hashtag.labels.json',
   '-o',
   '/Users/sohailn/scing/bin/sharp-0.1.1/Sharp.options.aws.json'],
  'returncode': 0,
  'stdout': '{"id":"72d6f6f0-b149-47f8-9017-cc0ee3ff98d7","status":"Submitted"}\n',
  'stderr': ''},
 {'args': ['/Users/sohailn/scing/bin/sharp-0.1.1/submit-hashtag.sh',
   '-k',
   '/Users/sohailn/.cromwell/cromwell-secrets.json',
   '-i',
   '/Users/sohailn/scing/bin/sharp-0.1.1/configs/SV-1721_SV_LN12_Hashtag.inputs.json',
   '-l',
   '/Users/sohailn/scing/bin/sharp-0.1.1/configs/SV-1721_SV_LN12_Hashtag.labels.json',
   '-o',
   '/Users/sohailn/scing/bin/sharp-0.1.1/Sharp.options.aws.json'],
  'returncode': 0,
  'stdout': '{"id":"8c2f2354-c7a7-44aa-a776-2f315ca4f297","stat

### Method 2: From Excel File

In [100]:
# Manual inputs
platform = "10x_v3"
#path_to_excel = f"{Path.home()}/scing/barcodes/JR-1217_3080_Barcodes.xlsx" 
# path_to_excel = f"{Path.home()}/scing/barcodes/3213_IM-1356.xlsx"
path_to_excel = f'~/Downloads/SV-1721 CITEseq.xlsx'
# path_to_excel = f'~/Downloads/SV-1723 CITEseq.xlsx'

In [101]:
# Common query col: id, request_id, Sample
sample_id = list(range(3947, 3952))

samples = format_sample_aws(sample_id, 'id', creds)
samples

Unnamed: 0,S3_path,Sample_ID
SV-1721_SV_LN11,s3://dp-lab-data/collaborators/vardh...,3947
SV-1721_SV_LN12,s3://dp-lab-data/collaborators/vardh...,3948
SV-1723_SV_LN1,s3://dp-lab-data/collaborators/vardh...,3949
SV-1723_SV_LN2,s3://dp-lab-data/collaborators/vardh...,3950
SV-1723_SV_LN3,s3://dp-lab-data/collaborators/vardh...,3951


In [102]:
samples = samples.rename(columns={'S3_path': 'S3_Path'})
samples['S3_Path'] += '_HTO'
# if prefix == 'Hashtag':
#     samples['S3_Path'] += '_HTO'
    
# elif prefix == 'CiteSEq':
#     samples['S3_Path'] += '_CITE'

In [103]:
samples = samples.loc[samples.index.str.startswith('SV-1721')]
# samples = samples.loc[samples.index.str.startswith('SV-1723')]

samples

Unnamed: 0,S3_Path,Sample_ID
SV-1721_SV_LN11,s3://dp-lab-data/collaborators/vardh...,3947
SV-1721_SV_LN12,s3://dp-lab-data/collaborators/vardh...,3948


In [34]:
# # Get information for all samples
# # sample_paths = [s.strip('/') for s in sample_paths] # remove trailing slash if exists
# # sample_names = [os.path.basename(s) for s in sample_paths]
# # sample_names = [re.match(r'(.*)_.+$', s)[1] for s in sample_names] # remove library suffix (e.g. _CITE, _HTO, etc.)
# # TODO: assert basename is in peer_lab_db.sample_data.Sample
# # assert(all(check_sample_name(s) for s in sample_names))
# samples = pd.DataFrame(
#     sample_paths,
#     index=sample_names,
#     columns=["S3_Path"],
#     dtype=str,
# )
# samples["Sample_ID"] = pd.Series(samples.index).apply(
#     lambda x: get_sample_id(x, creds['user'], creds['password'])
# ).values

In [82]:
# Read barcodes from file
# Note: Must be subset to HTO or CITE barcodes before next step!
# barcodes = get_bcs_manual(path_to_excel)
# barcodes = barcodes[
#     barcodes["Description"].str.contains("SS1") |
#     barcodes["Description"].str.contains("SS2")
# ]

In [104]:
barcodes = pd.read_excel(path_to_excel, header=None) 
print(path_to_excel)
barcodes.columns = ['Description', 'Barcode', 'DNA_ID', 'description', 'citeseq']
replace = lambda x: x.encode('ascii', 'namereplace').decode().replace("\\N", "")
barcodes["Description"] = barcodes["Description"].apply(replace)
barcodes["BP Shift"] = 0
barcodes = barcodes[["Barcode", "DNA_ID", "Description", "BP Shift"]]
barcodes

~/Downloads/SV-1721 CITEseq.xlsx


Unnamed: 0,Barcode,DNA_ID,Description,BP Shift
0,GTCTTTGTCAGTGCA,C0006,anti-human CD86,0
1,GTTGTCCGACAATAC,C0007,"anti-human CD274 (B7-H1, PD-L1)",0
2,TGATAGAAACAGACC,C0020,"anti-human CD270 (HVEM, TR2)",0
3,ATCACATCGTTGCCA,C0023,anti-human CD155 (PVR),0
4,AACCTTCCGTCTAAG,C0024,anti-human CD112 (Nectin-2),0
...,...,...,...,...
132,GAGTCGAGAAATCAT,C0918,anti-human HLA-E,0
133,TCCCACTTCCGCTTT,C0920,anti-human CD82,0
134,CTACTTCCCTGTCAA,C0944,anti-human CD101 (BB27),0
135,GCCGCATGAGAAACA,C1046,anti-human CD88 (C5aR),0


In [105]:
# Upload barcodes for all samples
# Note: Uploads same barcodes to s3 for all samples
path_to_csv = path_to_excel.replace(".xlsx", ".csv")
path_to_csv = path_to_csv.replace(' ', '_')
barcodes.to_csv(path_to_csv, header=False, index=False)
for s3_path in tqdm(samples["S3_Path"]):
    cmd = f"aws s3 cp {path_to_csv} {s3_path}/{output_dirname}/tag-list.csv"
    print(cmd)
    var = subprocess.run(shlex.split(cmd), universal_newlines=True, capture_output=True)

  0%|          | 0/2 [00:00<?, ?it/s]

aws s3 cp ~/Downloads/SV-1721_CITEseq.csv s3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/CiteSeq-results/tag-list.csv
aws s3 cp ~/Downloads/SV-1721_CITEseq.csv s3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN12_HTO/CiteSeq-results/tag-list.csv


In [106]:
# Note: Assumes GEX data is recorded in database
samples["Whitelist_Params"] = samples["Sample_ID"].apply(
    lambda x: get_wl_params(x, creds['user'], creds['password'])
)
assert ~samples["Whitelist_Params"].isna().any()
samples

Unnamed: 0,S3_Path,Sample_ID,Whitelist_Params
SV-1721_SV_LN11,s3://dp-lab-data/collaborators/vardh...,3947,{'uri': 's3://dp-lab-data/collaborat...
SV-1721_SV_LN12,s3://dp-lab-data/collaborators/vardh...,3948,{'uri': 's3://dp-lab-data/collaborat...


In [107]:
# Note: Adds same barcode parameters for all samples
samples["BC_Params"] = samples["Sample_ID"].apply(
    lambda x: get_bc_params_manual(barcodes, prefix, platform, creds)
)
assert ~samples["BC_Params"].isna().any()
samples

Unnamed: 0,S3_Path,Sample_ID,Whitelist_Params,BC_Params
SV-1721_SV_LN11,s3://dp-lab-data/collaborators/vardh...,3947,{'uri': 's3://dp-lab-data/collaborat...,"{'conjugation': 'C', 'bp_shift': 10,..."
SV-1721_SV_LN12,s3://dp-lab-data/collaborators/vardh...,3948,{'uri': 's3://dp-lab-data/collaborat...,"{'conjugation': 'C', 'bp_shift': 10,..."


In [108]:
# Get FASTQ paths from S3
# Note: Uses same FASTQ file ids for all samples
fastq_file_ids = fastq_map[prefix]
samples["FASTQs"] = samples["S3_Path"].apply(lambda x: get_fastqs(x, fastq_file_ids))
samples

Unnamed: 0,S3_Path,Sample_ID,Whitelist_Params,BC_Params,FASTQs
SV-1721_SV_LN11,s3://dp-lab-data/collaborators/vardh...,3947,{'uri': 's3://dp-lab-data/collaborat...,"{'conjugation': 'C', 'bp_shift': 10,...",{'R1': ['s3://dp-lab-data/collaborat...
SV-1721_SV_LN12,s3://dp-lab-data/collaborators/vardh...,3948,{'uri': 's3://dp-lab-data/collaborat...,"{'conjugation': 'C', 'bp_shift': 10,...",{'R1': ['s3://dp-lab-data/collaborat...


In [109]:
samples.iloc[0]['FASTQs']

{'R1': ['s3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/FASTQ/3947_SV-1721_SV_LN11_HTO_IGO_12437_AS_7_S25_L001_R1_001.fastq.gz',
  's3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/FASTQ/3947_SV-1721_SV_LN11_HTO_IGO_12437_AS_7_S25_L002_R1_001.fastq.gz',
  's3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/FASTQ/3947_SV-1721_SV_LN11_HTO_IGO_12437_AS_7_S25_L003_R1_001.fastq.gz',
  's3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/FASTQ/3947_SV-1721_SV_LN11_HTO_IGO_12437_AS_7_S25_L004_R1_001.fastq.gz'],
 'R2': ['s3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/FASTQ/3947_SV-1721_SV_LN11_HTO_IGO_12437_AS_7_S25_L001_R2_001.fastq.gz',
  's3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/FASTQ/3947_SV-1721_SV_LN11_HTO_IGO_12437_AS_7_S25_L002_R2_001.fastq.gz

In [110]:
# Load minimum inputs and labels fields from templates
conjugation = samples["BC_Params"].apply(lambda x: x["conjugation"]).values[0] # conjugation must be same for all samples
template_prefix = get_template(prefix, conjugation)
with open(f"{config_dir}/{template_prefix}.inputs.json") as f:
    std_inputs_fields = list(json.load(f).keys())
    
with open(f"{config_dir}/{template_prefix}.labels.json") as f:
    std_labels_fields = list(json.load(f).keys())
    
# Annotate all samples with workflow inputs and labels
inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)
labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)

In [111]:
# Annotate inputs
inputs[f"{prefix}.sampleName"] = inputs.index
inputs[f"{prefix}.scRnaSeqPlatform"] = platform # may need to change
inputs[f"{prefix}.lengthR1"] = samples["BC_Params"].apply(lambda x: x["umi"])
inputs[f"{prefix}.lengthR2"] = samples["BC_Params"].apply(lambda x: x["seq_length"])
inputs[f"{prefix}.cellBarcodeWhitelistUri"] = samples["Whitelist_Params"].apply(lambda x: x["uri"])
inputs[f"{prefix}.cellBarcodeWhiteListMethod"] = samples["Whitelist_Params"].apply(lambda x: x["method"])
inputs[f"{prefix}.{'tagList' if (prefix=='CiteSeq') else 'hashTagList'}"] = \
    samples["S3_Path"] + f"/{output_dirname}/tag-list.csv" #TODO: Ask about changing all to 'tagList'
inputs[f"{prefix}.cbStartPos"] = 1
inputs[f"{prefix}.cbEndPos"] = samples["BC_Params"].apply(lambda x: x["cb"])
inputs[f"{prefix}.umiStartPos"] = inputs[f"{prefix}.cbEndPos"]+1
inputs[f"{prefix}.umiEndPos"] = samples["BC_Params"].apply(lambda x: x["umi"])
inputs[f"{prefix}.trimPos"] = samples["BC_Params"].apply(lambda x: x["bp_shift"])
inputs[f"{prefix}.translate10XBarcodes"] = \
    (inputs[f"{prefix}.scRnaSeqPlatform"] == "10x_v3") & \
    (samples["BC_Params"].apply(lambda x: x["conjugation"] == "B"))
inputs[f"{prefix}.dockerRegistry"] = common_docker_registry
for file_id in fastq_file_ids: # Set FASTQs
    inputs[f"{prefix}.uriFastq{file_id}"] = samples["FASTQs"].apply(lambda x: x[file_id])

# ********************
# Defaults
# Note: These may need to be changed on a per-sample or per-execution basis

inputs[f"{prefix}.slidingWindowSearch"] = False
inputs[f"{prefix}.cbCollapsingDistance"] = 1
inputs[f"{prefix}.umiCollapsingDistance"] = 1
inputs[f"{prefix}.numExpectedCells"] = 0
# Need trick to set dictionary for each row
common_resource_spec = {
    "cpu": 32,
    "memory": -1,
}
inputs[f"{prefix}.resourceSpec"] = inputs.iloc[:, 0].apply(lambda x: common_resource_spec)
if prefix == "Hashtag":
    inputs[f"{prefix}.minCount"] = 10

# ********************

# Annotate labels
labels["pipelineType"] = pipeline_type
labels["project"] = samples["Sample_ID"].apply(lambda x: get_project_id(x, creds["user"], creds["password"]))
labels["sample"] = labels.index
labels["owner"] = creds["user"]
labels["destination"] = samples['S3_Path'] + "/" + output_dirname
labels["transfer"] = "-"
labels["comment"] = creds["user"]

assert (std_inputs_fields == list(inputs.columns)) & (inputs.notna().values.all())
assert (std_labels_fields == list(labels.columns)) & (labels.notna().values.all())

In [112]:
inputs

Unnamed: 0,CiteSeq.uriFastqR1,CiteSeq.uriFastqR2,CiteSeq.sampleName,CiteSeq.scRnaSeqPlatform,CiteSeq.lengthR1,CiteSeq.lengthR2,CiteSeq.cellBarcodeWhitelistUri,CiteSeq.cellBarcodeWhiteListMethod,CiteSeq.tagList,CiteSeq.cbStartPos,...,CiteSeq.umiStartPos,CiteSeq.umiEndPos,CiteSeq.trimPos,CiteSeq.slidingWindowSearch,CiteSeq.translate10XBarcodes,CiteSeq.cbCollapsingDistance,CiteSeq.umiCollapsingDistance,CiteSeq.numExpectedCells,CiteSeq.resourceSpec,CiteSeq.dockerRegistry
SV-1721_SV_LN11,[s3://dp-lab-data/collaborators/vard...,[s3://dp-lab-data/collaborators/vard...,SV-1721_SV_LN11,10x_v3,28,25,s3://dp-lab-data/collaborators/vardh...,10x,s3://dp-lab-data/collaborators/vardh...,1,...,17,28,10,False,False,1,1,0,"{'cpu': 32, 'memory': -1}",quay.io/hisplan
SV-1721_SV_LN12,[s3://dp-lab-data/collaborators/vard...,[s3://dp-lab-data/collaborators/vard...,SV-1721_SV_LN12,10x_v3,28,25,s3://dp-lab-data/collaborators/vardh...,10x,s3://dp-lab-data/collaborators/vardh...,1,...,17,28,10,False,False,1,1,0,"{'cpu': 32, 'memory': -1}",quay.io/hisplan


In [113]:
labels

Unnamed: 0,pipelineType,project,sample,owner,destination,transfer,comment
SV-1721_SV_LN11,CITE-seq,Locally advanced gastric cancer,SV-1721_SV_LN11,sohailn,s3://dp-lab-data/collaborators/vardh...,-,sohailn
SV-1721_SV_LN12,CITE-seq,Locally advanced gastric cancer,SV-1721_SV_LN12,sohailn,s3://dp-lab-data/collaborators/vardh...,-,sohailn


In [115]:
import time
stdouts = [] # to store all outputs
process = True

with tqdm(samples.index) as t:

    for sample_name in t:

        # Write inputs and labels to file
        path_to_inputs = f"{config_dir}/{sample_name}_{prefix}.inputs.json"
        with open(path_to_inputs, "w") as f_inputs:
            json.dump(inputs.loc[sample_name].to_dict(), f_inputs, indent=4, cls=NpEncoder)

        path_to_labels = f"{config_dir}/{sample_name}_{prefix}.labels.json"
        with open(path_to_labels, "w") as f_labels:
            json.dump(labels.loc[sample_name].to_dict(), f_labels, indent=4, cls=NpEncoder)

        if process:
            stdouts.append(run(
                workflow_path = workflow_dir,
                execp = path_to_exec,
                secrets = path_to_cromwell_secrets,
                inputs = path_to_inputs,
                labels = path_to_labels,
                options = path_to_options,
            ))
            
        time.sleep(20)

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
path_to_options

In [99]:
stdouts

[{'args': ['/Users/sohailn/scing/bin/sharp-0.1.1/submit-citeseq.sh',
   '-k',
   '/Users/sohailn/.cromwell/cromwell-secrets.json',
   '-i',
   '/Users/sohailn/scing/bin/sharp-0.1.1/configs/SV-1723_SV_LN1_CiteSeq.inputs.json',
   '-l',
   '/Users/sohailn/scing/bin/sharp-0.1.1/configs/SV-1723_SV_LN1_CiteSeq.labels.json',
   '-o',
   '/Users/sohailn/scing/bin/sharp-0.1.1/Sharp.options.aws.json'],
  'returncode': 0,
  'stdout': '{"id":"9f17b923-4124-422c-ab30-35c1d30e3261","status":"Submitted"}\n',
  'stderr': ''},
 {'args': ['/Users/sohailn/scing/bin/sharp-0.1.1/submit-citeseq.sh',
   '-k',
   '/Users/sohailn/.cromwell/cromwell-secrets.json',
   '-i',
   '/Users/sohailn/scing/bin/sharp-0.1.1/configs/SV-1723_SV_LN2_CiteSeq.inputs.json',
   '-l',
   '/Users/sohailn/scing/bin/sharp-0.1.1/configs/SV-1723_SV_LN2_CiteSeq.labels.json',
   '-o',
   '/Users/sohailn/scing/bin/sharp-0.1.1/Sharp.options.aws.json'],
  'returncode': 0,
  'stdout': '{"id":"980049ce-7178-4e37-97ab-2dd2ef54a844","status":

In [97]:
labels['destination'].tolist()

['s3://dp-lab-data/collaborators/vardhans/scImmuneProfilingPbmcsNAcTreatedCovidPts/SV-1723_SV_LN1_HTO/CiteSeq-results',
 's3://dp-lab-data/collaborators/vardhans/scImmuneProfilingPbmcsNAcTreatedCovidPts/SV-1723_SV_LN2_HTO/CiteSeq-results',
 's3://dp-lab-data/collaborators/vardhans/scImmuneProfilingPbmcsNAcTreatedCovidPts/SV-1723_SV_LN3_HTO/CiteSeq-results']

In [114]:
labels['destination'].tolist()

['s3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN11_HTO/CiteSeq-results',
 's3://dp-lab-data/collaborators/vardhans/LocallyAdvancedGastricCancer/SV-1721_SV_LN12_HTO/CiteSeq-results']