In [1]:
import re, subprocess, boto3, json, shlex, mysql, os
import pandas as pd
from s3path import S3Path
from pathlib import Path
from tqdm.notebook import tqdm

# Define Helper Functions

In [2]:
# from SCRIdb
def get_s3_objects(bucket, key, pattern, full_uri=False):
    """List the object keys under an S3 prefix that match a pattern.

    Parameters
    ----------
    bucket : str
        Name of the S3 bucket to list.
    key : str
        Prefix handed to the bucket's object filter.
    pattern : object with a ``search`` method (e.g. a compiled ``re`` pattern)
        A key is kept when ``pattern.search(key)`` returns a truthy hit.
    full_uri : bool, optional
        When true, every hit is returned as ``s3://{bucket}/{key}`` instead
        of the bare object key.

    Returns
    -------
    list of str
        Matching keys (or URIs) in the order the listing yields them.
    """
    listing = boto3.resource("s3").Bucket(bucket).objects.filter(Prefix=key)
    matching_keys = [entry.key for entry in listing if pattern.search(entry.key)]
    if not full_uri:
        return matching_keys
    return [f"s3://{bucket}/{found}" for found in matching_keys]

In [3]:
def get_reference(species):
    """Return the download URL of the Cell Ranger ARC reference for a species.

    The species value is tested with ``in``, so any value containing
    "Human" or "Mouse" is accepted; "Human" is checked first.

    Raises
    ------
    ValueError
        If neither "Human" nor "Mouse" is contained in ``species``.
    """
    # Ordered (marker, url) pairs; the first marker found in `species` wins
    known_references = (
        (
            "Human",
            "https://cf.10xgenomics.com/supp/cell-arc/refdata-cellranger-arc-GRCh38-2020-A.tar.gz",
        ),
        (
            "Mouse",
            "https://cf.10xgenomics.com/supp/cell-arc/refdata-cellranger-arc-mm10-2020-A-2.0.0.tar.gz",
        ),
    )
    for marker, url in known_references:
        if marker in species:
            return url
    raise ValueError(f"Unknown Species: {species}")

In [4]:
def execute_query(query, user, password, params=None):
    """Run one SQL statement against the lab database and return all rows.

    Parameters
    ----------
    query : str
        SQL text to execute. Use ``%s`` placeholders together with ``params``
        instead of formatting values into the string.
    user, password : str
        Database credentials.
    params : sequence or dict, optional
        Values bound to the placeholders in ``query``. When omitted the
        statement is executed exactly as given (the previous behaviour).

    Returns
    -------
    list
        Whatever ``cursor.fetchall()`` returns for the statement.

    Raises
    ------
    mysql.connector.Error
        Propagated unchanged from the connection or the cursor.
    """
    # Imported here so the helper does not rely on a later notebook cell
    # having already placed `connect` in the global namespace.
    from mysql.connector import connect

    with connect(
        host="peer-lab-db.cggxmlwgzzpw.us-east-1.rds.amazonaws.com",
        database="peer_lab_db",
        user=user,
        password=password,
    ) as connection:
        with connection.cursor(buffered=True) as cursor:
            if params is None:
                cursor.execute(query)
            else:
                cursor.execute(query, params)
            result = cursor.fetchall()
    return result

In [5]:
# Get species from database for given sample
from mysql.connector import connect, Error

def get_species(sample_id, user, password):
    """Look up the species recorded for a sample.

    The species is reached through the sample's genome index
    (sample_data -> genome_index -> species).

    Parameters
    ----------
    sample_id : int or str
        Numeric id of the row in ``peer_lab_db.sample_data``.
    user, password : str
        Database credentials forwarded to ``execute_query``.

    Returns
    -------
    The ``Species`` value of the first matching row, or ``None`` when the
    database reports an error (the error is printed, not raised).

    Raises
    ------
    ValueError
        If ``sample_id`` is not an integer value.
    IndexError
        If the query succeeds but returns no row for the sample.
    """
    # Only a plain integer may be interpolated into the SQL text below
    sample_id = int(sample_id)

    table_sample_data = "peer_lab_db.sample_data"
    table_species = "peer_lab_db.species"
    table_genome_idx = "peer_lab_db.genome_index"
    query = f"""
        SELECT {table_species}.Species
        FROM {table_species}
        LEFT JOIN {table_genome_idx}
        ON {table_species}.id = {table_genome_idx}.species_id
        LEFT JOIN {table_sample_data}
        ON {table_genome_idx}.id = {table_sample_data}.genomeIndex_id
        WHERE {table_sample_data}.id = {sample_id}
        """
    try:
        rows = execute_query(query, user, password)
    except Error as e:
        # Best effort: report the database error and signal failure with None
        print(f"Error: {e}")
        return None

    if not rows:
        # Same exception type as indexing an empty result, but with context
        raise IndexError(f"No species found for sample id {sample_id}")
    return rows[0][0]

In [6]:
# Get project name from database for given sample
from mysql.connector import connect, Error

def get_project_id(sample_id, user, password):
    """Look up the project a sample belongs to.

    Despite the function name, the value returned is the ``projectName``
    column of ``peer_lab_db.project_data``.

    Parameters
    ----------
    sample_id : int or str
        Numeric id of the row in ``peer_lab_db.sample_data``.
    user, password : str
        Database credentials forwarded to ``execute_query``.

    Returns
    -------
    The ``projectName`` value of the first matching row, or ``None`` when
    the database reports an error (the error is printed, not raised).

    Raises
    ------
    ValueError
        If ``sample_id`` is not an integer value.
    IndexError
        If the query succeeds but returns no row for the sample.
    """
    # Only a plain integer may be interpolated into the SQL text below
    sample_id = int(sample_id)

    table_sample_data = "peer_lab_db.sample_data"
    table_project_data = "peer_lab_db.project_data"
    query = f"""
        SELECT {table_project_data}.projectName
        FROM {table_project_data}
        LEFT JOIN {table_sample_data}
        ON {table_project_data}.id = {table_sample_data}.projectData_id
        WHERE {table_sample_data}.id = {sample_id}
        """
    try:
        rows = execute_query(query, user, password)
    except Error as e:
        # Best effort: report the database error and signal failure with None
        print(f"Error: {e}")
        return None

    if not rows:
        # Same exception type as indexing an empty result, but with context
        raise IndexError(f"No project found for sample id {sample_id}")
    return rows[0][0]

In [7]:
def run(
    workflow_path: str,
    execp: str,
    secrets: str,
    inputs: str,
    labels: str,
    options: str,
) -> dict:
    """Launch the workflow submit script and capture its result.

    The script ``{workflow_path}/{execp}`` is executed with ``workflow_path``
    as its working directory and the arguments
    ``-k secrets -i inputs -l labels -o options``.

    Parameters
    ----------
    workflow_path : str
        Directory of the pipeline package; the script runs from here.
    execp : str
        File name of the submit script inside ``workflow_path``.
    secrets, inputs, labels, options : str
        Paths passed to the script's ``-k``, ``-i``, ``-l`` and ``-o`` options.

    Returns
    -------
    dict
        The attributes of the ``subprocess.CompletedProcess``:
        ``args``, ``returncode``, ``stdout`` and ``stderr`` (text).

    Raises
    ------
    OSError
        If the script or the working directory cannot be used.
    """
    # Build argv directly: each value stays a single argument even when it
    # contains spaces or quotes (formatting one string and re-splitting it
    # would break such paths apart).
    cmd = [
        f"{workflow_path}/{execp}",
        "-k", str(secrets),
        "-i", str(inputs),
        "-l", str(labels),
        "-o", str(options),
    ]

    # `cwd=` sets the directory for the child process only, so the notebook's
    # own working directory is never changed, even when the launch fails.
    completed = subprocess.run(
        cmd,
        cwd=workflow_path,
        universal_newlines=True,
        capture_output=True,
    )
    return vars(completed)

# Process Samples

## Setup

In [8]:
# Location of docker files
# (registry string written to the workflow's "dockerRegistry" input)
common_docker_registry = "quay.io/hisplan"

# Workflow to run; also .wdl filename prefix
# `prefix` namespaces every workflow input key, e.g. "CellRangerArc.runID"
prefix = "CellRangerArc"
# Value written to the "pipelineType" label
pipeline_type = "CellRangerARC"
# Folder name appended to the GEX sample's S3 path to form the "destination" label
output_dirname = "cr-arc-results"

# Reads needed for each library relevant to this workflow
# (each tag is matched against FASTQ file names when listing S3 objects)
GEX_reads = ["I1", "I2", "R1", "R2"]
ATAC_reads = ["I1", "R1", "R2", "R3"]

# If a comment is needed for this run, put it here
comment = ""

In [9]:
# Locations of workflow-related directories and files
# Lines marked "CHANGE THIS" are specific to the local installation.
# Secrets file handed to the submit script with -k
path_to_cromwell_secrets = f"{Path.home()}/.cromwell/cromwell-secrets.json" # CHANGE THIS
# Pipeline package directory; the submit script is run from here
workflow_dir = f"{Path.home()}/scing/bin/cellranger-arc-2.0.0" # CHANGE THIS
# Full path of the submit script
path_to_exec = f"{workflow_dir}/submit.sh" # CHANGE THIS FOR SHARP
# Folder where the per-sample inputs/labels JSON files are written
config_dir = f"{workflow_dir}/configs"
# Options file handed to the submit script with -o
path_to_options = f"{workflow_dir}/{prefix}.options.aws.json"

# Other file locations
# JSON file read into `creds`; its "user" and "password" entries are used for the database
db_credentials_path = f"{Path.home()}/.config.json" # CHANGE THIS

In [10]:
# Set credentials based on SCRIdb CLI config file
# (the parsed JSON document is kept in `creds`)
creds = json.loads(Path(db_credentials_path).read_text())

In [16]:
# Sample table pasted as MySQL-style ASCII output: one row per library, and the
# ATAC and GEX libraries of one request share the same REQ_ID.
# TODO: replace this hard-coded string with the function which generated it
samples = '''
+-----------+--------------------------------+---------+------------+-------------------------------------------------------------------------------------------+
| SAMPLE_ID | SAMPLE_NAME                    | REQ_ID  | gIndex     | S3                                                                                        |
+-----------+--------------------------------+---------+------------+-------------------------------------------------------------------------------------------+
|      2748 | D11_MP150Cre_5_multiome_ATAC   | SZ-896  | NULL       | s3://dp-lab-data/collaborators/sawyers/OrgP53RbMultiomics/D11_MP150Cre_5_multiome_ATAC/   |
|      2556 | D11_MP150Cre_5_multiome        | SZ-896  | mm10-3.0.0 | s3://dp-lab-data/collaborators/sawyers/OrgP53RbMultiomics/D11_MP150Cre_5_multiome/        |
|      2749 | D14_MP150CRE_6_multiome_ATAC   | SZ-897  | NULL       | s3://dp-lab-data/collaborators/sawyers/OrgP53RbMultiomics/D14_MP150CRE_6_multiome_ATAC/   |
|      2564 | D14_MP150CRE_6_multiome        | SZ-897  | mm10-3.0.0 | s3://dp-lab-data/collaborators/sawyers/OrgP53RbMultiomics/D14_MP150CRE_6_multiome/        |
|      2750 | D18_MP150CRE_7_multiome_ATAC   | SZ-975  | NULL       | s3://dp-lab-data/collaborators/sawyers/OrgP53RbMultiomics/D18_MP150CRE_7_multiome_ATAC/   |
|      2570 | D18_MP150CRE_7_multiome        | SZ-975  | mm10-3.0.0 | s3://dp-lab-data/collaborators/sawyers/OrgP53RbMultiomics/D18_MP150CRE_7_multiome/        |
|      2751 | D21_MP150CRE_8_multiome_ATAC   | SZ-976  | NULL       | s3://dp-lab-data/collaborators/sawyers/OrgP53RbMultiomics/D21_MP150CRE_8_multiome_ATAC/   |
|      2576 | D21_MP150CRE_8_multiome        | SZ-976  | mm10-3.0.0 | s3://dp-lab-data/collaborators/sawyers/OrgP53RbMultiomics/D21_MP150CRE_8_multiome/        |
+-----------+--------------------------------+---------+------------+-------------------------------------------------------------------------------------------+
'''

## Execution

In [12]:
# Convert formatted string to DataFrame
# Keep every non-empty line that is not a border line ("+-----+-----+").
# The pattern is a raw string so that "\+" reaches the regex engine as written
# instead of being an invalid string escape.
table_fmt = re.compile(r"(?!^$)(^(?!(\+-*)+\+$))")
rows = [line for line in samples.split('\n') if table_fmt.match(line)]
# Drop the outer pipes, split the cells, and trim the padding around each cell
data = [[item.strip() for item in row.strip("|").split("|")] for row in rows]
# First kept line is the header; the remaining lines are the records
df = pd.DataFrame(
    data = data[1:],
    columns = data[0]
)
df

Unnamed: 0,SAMPLE_ID,SAMPLE_NAME,REQ_ID,gIndex,S3
0,2748,D11_MP150Cre_5_multiome_ATAC,SZ-896,,s3://dp-lab-data/collaborators/sawyers/OrgP53R...
1,2556,D11_MP150Cre_5_multiome,SZ-896,mm10-3.0.0,s3://dp-lab-data/collaborators/sawyers/OrgP53R...
2,2749,D14_MP150CRE_6_multiome_ATAC,SZ-897,,s3://dp-lab-data/collaborators/sawyers/OrgP53R...
3,2564,D14_MP150CRE_6_multiome,SZ-897,mm10-3.0.0,s3://dp-lab-data/collaborators/sawyers/OrgP53R...
4,2750,D18_MP150CRE_7_multiome_ATAC,SZ-975,,s3://dp-lab-data/collaborators/sawyers/OrgP53R...
5,2570,D18_MP150CRE_7_multiome,SZ-975,mm10-3.0.0,s3://dp-lab-data/collaborators/sawyers/OrgP53R...
6,2751,D21_MP150CRE_8_multiome_ATAC,SZ-976,,s3://dp-lab-data/collaborators/sawyers/OrgP53R...
7,2576,D21_MP150CRE_8_multiome,SZ-976,mm10-3.0.0,s3://dp-lab-data/collaborators/sawyers/OrgP53R...


In [14]:
# Each Req ID is a collection of libraries for one project
# For every REQ_ID group this cell assembles the workflow inputs and labels,
# writes them as JSON files under `config_dir` and, only when `process` is
# True, submits the workflow through `run`.

stdouts = [] # to store all outputs
process = False # set to True to submit; False only writes the JSON files

# The inputs/labels files are written here, so make sure the folder exists
os.makedirs(config_dir, exist_ok=True)

with tqdm(df.groupby('REQ_ID')) as t:

    for name, g in t:

        # ********************
        # Modify per workflow

        is_atac = g["S3"].str.contains("ATAC") # 2 rows per group, each is either ATAC or GEX sample
        gex_path = S3Path.from_uri(g.loc[~is_atac, "S3"].iloc[0])
        atac_path = S3Path.from_uri(g.loc[is_atac, "S3"].iloc[0])
        fastq_data = zip(
            ['gex', 'atac'], # prefixes for FASTQ inputs, e.g. {prefix}FastqFiles
            [GEX_reads, ATAC_reads], # reads relevant to each library, defined above
            [gex_path, atac_path], # paths to FASTQ parent folder for each library
        )

        # ********************

        # Add inputs to dictionary
        inputs = dict()

        # Set Run ID to GEX sample name
        sample_name = g.loc[~is_atac, "SAMPLE_NAME"].iloc[0] # Also used in labels below
        inputs[f"{prefix}.runID"] = sample_name
        inputs[f"{prefix}.dockerRegistry"] = common_docker_registry

        # Get species from database to decide reference
        # Use the GEX row: the species query goes through the sample's genome
        # index, and the ATAC rows of the sample table have no gIndex (NULL).
        sample_id = g.loc[~is_atac, "SAMPLE_ID"].iloc[0]
        species = get_species(sample_id, creds["user"], creds["password"])
        if species is None:
            # get_species prints database errors and returns None
            raise RuntimeError(
                f"Could not determine species for sample {sample_id} ({name})"
            )
        ref = get_reference(species) # TODO: 'get_reference' should be replaced with a more comprehensive mapping
        inputs[f"{prefix}.reference"] = ref

        # For each library, assemble inputs
        for library, reads, s3_path in fastq_data:
            # Add FASTQ-related inputs
            fastq_key = f"{prefix}.{library}FastqFiles"
            inputs[fastq_key] = []

            bucket = s3_path.parts[1]
            key = '/'.join(s3_path.parts[2:])+"/"
            for r in reads:
                # Raw f-string: "\d" and the escaped dots reach the regex engine as written
                fastq_re = re.compile(rf"{r}_\d{{3}}\.fastq\.gz$")
                fastqs = get_s3_objects(bucket, key, fastq_re, full_uri=True)
                inputs[fastq_key] += fastqs

            if not inputs[fastq_key]:
                raise IndexError(
                    f"No FASTQ files found for the {library} library of {name} under s3://{bucket}/{key}"
                )

            # Note: FASTQ name is file name up to lane id (e.g. L001, L002, etc.)
            fastq_name_re = r".*/(.*)_S\d+_L\d{3}_[A-Za-z]\d_\d{3}\.fastq\.gz$"
            fastq_name_hit = re.match(fastq_name_re, inputs[fastq_key][0])
            if fastq_name_hit is None:
                raise ValueError(
                    f"Cannot derive the FASTQ name from {inputs[fastq_key][0]}"
                )
            inputs[f"{prefix}.{library}FastqName"] = fastq_name_hit[1]

        # Add labels to dictionary
        labels = dict()
        labels["pipelineType"] = pipeline_type
        labels["project"] = get_project_id(sample_id, creds["user"], creds["password"])
        labels["sample"] = sample_name
        labels["owner"] = creds["user"]
        labels["destination"] = str((gex_path/output_dirname).as_uri())
        labels["transfer"] = "-"
        # Use the configured comment when one is given; otherwise keep the
        # previous value (the user name) so the label is never empty
        labels["comment"] = comment or creds["user"]

        # Write inputs and labels to file
        path_to_inputs = f"{config_dir}/{sample_name}.inputs.json"
        with open(path_to_inputs, "w") as f_inputs:
            json.dump(inputs, f_inputs, indent=4)

        path_to_labels = f"{config_dir}/{sample_name}.labels.json"
        with open(path_to_labels, "w") as f_labels:
            json.dump(labels, f_labels, indent=4)

        if process:
            stdouts.append(run(
                workflow_path = workflow_dir,
                execp = "submit.sh",
                secrets = path_to_cromwell_secrets,
                inputs = path_to_inputs,
                labels = path_to_labels,
                options = path_to_options,
            ))

  0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
# Show the results collected from `run`; the list stays empty while `process` is False
stdouts

[]