In [1]:
import re, subprocess, boto3, json, shlex, mysql, os, urllib
import pandas as pd
import numpy as np
from s3path import S3Path
from pathlib import Path
from tqdm.notebook import tqdm
from packaging import version
pd.set_option("display.max_colwidth", 40)

In [2]:
from utils.utils import *

# Process Samples

## Setup

In [9]:
# Which FASTQ reads/indices each workflow needs, keyed by workflow name
fastq_map = dict(
    Hashtag=["R1", "R2"],
    CiteSeq=["R1", "R2"],
    AsapSeq=["R1", "R2", "R3"],
    CellRangerCellPlex=["R1", "R2"],
)

In [10]:
# Docker registry passed to the workflow as its "dockerRegistry" input
common_docker_registry = "quay.io/hisplan"

# Workflow to run. The same string is used as the .wdl filename prefix and
# as the pipeline type written to *.labels.json
prefix = pipeline_type = "CellRangerCellPlex"

# Name of the results folder created under each sample's S3 path
output_dirname = "cr-multi-results"

# Optional free-text comment for this run (leave empty if none)
comment = ""

In [11]:
# Locations of workflow-related directories and files.
# Everything is anchored at the current user's home directory; lines marked
# CHANGE THIS are machine-specific.
home_dir = Path.home()
path_to_cromwell_secrets = f"{home_dir}/.cromwell/cromwell-secrets.json" # CHANGE THIS
workflow_dir = f"{home_dir}/scing/bin/cellranger-cellplex-6.1.2" # CHANGE THIS
path_to_exec = "/".join([workflow_dir, "submit.sh"]) # CHANGE THIS FOR SHARP
config_dir = "/".join([workflow_dir, "configs"])
path_to_options = f"{workflow_dir}/{prefix}.options.aws.json"

# Other file locations
db_credentials_path = f"{home_dir}/.config.json" # CHANGE THIS
barcodes_path = f"{home_dir}/scing/data/barcodes" # CHANGE THIS

In [12]:
# Read the database credentials from the SCRIdb CLI config file (JSON).
# Later cells read the "user" and "password" entries.
creds = json.loads(Path(db_credentials_path).read_text())

In [22]:
# List the contents of the sample's directory on S3
!aws s3 ls s3://dp-lab-data/collaborators/priya/MemConsolidationVr/AT-2469_wt_control/

                           PRE barcode/
                           PRE genomic/
                           PRE seqc-results/


In [16]:
# Samples on which to run the workflow selected by `prefix`
# Note: the data must already be on AWS S3 (common_dir is an s3:// location)
# Note: each sample's directory name is taken to be the sample name
common_dir = "s3://dp-lab-data/collaborators/priya/MemConsolidationVr"
samples = [
    "AT-2469_wt_control",
]
sample_paths = ["/".join((common_dir, sample_dir)) for sample_dir in samples]

In [257]:
# Custom reference genome (only takes effect if a later cell assigns it to
# samples["Reference"])
path_to_custom_ref = (
    "https://dp-lab-data.s3.amazonaws.com/collaborators/joynera/"
    "EctopicActivationHhSignaling/refdata-cellranger/"
    "VE-ectopic-activation-of-HH-signaling-GRCm38-Ensembl-100-transgenes.tar.gz"
)

## Execution

In [28]:
# Build the per-sample table: S3 location, database sample id, FASTQ files
# and reference genome.
# rstrip (not strip) so only trailing slashes are removed; a leading "/" of an
# absolute path is preserved.
sample_paths = [s.rstrip('/') for s in sample_paths]
sample_names = [os.path.basename(s) for s in sample_paths] # directory name == sample name
#sample_names = [s.replace("Redo_", "") for s in sample_names]
samples = pd.DataFrame(
    sample_paths,
    index=sample_names,
    columns=["S3_Path"],
    dtype=str,
)
# Look up each sample's id by name, using the database credentials
samples["Sample_ID"] = pd.Series(samples.index).apply(
    lambda x: get_sample_id(x, creds['user'], creds['password'])
).values

# Get FASTQ paths from S3
# Note: Uses same FASTQ file ids for all samples
# NOTE(review): fastq_file_ids is not passed to get_fastqs below -- confirm
# whether it is still needed.
fastq_file_ids = fastq_map[prefix]
samples["FASTQs_GEX"] = samples["S3_Path"].apply(get_fastqs)
# The CellPlex FASTQs are looked up under "<S3_Path>_CPL", in its "FASTQ" folder
samples["FASTQs_CPL"] = samples["S3_Path"].apply(lambda x: get_fastqs(x + "_CPL", folder="FASTQ"))

# Get reference genome location
samples["Reference"] = samples["Sample_ID"].apply(lambda x: get_cr_reference(x, prefix, creds["user"], creds["password"]))
# Uncomment to override the looked-up reference with the custom one
#samples["Reference"] = path_to_custom_ref

In [30]:
# Fetch the CMO files for all samples.
# NOTE(review): the next cell iterates this as {sample name: (map, ref)} and
# calls .to_csv on each entry -- confirm against get_cmo_files.
cmo_files = get_cmo_files(samples, creds['user'], creds['password'])

In [31]:
# For every sample, write its CMO "map" and "ref" tables to local CSV files,
# upload them to the sample's results folder on S3, and record the S3
# locations in `samples` (columns CMO_map / CMO_ref).
for name, files in tqdm(cmo_files.items()):

    for file, ftype in zip(files, ['map', 'ref']):

        # Set file locations
        filename = f"{name}.cmo-{ftype}.csv"
        local_path = f"{barcodes_path}/{filename}"
        s3_path = f"{samples.loc[name, 'S3_Path']}/{output_dirname}/{filename}"

        # Write to file and upload
        file.to_csv(local_path, index=False) # Save to file
        # argv is passed as a list so paths containing whitespace are not
        # split into several arguments; check=True raises CalledProcessError
        # if the upload fails instead of silently recording a missing file.
        subprocess.run(
            ["aws", "s3", "cp", local_path, s3_path, "--quiet"],
            check=True,
        ) # Upload to S3
        samples.loc[name, f"CMO_{ftype}"] = s3_path # Modify samples

  0%|          | 0/1 [00:00<?, ?it/s]

In [32]:
# Load minimum inputs and labels fields from templates
# (the template keys define exactly which fields each JSON file must contain)
with open(f"{config_dir}/template.inputs.json") as f:
    std_inputs_fields = list(json.load(f).keys())

with open(f"{config_dir}/template.labels.json") as f:
    std_labels_fields = list(json.load(f).keys())

# Annotate all samples with workflow inputs and labels (one row per sample)
inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields)
labels = pd.DataFrame(index=samples.index, columns=std_labels_fields)

# Annotate inputs
inputs[f"{prefix}.runName"] = inputs.index # may need to change
# Flatten the FASTQ values of each sample into a single array of files
inputs[f"{prefix}.gexFastqFiles"] = samples["FASTQs_GEX"].apply(lambda x: np.ravel(list(x.values())))
inputs[f"{prefix}.gexFastqName"] = inputs[f"{prefix}.gexFastqFiles"].apply(get_fastqs_name)
inputs[f"{prefix}.muxFastqFiles"] = samples["FASTQs_CPL"].apply(lambda x: np.ravel(list(x.values())))
inputs[f"{prefix}.muxFastqName"] = inputs[f"{prefix}.muxFastqFiles"].apply(get_fastqs_name)
inputs[f"{prefix}.sampleCmoMap"] = samples["CMO_map"]
inputs[f"{prefix}.cmoReference"] = samples["CMO_ref"]
# Workflow parameters applied to every sample; adjust per experiment
inputs[f"{prefix}.minAssignmentConfidence"] = 0.9
inputs[f"{prefix}.includeIntrons"] = False
inputs[f"{prefix}.expectCells"] = 3000
inputs[f"{prefix}.reference"] = samples["Reference"]
inputs[f"{prefix}.dockerRegistry"] = common_docker_registry

# Annotate labels
labels["pipelineType"] = pipeline_type
labels["project"] = samples["Sample_ID"].apply(lambda x: get_project_id(x, creds["user"], creds["password"]))
labels["sample"] = labels.index
labels["owner"] = creds["user"]
labels["destination"] = samples['S3_Path'] + "/" + output_dirname
labels["transfer"] = "-"
# Use the comment from the setup cell when one was given; otherwise fall
# back to the user name, which is what was written before.
labels["comment"] = comment or creds["user"]

# Every template field must be present (no extra or missing columns) and filled
assert std_inputs_fields == list(inputs.columns), "inputs columns do not match template.inputs.json"
assert inputs.notna().values.all(), "some workflow inputs are missing"
assert std_labels_fields == list(labels.columns), "labels columns do not match template.labels.json"
assert labels.notna().values.all(), "some workflow labels are missing"

In [33]:
# Review the workflow inputs (one row per sample) before submitting
inputs

Unnamed: 0,CellRangerCellPlex.runName,CellRangerCellPlex.gexFastqName,CellRangerCellPlex.gexFastqFiles,CellRangerCellPlex.muxFastqName,CellRangerCellPlex.muxFastqFiles,CellRangerCellPlex.minAssignmentConfidence,CellRangerCellPlex.cmoReference,CellRangerCellPlex.sampleCmoMap,CellRangerCellPlex.reference,CellRangerCellPlex.includeIntrons,CellRangerCellPlex.expectCells,CellRangerCellPlex.dockerRegistry
AT-2469_wt_control,AT-2469_wt_control,5081_AT-2469_wt_control_IGO_12437_DM_23,[s3://dp-lab-data/collaborators/priy...,5081_AT-2469_wt_control_CPL_IGO_1243...,[s3://dp-lab-data/collaborators/priy...,0.9,s3://dp-lab-data/collaborators/priya...,s3://dp-lab-data/collaborators/priya...,https://cf.10xgenomics.com/supp/cell...,False,3000,quay.io/hisplan


In [34]:
# Review the workflow labels (one row per sample) before submitting
labels

Unnamed: 0,pipelineType,project,sample,owner,destination,transfer,comment
AT-2469_wt_control,CellRangerCellPlex,Memory consolidation VR,AT-2469_wt_control,moormana,s3://dp-lab-data/collaborators/priya...,-,moormana


In [35]:
stdouts = [] # collects the result of every submission
process = True # False: only write the JSON files, skip the submission

with tqdm(samples.index) as progress:

    for sample_name in progress:

        # Write this sample's inputs and labels to JSON files in config_dir
        path_to_inputs = f"{config_dir}/{sample_name}.inputs.json"
        path_to_labels = f"{config_dir}/{sample_name}.labels.json"
        for json_path, frame in ((path_to_inputs, inputs), (path_to_labels, labels)):
            with open(json_path, "w") as handle:
                json.dump(frame.loc[sample_name].to_dict(), handle, indent=4, cls=NpEncoder)

        if not process:
            continue

        # Submit the workflow for this sample and keep what `run` returns
        submission = run(
            workflow_path = workflow_dir,
            execp = "submit.sh",
            secrets = path_to_cromwell_secrets,
            inputs = path_to_inputs,
            labels = path_to_labels,
            options = path_to_options,
        )
        stdouts.append(submission)

  0%|          | 0/1 [00:00<?, ?it/s]

In [36]:
# Show the result recorded for each submission
stdouts

[{'args': ['/Users/moormana/scing/bin/cellranger-cellplex-6.1.2/submit.sh',
   '-k',
   '/Users/moormana/.cromwell/cromwell-secrets.json',
   '-i',
   '/Users/moormana/scing/bin/cellranger-cellplex-6.1.2/configs/AT-2469_wt_control.inputs.json',
   '-l',
   '/Users/moormana/scing/bin/cellranger-cellplex-6.1.2/configs/AT-2469_wt_control.labels.json',
   '-o',
   '/Users/moormana/scing/bin/cellranger-cellplex-6.1.2/CellRangerCellPlex.options.aws.json'],
  'returncode': 0,
  'stdout': '{"id":"c442cbce-0c4c-4e87-ab1f-dc58accde173","status":"Submitted"}\n',
  'stderr': ''}]