In [1]:
import re, subprocess, boto3, json, shlex, mysql, os, urllib
import pandas as pd
import numpy as np
from s3path import S3Path
from pathlib import Path
from tqdm.notebook import tqdm
from packaging import version
pd.set_option("display.max_colwidth", 40)

In [16]:
from utils.utils import *

# Process Samples

## Setup

In [3]:
# Location of docker files
common_docker_registry = "quay.io/hisplan"

prefix = "ArchRCR" # Workflow to run; also .wdl filename prefix
pipeline_type = prefix # field in *.labels.json
output_dirname = "archr-cr-results"

# If need to add comment, put here
comment = ""

In [29]:
# Locations of workflow-related directories and files
path_to_cromwell_secrets = f"{Path.home()}/.cromwell/cromwell-secrets.json" # CHANGE THIS
workflow_dir = f"{Path.home()}/scing/bin/wdl-ArchR" # CHANGE THIS
execp = "submit-fastq.sh" # CHANGE THIS FOR SHARP
config_dir = f"{workflow_dir}/configs"
path_to_options = f"{workflow_dir}/ArchR.options.aws.json"

# Other file locations
db_credentials_path = f"{Path.home()}/.config.json" # CHANGE THIS

In [5]:
# Set credentials based on SCRIdb CLI config file
with open(db_credentials_path) as f:
    creds = json.load(f)

In [6]:
# Samples on which to run CellRangerATAC
# Note: Assumes data is transferred to AWS S3 (this should be an s3 location)
# Note: Assumes directory name is name of sample
common_dir = "s3://dp-lab-data/collaborators/lowe/p53_loss_enrichment"
samples = [
    "JR-1497_p489c_shRen_caer_48h_multiome_1",
    "JR-1497_p489c_shRen_caer_48h_multiome_2",
    "JR-1497_p489c_shRen_caer_48h_multiome_3",
    "JR-1497_p489c_shRen_caer_48h_multiome_4",
]
sample_paths = [
    f"{common_dir}/{sample}" for sample in samples
]

## Execution

In [19]:
# Get information for all samples
sample_paths = [s.strip('/') for s in sample_paths] # remove trailing slash if exists
sample_names = [os.path.basename(s) for s in sample_paths]
#sample_names = [s.replace("Redo_", "") for s in sample_names]
print(sample_names)
samples = pd.DataFrame(
    sample_paths,
    index=sample_names,
    columns=["S3_Path"],
    dtype=str,
)
samples["Sample_ID"] = pd.Series(samples.index).apply(
    lambda x: get_sample_id(x, creds['user'], creds['password'])
).values

# Get FASTQ paths from S3
# Note: Uses same FASTQ file ids for all samples
#fastq_file_ids = fastq_map[prefix]
samples["FASTQs"] = samples["S3_Path"].apply(
    lambda x: get_fastqs(x + "_ATAC", folder="FASTQ")
)

# Get reference genome location
samples["CR_Reference"] = samples["Sample_ID"].apply(
    lambda x: get_reference(x, "CellRanger", prefix, creds["user"], creds["password"])
)
samples["ArchR_Reference"] = samples["Sample_ID"].apply(
    lambda x: get_reference(x, "ArchR", prefix, creds["user"], creds["password"])
)

['JR-1497_p489c_shRen_caer_48h_multiome_1', 'JR-1497_p489c_shRen_caer_48h_multiome_2', 'JR-1497_p489c_shRen_caer_48h_multiome_3', 'JR-1497_p489c_shRen_caer_48h_multiome_4']


In [20]:
samples

Unnamed: 0,S3_Path,Sample_ID,FASTQs,CR_Reference,ArchR_Reference
JR-1497_p489c_shRen_caer_48h_multiome_1,s3://dp-lab-data/collaborators/lowe/...,3515,{'All': ['s3://dp-lab-data/collabora...,https://cf.10xgenomics.com/supp/cell...,mm10
JR-1497_p489c_shRen_caer_48h_multiome_2,s3://dp-lab-data/collaborators/lowe/...,3516,{'All': ['s3://dp-lab-data/collabora...,https://cf.10xgenomics.com/supp/cell...,mm10
JR-1497_p489c_shRen_caer_48h_multiome_3,s3://dp-lab-data/collaborators/lowe/...,3517,{'All': ['s3://dp-lab-data/collabora...,https://cf.10xgenomics.com/supp/cell...,mm10
JR-1497_p489c_shRen_caer_48h_multiome_4,s3://dp-lab-data/collaborators/lowe/...,3518,{'All': ['s3://dp-lab-data/collabora...,https://cf.10xgenomics.com/supp/cell...,mm10


In [21]:
# Load minimum inputs and labels fields from templates
with open(f"{config_dir}/template.fastq.inputs.json") as f:
    std_inputs_fields = list(json.load(f).keys())
    
with open(f"{config_dir}/template.fastq.labels.json") as f:
    std_labels_fields = list(json.load(f).keys())
    
# Annotate all samples with workflow inputs and labels
inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields,)
labels = pd.DataFrame(index=samples.index, columns=std_labels_fields,)

# Annotate inputs
inputs[f"{prefix}.sampleName"] = inputs.index # may need to change
inputs[f"{prefix}.fastqFiles"] = samples["FASTQs"].apply(lambda x: np.ravel(list(x.values())))
inputs[f"{prefix}.fastqNames"] = inputs[f"{prefix}.fastqFiles"].apply(lambda x: get_fastqs_name(x))
inputs[f"{prefix}.genomeCellRanger"] = samples["CR_Reference"].apply(
    lambda x: {
        "name": re.match(r'.*/refdata-cellranger-[a-z]+-(.*).tar.gz$', x)[1],
        "location": x,
    }
)
inputs[f"{prefix}.genomeArchR"] = samples["ArchR_Reference"]
inputs[f"{prefix}.numCores"] = 1
inputs[f"{prefix}.dockerRegistry"] = common_docker_registry

# Annotate labels
labels["pipelineType"] = "CellRangerARC"
labels["project"] = samples["Sample_ID"].apply(lambda x: get_project_id(x, creds["user"], creds["password"]))
labels["sample"] = labels.index
labels["owner"] = creds["user"]
labels["destination"] = samples['S3_Path'] + "/" + output_dirname
labels["transfer"] = "-"
labels["comment"] = creds["user"]

assert (std_inputs_fields == list(inputs.columns)) & (inputs.notna().values.all())
assert (std_labels_fields == list(labels.columns)) & (labels.notna().values.all())

In [22]:
inputs

Unnamed: 0,ArchRCR.sampleName,ArchRCR.fastqNames,ArchRCR.fastqFiles,ArchRCR.genomeCellRanger,ArchRCR.genomeArchR,ArchRCR.numCores,ArchRCR.dockerRegistry
JR-1497_p489c_shRen_caer_48h_multiome_1,JR-1497_p489c_shRen_caer_48h_multiome_1,3519_JR-1497_p489c_shRen_caer_48h_mu...,[s3://dp-lab-data/collaborators/lowe...,"{'name': 'mm10-2020-A-2.0.0', 'locat...",mm10,1,quay.io/hisplan
JR-1497_p489c_shRen_caer_48h_multiome_2,JR-1497_p489c_shRen_caer_48h_multiome_2,3520_JR-1497_p489c_shRen_caer_48h_mu...,[s3://dp-lab-data/collaborators/lowe...,"{'name': 'mm10-2020-A-2.0.0', 'locat...",mm10,1,quay.io/hisplan
JR-1497_p489c_shRen_caer_48h_multiome_3,JR-1497_p489c_shRen_caer_48h_multiome_3,3521_JR-1497_p489c_shRen_caer_48h_mu...,[s3://dp-lab-data/collaborators/lowe...,"{'name': 'mm10-2020-A-2.0.0', 'locat...",mm10,1,quay.io/hisplan
JR-1497_p489c_shRen_caer_48h_multiome_4,JR-1497_p489c_shRen_caer_48h_multiome_4,3522_JR-1497_p489c_shRen_caer_48h_mu...,[s3://dp-lab-data/collaborators/lowe...,"{'name': 'mm10-2020-A-2.0.0', 'locat...",mm10,1,quay.io/hisplan


In [23]:
labels

Unnamed: 0,pipelineType,project,sample,owner,destination,transfer,comment
JR-1497_p489c_shRen_caer_48h_multiome_1,CellRangerARC,p53_loss_enrichment,JR-1497_p489c_shRen_caer_48h_multiome_1,moormana,s3://dp-lab-data/collaborators/lowe/...,-,moormana
JR-1497_p489c_shRen_caer_48h_multiome_2,CellRangerARC,p53_loss_enrichment,JR-1497_p489c_shRen_caer_48h_multiome_2,moormana,s3://dp-lab-data/collaborators/lowe/...,-,moormana
JR-1497_p489c_shRen_caer_48h_multiome_3,CellRangerARC,p53_loss_enrichment,JR-1497_p489c_shRen_caer_48h_multiome_3,moormana,s3://dp-lab-data/collaborators/lowe/...,-,moormana
JR-1497_p489c_shRen_caer_48h_multiome_4,CellRangerARC,p53_loss_enrichment,JR-1497_p489c_shRen_caer_48h_multiome_4,moormana,s3://dp-lab-data/collaborators/lowe/...,-,moormana


In [30]:
stdouts = [] # to store all outputs
process = True

with tqdm(samples.index) as t:

    for sample_name in t:

        # Write inputs and labels to file
        path_to_inputs = f"{config_dir}/{sample_name}.inputs.json"
        with open(path_to_inputs, "w") as f_inputs:
            json.dump(inputs.loc[sample_name].to_dict(), f_inputs, indent=4, cls=NpEncoder)

        path_to_labels = f"{config_dir}/{sample_name}.labels.json"
        with open(path_to_labels, "w") as f_labels:
            json.dump(labels.loc[sample_name].to_dict(), f_labels, indent=4, cls=NpEncoder)

        if process:
            stdouts.append(run(
                workflow_path = workflow_dir,
                execp = execp,
                secrets = path_to_cromwell_secrets,
                inputs = path_to_inputs,
                labels = path_to_labels,
                options = path_to_options,
            ))

  0%|          | 0/4 [00:00<?, ?it/s]

In [31]:
stdouts

[{'args': ['/Users/moormana/scing/bin/wdl-ArchR/submit-fastq.sh',
   '-k',
   '/Users/moormana/.cromwell/cromwell-secrets.json',
   '-i',
   '/Users/moormana/scing/bin/wdl-ArchR/configs/JR-1497_p489c_shRen_caer_48h_multiome_1.inputs.json',
   '-l',
   '/Users/moormana/scing/bin/wdl-ArchR/configs/JR-1497_p489c_shRen_caer_48h_multiome_1.labels.json',
   '-o',
   '/Users/moormana/scing/bin/wdl-ArchR/ArchR.options.aws.json'],
  'returncode': 0,
  'stdout': '{"id":"ee2b98b7-dc08-4818-b634-e77135c23672","status":"Submitted"}\n',
  'stderr': ''},
 {'args': ['/Users/moormana/scing/bin/wdl-ArchR/submit-fastq.sh',
   '-k',
   '/Users/moormana/.cromwell/cromwell-secrets.json',
   '-i',
   '/Users/moormana/scing/bin/wdl-ArchR/configs/JR-1497_p489c_shRen_caer_48h_multiome_2.inputs.json',
   '-l',
   '/Users/moormana/scing/bin/wdl-ArchR/configs/JR-1497_p489c_shRen_caer_48h_multiome_2.labels.json',
   '-o',
   '/Users/moormana/scing/bin/wdl-ArchR/ArchR.options.aws.json'],
  'returncode': 0,
  'stdout