In [8]:
import re, subprocess, boto3, json, shlex, mysql, os, urllib, logging
import pandas as pd
import numpy as np
from s3path import S3Path
from pathlib import Path
from tqdm.notebook import tqdm
from packaging import version
# Cap displayed column width at 40 characters so long values (e.g. S3 paths) are truncated in rendered tables
pd.set_option("display.max_colwidth", 40)

In [9]:
from utils.utils import *

# Process Samples

## Setup

In [10]:
# Workflow selection. `prefix` names the workflow to run and is also the
# .wdl filename prefix; the same value is recorded as the pipeline type
# in *.labels.json.
prefix = pipeline_type = "SeqcAda"

# Registry that hosts the docker images
common_docker_registry = "quay.io/hisplan"

# Sub-directory appended to each sample's S3 path to form the destination
# (empty string: no extra sub-directory name)
output_dirname = ""

# Optional free-text comment; leave empty if there is nothing to add
comment = ""

In [11]:
# Workflow checkout and the files inside it (all rooted at the user's home directory)
workflow_dir = str(Path.home()) + "/scing/bin/seqc-ada-0.0.3" # CHANGE THIS
path_to_exec = "/".join([workflow_dir, "submit.sh"]) # CHANGE THIS FOR SHARP
config_dir = "/".join([workflow_dir, "configs"])
path_to_options = "{}/{}.options.aws.json".format(workflow_dir, prefix)

# Cromwell secrets file
path_to_cromwell_secrets = str(Path.home()) + "/.cromwell/cromwell-secrets.json" # CHANGE THIS

# Database credentials file (JSON)
db_credentials_path = str(Path.home()) + "/.config.json" # CHANGE THIS

In [12]:
# Read the SCRIdb CLI config file (JSON) into `creds`;
# later cells look up creds["user"] and creds["password"]
creds = json.loads(Path(db_credentials_path).read_text())

In [25]:
# Samples to process with the selected workflow.
# Assumptions:
#   - the data has already been transferred to AWS S3 (common_dir is an s3:// location)
#   - the last directory name of each entry is the sample name
common_dir = "s3://dp-lab-data/collaborators/VanDenBrink"
samples = ['Thymic_regeneration/231_no_XRT_rep']

# Full S3 location of every sample: <common_dir>/<sample>
sample_paths = ["/".join((common_dir, relative_dir)) for relative_dir in samples]

In [26]:
# Reference archive handed to the workflow.
# Note: This is an exceptional case - the reference is set by hand here.
# Alternative (transgene reference on S3), kept for convenience:
# path_to_reference = f"{common_dir}/transgene_reference/refdata-cellranger/3PS19_SNSEQ-GRCm38-Ensembl-87-transgenes.tar.gz"
path_to_reference = (
    "https://cf.10xgenomics.com/supp/cell-exp/"
    "refdata-gex-mm10-2020-A.tar.gz"
)

## Execution

In [27]:
# NOTE(review): the saved output below is a DataFrame, but `samples` only becomes a
# DataFrame in the next cell; on a fresh top-to-bottom run this shows the plain list
# defined in the sample-selection cell.
samples

Unnamed: 0,S3_Path,Sample_ID,FASTQs
231_no_XRT_rep,s3://dp-lab-data/collaborators/VanDe...,2070,{'All': ['s3://dp-lab-data/collabora...


In [29]:
# Build the per-sample table: S3 location, database sample id, FASTQ paths
# and reference location, indexed by sample name.

# Remove trailing slashes only (rstrip, not strip) so that the last path
# component can be taken as the sample name without touching the path start.
sample_paths = [s.rstrip('/') for s in sample_paths]
sample_names = [os.path.basename(s) for s in sample_paths]

print(sample_names)
samples = pd.DataFrame(
    sample_paths,
    index=sample_names,
    columns=["S3_Path"],
    dtype=str,
)

# Look up each sample's id by name, using the database credentials
samples["Sample_ID"] = pd.Series(samples.index).apply(
    lambda x: get_sample_id(x, creds['user'], creds['password'])
).values

# Get FASTQ paths from S3
# Note: every sample is queried with the same folder="barcode" argument
samples["FASTQs"] = samples["S3_Path"].apply(lambda x: get_fastqs(x, folder="barcode"))

# Get reference genome location
# The inputs cell reads samples["Reference"], so the column must exist.
# Every sample gets the manually chosen `path_to_reference`; to look the
# reference up per sample instead, swap in the commented line below.
#samples["Reference"] = samples["Sample_ID"].apply(lambda x: get_cr_reference(x, prefix, creds["user"], creds["password"]))
samples["Reference"] = path_to_reference

['231_no_XRT_rep']


In [145]:
# Build the workflow `inputs` and `labels` tables (one row per sample).
# The expected column sets come from the template JSON files in config_dir;
# the assertions at the end verify that exactly those columns are present
# and that every value has been filled in.

# Load minimum inputs and labels fields from templates
with open(f"{config_dir}/template.inputs.json") as f:
    std_inputs_fields = list(json.load(f).keys())

with open(f"{config_dir}/template.labels.json") as f:
    std_labels_fields = list(json.load(f).keys())

# Start from empty frames that carry the template columns
inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields)
labels = pd.DataFrame(index=samples.index, columns=std_labels_fields)

# Annotate inputs
inputs[f"{prefix}.sampleName"] = inputs.index # may need to change
# Flatten the FASTQ mapping's values into one flat array of paths per sample
inputs[f"{prefix}.inputFastq"] = samples["FASTQs"].apply(lambda x: np.ravel(list(x.values())))
inputs[f"{prefix}.fastqName"] = inputs[f"{prefix}.inputFastq"].apply(lambda x: get_fastqs_name(x))
# Use the per-sample reference when the samples table has one; otherwise
# fall back to the manually configured `path_to_reference` instead of
# failing with a KeyError on the missing column.
inputs[f"{prefix}.referenceUrl"] = (
    samples["Reference"] if "Reference" in samples.columns else path_to_reference
)
inputs[f"{prefix}.includeIntrons"] = False
inputs[f"{prefix}.expectCells"] = 5000
inputs[f"{prefix}.memory"] = 256
inputs[f"{prefix}.dockerRegistry"] = common_docker_registry

# Annotate labels
labels["pipelineType"] = pipeline_type
labels["project"] = samples["Sample_ID"].apply(lambda x: get_project_id(x, creds["user"], creds["password"]))
labels["sample"] = labels.index
labels["owner"] = creds["user"]
labels["destination"] = samples['S3_Path'] + "/" + output_dirname
labels["transfer"] = "-"
# Honour the `comment` setting from the setup cell; when it is empty the
# label falls back to the user name, as before.
labels["comment"] = comment or creds["user"]

# Sanity checks: template columns only, in template order, and no missing values
assert std_inputs_fields == list(inputs.columns), "inputs columns differ from template.inputs.json"
assert inputs.notna().values.all(), "some workflow inputs are missing"
assert std_labels_fields == list(labels.columns), "labels columns differ from template.labels.json"
assert labels.notna().values.all(), "some workflow labels are missing"

In [146]:
# NOTE(review): the saved output below has "CellRangerGex.*" columns, i.e. it appears
# to come from a run with a different `prefix` than the one configured in this notebook.
inputs

Unnamed: 0,CellRangerGex.sampleName,CellRangerGex.fastqName,CellRangerGex.inputFastq,CellRangerGex.referenceUrl,CellRangerGex.includeIntrons,CellRangerGex.expectCells,CellRangerGex.memory,CellRangerGex.dockerRegistry
SU-1358_C10_T2_on_treatment,SU-1358_C10_T2_on_treatment,3447_SU-1358_C10_T2_on_treatment_IGO...,[s3://dp-lab-data/sc-seq/Project_124...,https://cf.10xgenomics.com/supp/cell...,False,5000,256,quay.io/hisplan


In [149]:
# NOTE(review): the saved output below shows pipelineType "CellRangerGex", i.e. it appears
# to come from a run with a different `prefix` than the one configured in this notebook.
labels

Unnamed: 0,pipelineType,project,sample,owner,destination,transfer,comment
SU-1358_C10_T2_on_treatment,CellRangerGex,POLAR,SU-1358_C10_T2_on_treatment,moormana,s3://dp-lab-data/sc-seq/Project_1243...,-,moormana


In [150]:
stdouts = []    # one entry per submitted sample, in submission order
process = True  # when False, only the JSON files are written and nothing is submitted

with tqdm(samples.index) as t:

    for sample_name in t:

        # Per-sample JSON files live next to the templates in config_dir
        path_to_inputs = f"{config_dir}/{sample_name}.inputs.json"
        path_to_labels = f"{config_dir}/{sample_name}.labels.json"

        # Write the sample's row of each table to its file (inputs first, then labels)
        for json_path, table in ((path_to_inputs, inputs), (path_to_labels, labels)):
            with open(json_path, "w") as handle:
                json.dump(table.loc[sample_name].to_dict(), handle, indent=4, cls=NpEncoder)

        if not process:
            continue

        # Submit the workflow for this sample and keep whatever `run` returns
        submission = run(
            workflow_path=workflow_dir,
            execp="submit.sh",
            secrets=path_to_cromwell_secrets,
            inputs=path_to_inputs,
            labels=path_to_labels,
            options=path_to_options,
        )
        stdouts.append(submission)

  0%|          | 0/1 [00:00<?, ?it/s]

In [151]:
# Inspect the collected submission results.
# NOTE(review): the saved output below references a cellranger-gex-6.1.2 directory, not the
# `workflow_dir` configured in this notebook, so it appears to come from an earlier run.
stdouts

[{'args': ['/Users/moormana/scing/bin/cellranger-gex-6.1.2/submit.sh',
   '-k',
   '/Users/moormana/.cromwell/cromwell-secrets.json',
   '-i',
   '/Users/moormana/scing/bin/cellranger-gex-6.1.2/configs/SU-1358_C10_T2_on_treatment.inputs.json',
   '-l',
   '/Users/moormana/scing/bin/cellranger-gex-6.1.2/configs/SU-1358_C10_T2_on_treatment.labels.json',
   '-o',
   '/Users/moormana/scing/bin/cellranger-gex-6.1.2/CellRangerGex.options.aws.json'],
  'returncode': 0,
  'stdout': '{"id":"1931681b-646b-4ba9-ac7e-01816b9d30cc","status":"Submitted"}\n',
  'stderr': ''}]