In [98]:
import re, subprocess, boto3, json, shlex, mysql, os, urllib, logging
import pandas as pd
import numpy as np
from s3path import S3Path
from pathlib import Path
from tqdm.notebook import tqdm
from packaging import version
# Cap displayed column width at 40 characters so long values are truncated when DataFrames are rendered
pd.set_option("display.max_colwidth", 40)

In [53]:
from utils.utils import *

# Define Helper Functions

In [32]:
# Workflow name -> FASTQ read/index file ids that the workflow consumes
fastq_map = dict(
    Hashtag=["R1", "R2"],
    CiteSeq=["R1", "R2"],
    AsapSeq=["R1", "R2", "R3"],
    CellRangerATAC=["I1", "R1", "R2", "R3"],
    CellRangerGex=["I1", "R1", "R2"],
    SpaceRanger=["I1", "I2", "R1", "R2"],
)

# Process Samples

## Setup

In [54]:
# Workflow to run. The same value serves as the .wdl filename prefix and as
# the pipeline type field written to *.labels.json
prefix = pipeline_type = "SpaceRanger"

# Registry hosting the docker images
common_docker_registry = "quay.io/hisplan"

# Name of the results directory
output_dirname = "sr-results"

# Optional free-text comment; leave empty if there is nothing to add
comment = ""

In [48]:
# Workflow installation and the files that live inside it
workflow_dir = "/".join((str(Path.home()), "scing", "bin", "spaceranger-1.3.1")) # CHANGE THIS
path_to_exec = workflow_dir + "/submit.sh" # CHANGE THIS FOR SHARP
config_dir = workflow_dir + "/configs"
path_to_options = "/".join((workflow_dir, f"{prefix}.options.aws.json"))

# Cromwell secrets used when submitting jobs
path_to_cromwell_secrets = str(Path.home()) + "/.cromwell/cromwell-secrets.json" # CHANGE THIS

# Database credentials file
db_credentials_path = str(Path.home()) + "/.config.json" # CHANGE THIS

In [49]:
# Read database credentials from the SCRIdb CLI config file (JSON)
with open(db_credentials_path) as credentials_file:
    creds = json.loads(credentials_file.read())

In [50]:
# Samples to process with the selected workflow
# Assumptions:
#   - the data has already been transferred to AWS S3 (common_dir is an s3 location)
#   - each sample lives in a directory named after the sample
common_dir = "s3://dp-lab-data/collaborators/whiter/VisiumPattonlab"
samples = [
    "MH-1505_AR1_V19B23-118_A1",
    "MH-1505_AR2_V19B23-118_B1",
    "MH-1505_AR3_V19B23-118_C1",
    "MH-1505_AR4_V19B23-118_D1",
]
sample_paths = ["/".join((common_dir, name)) for name in samples]

In [57]:
# URL of the transgene reference archive stored on S3
# Note: setting the reference by hand like this is an exceptional case
reference_uri = (
    "https://dp-lab-data.s3.amazonaws.com/"
    "collaborators/whiter/VisiumPattonlab/refdata/"
    "refdata-danio-rerio-20211005.tar.gz"
)

In [69]:
# Local directory holding the Visium images and the slide_data.csv file
path_to_slides = "/".join(
    (str(Path.home()), "scing", "data", "visium_images", "V19B23-118")
)

## Execution

In [101]:
# Build the per-sample table: S3 location, database sample ID, FASTQs, reference

# Remove trailing slashes only. A plain strip('/') would also eat leading
# slashes, which is not what the directory-name logic below expects.
sample_paths = [s.rstrip('/') for s in sample_paths]
# The directory name is taken to be the sample name
sample_names = [os.path.basename(s) for s in sample_paths]

# Sample names become the DataFrame index and are later used for per-sample
# lookups, so they must be unique.
assert len(set(sample_names)) == len(sample_names), f"Duplicate sample names: {sample_names}"

print(sample_names)
samples = pd.DataFrame(
    sample_paths,
    index=sample_names,
    columns=["S3_Path"],
    dtype=str,
)

# Resolve each sample name to its Sample_ID via get_sample_id (from utils),
# authenticating with the loaded credentials
samples["Sample_ID"] = pd.Series(samples.index).apply(
    lambda x: get_sample_id(x, creds['user'], creds['password'])
).values

# Get FASTQ paths from S3
# Note: Uses same FASTQ file ids for all samples
fastq_file_ids = fastq_map[prefix]
samples["FASTQs"] = samples["S3_Path"].apply(lambda x: get_fastqs(x, fastq_file_ids))

# Set reference explicitly (same reference for every sample)
samples["Reference"] = reference_uri

['MH-1505_AR1_V19B23-118_A1', 'MH-1505_AR2_V19B23-118_B1', 'MH-1505_AR3_V19B23-118_C1', 'MH-1505_AR4_V19B23-118_D1']


In [102]:
# Get slide information for all samples
# slide_data.csv is read with its first column as index; the lookups below
# key into it by Sample_ID.
slide_data = pd.read_csv(f"{path_to_slides}/slide_data.csv", index_col=0)

# Every image listed in the CSV must exist locally (list the directory once)
local_slide_files = set(os.listdir(path_to_slides))
missing_images = [image for image in slide_data["image"] if image not in local_slide_files]
assert not missing_images, f"Images missing from {path_to_slides}: {missing_images}"

# Image file name per sample, aligned to `samples` by Sample_ID
image_names = samples["Sample_ID"].map(slide_data["image"])
assert image_names.notna().all(), (
    f"No slide_data entry for samples: {list(image_names.index[image_names.isna()])}"
)

samples["Serial_Number"] = samples["Sample_ID"].map(slide_data["serial_number"])
samples["Area_ID"] = samples["Sample_ID"].map(slide_data["area_id"])
samples["Image_Path"] = samples["S3_Path"] + "/" + output_dirname + "/" + image_names

# Upload images to S3
# Pair each destination with the image mapped to the *same sample*. Zipping
# against slide_data["image"] directly would pair by CSV row order, which need
# not match the order (or number) of rows in `samples`.
for image, s3_path in tqdm(zip(image_names, samples["Image_Path"]), total=len(samples)):
    local_path = f"{path_to_slides}/{image}"
    # `aws s3 sync` operates on directories/prefixes; single files need `cp`
    s3_action = "sync" if os.path.isdir(local_path) else "cp"
    # Argument list (not a split string) keeps paths with spaces intact;
    # check=True surfaces failed uploads instead of ignoring them
    subprocess.run(["aws", "s3", s3_action, local_path, s3_path, "--quiet"], check=True)

0it [00:00, ?it/s]

In [103]:
# Display the per-sample table for visual inspection
samples

Unnamed: 0,S3_Path,Sample_ID,FASTQs,Reference,Serial_Number,Area_ID,Image_Path
MH-1505_AR1_V19B23-118_A1,s3://dp-lab-data/collaborators/white...,3507,{'I1': ['s3://dp-lab-data/collaborat...,https://dp-lab-data.s3.amazonaws.com...,V19B23,A1,s3://dp-lab-data/collaborators/white...
MH-1505_AR2_V19B23-118_B1,s3://dp-lab-data/collaborators/white...,3508,{'I1': ['s3://dp-lab-data/collaborat...,https://dp-lab-data.s3.amazonaws.com...,V19B23,B1,s3://dp-lab-data/collaborators/white...
MH-1505_AR3_V19B23-118_C1,s3://dp-lab-data/collaborators/white...,3509,{'I1': ['s3://dp-lab-data/collaborat...,https://dp-lab-data.s3.amazonaws.com...,V19B23,C1,s3://dp-lab-data/collaborators/white...
MH-1505_AR4_V19B23-118_D1,s3://dp-lab-data/collaborators/white...,3510,{'I1': ['s3://dp-lab-data/collaborat...,https://dp-lab-data.s3.amazonaws.com...,V19B23,D1,s3://dp-lab-data/collaborators/white...


In [105]:
# Load minimum inputs and labels fields from templates
with open(f"{config_dir}/template.inputs.json") as f:
    std_inputs_fields = list(json.load(f).keys())

with open(f"{config_dir}/template.labels.json") as f:
    std_labels_fields = list(json.load(f).keys())

# One row per sample, one column per template field (filled in below)
inputs = pd.DataFrame(index=samples.index, columns=std_inputs_fields)
labels = pd.DataFrame(index=samples.index, columns=std_labels_fields)

# Annotate inputs
inputs[f"{prefix}.sampleName"] = inputs.index # may need to change
# Flatten {read id: [uris]} into one flat array of URIs per sample. An explicit
# flatten (rather than np.ravel on nested lists) also works when the read types
# have different numbers of files.
inputs[f"{prefix}.inputFastq"] = samples["FASTQs"].apply(
    lambda fastqs_by_read: np.array(
        [uri for uris in fastqs_by_read.values() for uri in uris]
    )
)
inputs[f"{prefix}.fastqName"] = inputs[f"{prefix}.inputFastq"].apply(get_fastqs_name)
inputs[f"{prefix}.referenceUrl"] = samples["Reference"]
inputs[f"{prefix}.heImage"] = samples["Image_Path"]
inputs[f"{prefix}.slideSerialNumber"] = samples["Serial_Number"]
inputs[f"{prefix}.areaId"] = samples["Area_ID"]
inputs[f"{prefix}.reorientImages"] = True
inputs[f"{prefix}.dockerRegistry"] = common_docker_registry

# Annotate labels
labels["pipelineType"] = pipeline_type
labels["project"] = samples["Sample_ID"].apply(lambda x: get_project_id(x, creds["user"], creds["password"]))
labels["sample"] = labels.index
labels["owner"] = creds["user"]
labels["destination"] = samples["S3_Path"] + "/" + output_dirname
labels["transfer"] = "-"
# Use the configured comment when one was given; otherwise keep the previous
# behaviour of filling the field with the user name
labels["comment"] = comment if comment else creds["user"]

# The frames must contain exactly the template fields, in template order, and
# no field may be left unset
assert std_inputs_fields == list(inputs.columns), "inputs columns differ from template.inputs.json"
assert inputs.notna().values.all(), "some workflow inputs are unset"
assert std_labels_fields == list(labels.columns), "labels columns differ from template.labels.json"
assert labels.notna().values.all(), "some workflow labels are unset"

In [106]:
# Display the workflow inputs table for visual inspection
inputs

Unnamed: 0,SpaceRanger.sampleName,SpaceRanger.fastqName,SpaceRanger.inputFastq,SpaceRanger.referenceUrl,SpaceRanger.heImage,SpaceRanger.slideSerialNumber,SpaceRanger.areaId,SpaceRanger.reorientImages,SpaceRanger.dockerRegistry
MH-1505_AR1_V19B23-118_A1,MH-1505_AR1_V19B23-118_A1,3507_MH-1505_AR1_V19B23-118_A1_IGO_1...,[s3://dp-lab-data/collaborators/whit...,https://dp-lab-data.s3.amazonaws.com...,s3://dp-lab-data/collaborators/white...,V19B23,A1,True,quay.io/hisplan
MH-1505_AR2_V19B23-118_B1,MH-1505_AR2_V19B23-118_B1,3508_MH-1505_AR2_V19B23-118_B1_IGO_1...,[s3://dp-lab-data/collaborators/whit...,https://dp-lab-data.s3.amazonaws.com...,s3://dp-lab-data/collaborators/white...,V19B23,B1,True,quay.io/hisplan
MH-1505_AR3_V19B23-118_C1,MH-1505_AR3_V19B23-118_C1,3509_MH-1505_AR3_V19B23-118_C1_IGO_1...,[s3://dp-lab-data/collaborators/whit...,https://dp-lab-data.s3.amazonaws.com...,s3://dp-lab-data/collaborators/white...,V19B23,C1,True,quay.io/hisplan
MH-1505_AR4_V19B23-118_D1,MH-1505_AR4_V19B23-118_D1,3510_MH-1505_AR4_V19B23-118_D1_IGO_1...,[s3://dp-lab-data/collaborators/whit...,https://dp-lab-data.s3.amazonaws.com...,s3://dp-lab-data/collaborators/white...,V19B23,D1,True,quay.io/hisplan


In [107]:
# Display the workflow labels table for visual inspection
labels

Unnamed: 0,pipelineType,project,sample,owner,destination,transfer,comment
MH-1505_AR1_V19B23-118_A1,SpaceRanger,Visium_Pattonlab,MH-1505_AR1_V19B23-118_A1,moormana,s3://dp-lab-data/collaborators/white...,-,moormana
MH-1505_AR2_V19B23-118_B1,SpaceRanger,Visium_Pattonlab,MH-1505_AR2_V19B23-118_B1,moormana,s3://dp-lab-data/collaborators/white...,-,moormana
MH-1505_AR3_V19B23-118_C1,SpaceRanger,Visium_Pattonlab,MH-1505_AR3_V19B23-118_C1,moormana,s3://dp-lab-data/collaborators/white...,-,moormana
MH-1505_AR4_V19B23-118_D1,SpaceRanger,Visium_Pattonlab,MH-1505_AR4_V19B23-118_D1,moormana,s3://dp-lab-data/collaborators/white...,-,moormana


In [108]:
# Write per-sample inputs/labels JSON files and submit one job per sample
stdouts = [] # to store all outputs
process = True # set to False to only write the JSON files without submitting

# Submission script relative to workflow_dir, taken from the path_to_exec
# setting instead of a hard-coded name so that editing path_to_exec takes
# effect. With the default configuration this is "submit.sh".
# NOTE(review): assumes run() resolves execp relative to workflow_path - confirm in utils
exec_name = os.path.relpath(path_to_exec, workflow_dir)

with tqdm(samples.index) as t:

    for sample_name in t:

        # Write inputs and labels to file
        # NpEncoder (from utils) is the JSON encoder class handed to json.dump
        path_to_inputs = f"{config_dir}/{sample_name}.inputs.json"
        with open(path_to_inputs, "w") as f_inputs:
            json.dump(inputs.loc[sample_name].to_dict(), f_inputs, indent=4, cls=NpEncoder)

        path_to_labels = f"{config_dir}/{sample_name}.labels.json"
        with open(path_to_labels, "w") as f_labels:
            json.dump(labels.loc[sample_name].to_dict(), f_labels, indent=4, cls=NpEncoder)

        if process:
            # Collect whatever run() returns for each submission
            stdouts.append(run(
                workflow_path = workflow_dir,
                execp = exec_name,
                secrets = path_to_cromwell_secrets,
                inputs = path_to_inputs,
                labels = path_to_labels,
                options = path_to_options,
            ))

  0%|          | 0/4 [00:00<?, ?it/s]

In [109]:
# Display the collected submission results (one entry per submitted sample)
stdouts

[{'args': ['/Users/moormana/scing/bin/spaceranger-1.3.1/submit.sh',
   '-k',
   '/Users/moormana/.cromwell/cromwell-secrets.json',
   '-i',
   '/Users/moormana/scing/bin/spaceranger-1.3.1/configs/MH-1505_AR1_V19B23-118_A1.inputs.json',
   '-l',
   '/Users/moormana/scing/bin/spaceranger-1.3.1/configs/MH-1505_AR1_V19B23-118_A1.labels.json',
   '-o',
   '/Users/moormana/scing/bin/spaceranger-1.3.1/SpaceRanger.options.aws.json'],
  'returncode': 0,
  'stdout': '{"id":"c219c328-3ce0-443f-b086-39adbb2c3724","status":"Submitted"}\n',
  'stderr': ''},
 {'args': ['/Users/moormana/scing/bin/spaceranger-1.3.1/submit.sh',
   '-k',
   '/Users/moormana/.cromwell/cromwell-secrets.json',
   '-i',
   '/Users/moormana/scing/bin/spaceranger-1.3.1/configs/MH-1505_AR2_V19B23-118_B1.inputs.json',
   '-l',
   '/Users/moormana/scing/bin/spaceranger-1.3.1/configs/MH-1505_AR2_V19B23-118_B1.labels.json',
   '-o',
   '/Users/moormana/scing/bin/spaceranger-1.3.1/SpaceRanger.options.aws.json'],
  'returncode': 0,
 