In [68]:
import re, subprocess, boto3, json, shlex, mysql, os, urllib, logging, yaml
import pandas as pd
import numpy as np
from s3path import S3Path
from pathlib import Path
from tqdm.notebook import tqdm
from packaging import version
# Limit the displayed width of DataFrame column contents to 40 characters.
pd.set_option("display.max_colwidth", 40)

In [2]:
from utils.utils import *

# Process Samples

## Setup

In [101]:
# Workflow locations: SSH key pair, SEQC checkout, and the files inside it.
# Everything is anchored at the current user's home directory.
home_dir = Path.home()
path_to_keypair = f"{home_dir}/.ssh/moormana.pem" # CHANGE THIS
workflow_dir = f"{home_dir}/scing/bin/seqc-0.2.10" # CHANGE THIS
path_to_exec = workflow_dir + "/seqc_submit_mjobs.py"
config_dir = workflow_dir + "/config"
path_to_options = workflow_dir + "/config.sh"

# Other file locations
db_credentials_path = f"{home_dir}/.config.json" # CHANGE THIS

In [5]:
# Read the SCRIdb CLI config file (JSON) and keep its parsed contents in `creds`
with open(db_credentials_path) as credentials_file:
    creds = json.loads(credentials_file.read())

In [79]:
# Folder containing samples on which to run SEQC
# Note: Assumes data is transferred to AWS S3 (this should be an s3 location)
common_dir = "s3://dp-lab-data/collaborators/VanDenBrink/Thymic_regeneration"
# Drop any trailing slash so the project name and sample paths stay well-formed
common_dir = common_dir.rstrip("/")

# Project name is the last component of the S3 path
project = common_dir.split("/")[-1]

# List the folder's immediate children. An argument list avoids shell word
# splitting, and check=True surfaces a failed `aws` call instead of silently
# producing an empty sample list.
listing = subprocess.run(
    ["aws", "s3", "ls", f"{common_dir}/"],
    check=True,
    capture_output=True,
    text=True,
).stdout

# `aws s3 ls` reports sub-folders as "PRE <name>/" and plain objects as
# "<date> <time> <size> <key>". Only the sub-folders are samples, so parse the
# listing line by line; this also keeps folder names that contain spaces intact.
samples = [
    entry[len("PRE "):].strip().strip("/")
    for entry in (line.strip() for line in listing.splitlines())
    if entry.startswith("PRE ")
]
if not samples:
    raise ValueError(f"No sample folders found under {common_dir}/")

# Full S3 location of each sample folder, in the same order as `samples`
sample_paths = [f"{common_dir}/{sample}" for sample in samples]

In [106]:
"\n".join(samples)

'231_no_XRT_rep'

In [112]:
# Get AMI ID for appropriate version of SEQC
# NOTE(review): workflow_dir points at a seqc-0.2.10 checkout while the AMI
# selected here is for 0.2.9 -- confirm the mismatch is intended.
seqc_version = "0.2.9"

# Run the helper script and split its output into whitespace-separated tokens.
# check=True makes a failing script raise instead of yielding an empty listing.
ami_tokens = subprocess.run(
    ["bash", f"{workflow_dir}/show-ami-list.sh"],
    check=True,
    capture_output=True,
    text=True,
).stdout.split()

# Tokens are consumed in pairs: even positions are AMI IDs, odd positions are
# image names of the form "seqc-v<major>.<minor>.<patch>". The dots are escaped
# so they only match literal dots; names that do not follow the pattern are
# skipped rather than crashing on a failed match.
version_pattern = re.compile(r"seqc-v(\d+\.\d+\.\d+)")
ami_by_version = {}
for ami, image_name in zip(ami_tokens[0::2], ami_tokens[1::2]):
    version_match = version_pattern.match(image_name)
    if version_match:
        ami_by_version[version_match[1]] = ami

if seqc_version not in ami_by_version:
    raise KeyError(
        f"No AMI found for SEQC v{seqc_version}; "
        f"available versions: {sorted(ami_by_version)}"
    )
ami_id = ami_by_version[seqc_version]

In [113]:
# Show the AMI ID selected for `seqc_version` as this cell's output
ami_id

'ami-0c97def6c08694a9a'

## Execution

In [114]:
# Create YAML file for all SEQC jobs
jobs = {"jobs": []}

# One job per sample; job numbers are 1-based
for job_number, (sample, s3_path) in tqdm(
    enumerate(zip(samples, sample_paths), start=1),
    total=len(samples)
):
    # Job parameters for each sample
    job_params = {
        'job': job_number,
        'ami-id': ami_id,
        'platform': 'ten_x_v3',
        'user-tags': {'Job': job_number, 'Sample': sample, 'Project': project},
        'index': 's3://seqc-public/genomes/mm38_long_polya/',
        'barcode-files': 's3://seqc-public/barcodes/ten_x_v3/flat/',
        # Inputs are read from, and results uploaded to, the sample's own S3 folder
        'genomic-fastq': s3_path + "/genomic/",
        'barcode-fastq': s3_path + "/barcode/",
        'upload-prefix': s3_path + f"/seqc-{seqc_version}-results/",
        'output-prefix': sample,
        'min-poly-t': "0",
        'email': "andrewmoormanmskcc@gmail.com",
        'star-args': 'runRNGseed=0',
        'instance-type': 'r5.4xlarge',
    }

    # Add job to YAML
    jobs["jobs"].append(job_params)

# Save to file. The context manager closes (and therefore flushes) the file so
# the complete YAML is on disk before the submit script reads it.
job_file = f'{workflow_dir}/config/{project}.yaml'
with open(job_file, 'w') as yaml_file:
    yaml.dump(jobs, yaml_file)

  0%|          | 0/1 [00:00<?, ?it/s]

In [115]:
# Build the pipeline command to execute in Terminal. Each path is shell-quoted
# so the command can be pasted as-is even when a path contains spaces or other
# shell metacharacters; paths without such characters are left unchanged.
cmd = (
    f"python {shlex.quote(path_to_exec)}"
    f" --pem {shlex.quote(path_to_keypair)}"
    f" --config {shlex.quote(job_file)}"
)
cmd

'python /Users/moormana/scing/bin/seqc-0.2.10/seqc_submit_mjobs.py --pem /Users/moormana/.ssh/moormana.pem --config /Users/moormana/scing/bin/seqc-0.2.10/config/Thymic_regeneration.yaml'