In [6]:
import os
import sys
import glob
import shutil
import subprocess
from pathlib import Path

In [7]:
# global variables
# Every path below (and in the later cells) is built by plain string
# concatenation, so base_dir must be a str that ends with "/". A pathlib.Path
# cannot be combined with "+" and would raise TypeError on the next line.
base_dir = str(Path.cwd().parent)+"/"
seq_dir = base_dir+"seqdata/"
data_dir = base_dir+"data/"
soft_dir = base_dir+"soft/"
outdir = base_dir+"rna_starsolo.ensembl/"
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(outdir, exist_ok=True)

# Barcode whitelist file name (passed to STAR as --soloCBwhitelist further down).
barcodes_fname = seq_dir+"spatial_barcodes.only.txt"

# Reference genome sequence and annotation files, all expected under data_dir.
mmul10_fasta_fname = data_dir+"Macaca_mulatta.Mmul_10.dna.toplevel.fa"
mmul10_gff_fname = data_dir+"Macaca_mulatta.Mmul_10.110.gtf"
# NOTE(review): "resfeq" looks like a typo for "refseq"; the name is kept
# unchanged because other cells or notebooks may reference it.
mmul10_resfeq_gff_fname = data_dir+"Mmul_10_genomic.gtf"

num_threads = 24

In [8]:
# Load IPython's autoreload extension. Mode 1 reloads only the modules that
# were registered with %aimport, each time before code is executed.
%load_ext autoreload
%autoreload 1

# Put the project's helper scripts first on the import path. The path is
# relative, so it is resolved against the kernel's current working directory.
sys.path.insert(0, "../scripts/genomic_scripts/")
# Import `definitions` and mark it for automatic reloading.
%aimport definitions

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# open archives
# Decompress every archive found one level below seqdata/RNA/globus/.
# Note the pattern is "*gz", i.e. any file name ending in "gz".
for gz_fname in glob.glob(seq_dir+"RNA/globus/*/*gz"):
    returncode = subprocess.call(["gunzip",gz_fname])
    if returncode != 0:
        # Report the failure but keep going, so one bad archive does not
        # prevent the remaining ones from being unpacked.
        print("gunzip failed (exit code "+str(returncode)+"): "+gz_fname, file=sys.stderr)

In [12]:
from concurrent.futures import ThreadPoolExecutor

def execute_commands(commands):
    """Run a series of commands sequentially, echoing each one before it runs.

    Args:
        commands: iterable of commands. Each command is an iterable of
            arguments; every argument is converted with ``str`` so Path
            objects and numbers can be passed directly.

    Returns:
        list[int]: the exit code of each command, in execution order.
    """
    return_codes = []
    for cmd in commands:
        # Build the argument vector once: iterating `cmd` a second time would
        # yield nothing if it is a generator, and str() is not repeated.
        argv = [str(x) for x in cmd]
        print(" ".join(argv))
        return_codes.append(subprocess.call(argv))
    return return_codes

def run_parallel_commands(command_lists):
    """Run several command series concurrently, one worker thread per series.

    Each element of ``command_lists`` is handed to ``execute_commands``, which
    runs the commands of that series one after another.

    Args:
        command_lists: iterable of command series (each a list of commands).

    Returns:
        list: whatever ``execute_commands`` returned for each series, in the
        same order as ``command_lists``.

    Raises:
        Any exception raised inside a worker is re-raised here.
    """
    with ThreadPoolExecutor() as executor:
        # Executor.map is lazy about results: an exception raised in a worker
        # only surfaces when its value is retrieved. Draining the iterator
        # with list() makes failures visible instead of silently dropping them.
        return list(executor.map(execute_commands, command_lists))

In [None]:
# Run STARsolo once per sample directory under seqdata/RNA/globus/.
# A sample is processed only when both reads are present: the
# "*R2_processed.fastq" file and the matching "*R1_filtered.fastq" file.
for processed_fname in glob.glob(seq_dir+"RNA/globus/*/*processed.fastq"):
    filtered_fname = processed_fname.split("R2_processed.fastq")[0]+"R1_filtered.fastq"
    # The sample name is the directory that contains the FASTQ file.
    sample = processed_fname.rsplit("/",2)[-2]
    print(sample)
    if not os.path.exists(filtered_fname):
        continue

    os.makedirs(outdir+sample+"/", exist_ok=True)

    # Passed to --readFilesIn as (cdna_fname, umi_fname); the barcode and UMI
    # positions given via --soloCB*/--soloUMI* refer to umi_fname.
    umi_fname = processed_fname
    cdna_fname = filtered_fname

    barcodes_fname = seq_dir+"spatial_barcodes.only.txt"

    # splitext removes exactly the final ".gtf" extension. str.rstrip(".gtf")
    # must not be used for this: it strips any trailing run of the characters
    # '.', 'g', 't', 'f', which would eat into a base name ending in one of them.
    gtf_base = os.path.splitext(mmul10_gff_fname)[0]
    ANN=gtf_base+".mod.type.gtf"

    # Output folder and experiment name
    OUTPUT=outdir+sample

    REF=gtf_base+".mod.type.STAR"
    WL=barcodes_fname
    CBLEN=16
    UMILEN=10
    STRAND="Forward"

    # NOTE(review): the thread count is hard-coded to 20 although the
    # configuration cell defines num_threads — confirm which one is intended.
    star_cmd = ["/ccb/salz4-4/avaraby/Kristen.ATAC_RNA/soft/STAR/Linux_x86_64_static/STAR",
           "--runThreadN","20",
           "--genomeDir",REF,
           "--readFilesIn",cdna_fname,umi_fname,
           "--outFileNamePrefix",OUTPUT+"/",
           "--outTmpDir",OUTPUT+".tmp/",
           "--soloType","CB_UMI_Simple",
           "--soloCBwhitelist",WL,
           "--soloCBstart","1",
           "--soloCBlen",str(CBLEN),
           # The UMI starts immediately after the cell barcode.
           "--soloUMIstart",str(CBLEN+1),
           "--soloUMIlen",str(UMILEN),
           "--soloStrand",STRAND,
           "--soloUMIdedup","1MM_CR",
           "--soloCBmatchWLtype","1MM_multi_Nbase_pseudocounts",
           "--soloUMIfiltering","MultiGeneUMI_CR",
           "--soloCellFilter","EmptyDrops_CR",
           "--outFilterScoreMin","30",
           "--clipAdapterType","CellRanger4",
           "--soloFeatures","Gene","GeneFull",
           "--soloOutFileNames","solo","features.tsv","barcodes.tsv","matrix.mtx",
           "--soloMultiMappers","EM"]

    returncode = subprocess.call(star_cmd)
    if returncode != 0:
        # Make failed samples visible; the loop continues with the next sample.
        print("STAR failed for sample "+sample+" (exit code "+str(returncode)+")", file=sys.stderr)

In [None]:
# process missing samples
# Same STARsolo invocation as for the full sample set, restricted to the
# samples listed below.

for processed_fname in glob.glob(seq_dir+"RNA/globus/*/*processed.fastq"):
    filtered_fname = processed_fname.split("R2_processed.fastq")[0]+"R1_filtered.fastq"
    # The sample name is the directory that contains the FASTQ file.
    sample = processed_fname.rsplit("/",2)[-2]

    if sample not in ["A0N5M7","RPN9IL"]:
        continue

    if not os.path.exists(filtered_fname):
        continue

    os.makedirs(outdir+sample+"/", exist_ok=True)

    # Passed to --readFilesIn as (cdna_fname, umi_fname); the barcode and UMI
    # positions given via --soloCB*/--soloUMI* refer to umi_fname.
    umi_fname = processed_fname
    cdna_fname = filtered_fname

    barcodes_fname = seq_dir+"spatial_barcodes.only.txt"

    # splitext removes exactly the final ".gtf" extension. str.rstrip(".gtf")
    # must not be used for this: it strips any trailing run of the characters
    # '.', 'g', 't', 'f', which would eat into a base name ending in one of them.
    gtf_base = os.path.splitext(mmul10_gff_fname)[0]
    ANN=gtf_base+".mod.type.gtf"

    # Output folder and experiment name
    OUTPUT=outdir+sample

    REF=gtf_base+".mod.type.STAR"
    WL=barcodes_fname
    CBLEN=16
    UMILEN=10
    STRAND="Forward"

    # NOTE(review): the thread count is hard-coded to 20 although the
    # configuration cell defines num_threads — confirm which one is intended.
    star_cmd = ["/ccb/salz4-4/avaraby/Kristen.ATAC_RNA/soft/STAR/Linux_x86_64_static/STAR",
           "--runThreadN","20",
           "--genomeDir",REF,
           "--readFilesIn",cdna_fname,umi_fname,
           "--outFileNamePrefix",OUTPUT+"/",
           "--outTmpDir",OUTPUT+".tmp/",
           "--soloType","CB_UMI_Simple",
           "--soloCBwhitelist",WL,
           "--soloCBstart","1",
           "--soloCBlen",str(CBLEN),
           # The UMI starts immediately after the cell barcode.
           "--soloUMIstart",str(CBLEN+1),
           "--soloUMIlen",str(UMILEN),
           "--soloStrand",STRAND,
           "--soloUMIdedup","1MM_CR",
           "--soloCBmatchWLtype","1MM_multi_Nbase_pseudocounts",
           "--soloUMIfiltering","MultiGeneUMI_CR",
           "--soloCellFilter","EmptyDrops_CR",
           "--outFilterScoreMin","30",
           "--clipAdapterType","CellRanger4",
           "--soloFeatures","Gene","GeneFull",
           "--soloOutFileNames","solo","features.tsv","barcodes.tsv","matrix.mtx",
           "--soloMultiMappers","EM"]

    returncode = subprocess.call(star_cmd)
    if returncode != 0:
        # Make failed samples visible; the loop continues with the next sample.
        print("STAR failed for sample "+sample+" (exit code "+str(returncode)+")", file=sys.stderr)

In [None]:
# Gzip-compress the .tsv and .mtx files found two directory levels below each
# sample's output folder (<outdir>/<sample>/*/*/).
for processed_fname in glob.glob(seq_dir+"RNA/globus/*/*processed.fastq"):
    filtered_fname = processed_fname.split("R2_processed.fastq")[0]+"R1_filtered.fastq"
    # The sample name is the directory that contains the FASTQ file.
    sample = processed_fname.rsplit("/",2)[-2]
    print(sample)
    if not os.path.exists(filtered_fname):
        continue

    os.makedirs(outdir+sample+"/", exist_ok=True)

    OUTPUT=outdir+sample

    # Each pattern appears exactly once. Listing a pattern twice would queue
    # every file twice, and the second gzip call would fail because the first
    # one already replaced the file with its .gz version.
    patterns = ['*.tsv', '*.mtx']
    matching_files = []
    for pattern in patterns:
        matching_files.extend(glob.glob(os.path.join(OUTPUT+"/*/*/", pattern)))

    for fname in matching_files:
        # Argument list instead of a shell string, so paths containing spaces
        # or shell metacharacters are passed through verbatim.
        cmd = ["gzip", fname]
        subprocess.call(cmd)

In [None]:
# copy files for transfer
# Copies each sample's soloGene and soloGeneFull folders from outdir into
# transfer_dir/<sample>/.

transfer_dir = "/ccb/salz7-home/avaraby1/Kristen.RNA.STARsolo/"

for processed_fname in glob.glob(seq_dir+"RNA/globus/*/*processed.fastq"):
    filtered_fname = processed_fname.split("R2_processed.fastq")[0]+"R1_filtered.fastq"
    # The sample name is the directory that contains the FASTQ file.
    sample = processed_fname.rsplit("/",2)[-2]
    print(sample)
    if not os.path.exists(filtered_fname):
        continue

    os.makedirs(transfer_dir+sample, exist_ok=True)

    for solo_subdir in ("soloGene", "soloGeneFull"):
        src_dir = outdir+sample+"/"+solo_subdir
        dst_dir = transfer_dir+sample+"/"+solo_subdir
        if os.path.exists(dst_dir):
            # shutil.copytree raises FileExistsError when the destination is
            # already there, which made this cell fail on any re-run. Existing
            # copies are left untouched; delete them first to force a refresh.
            print("  "+dst_dir+" already exists, skipping copy")
            continue
        shutil.copytree(src_dir, dst_dir)

In [None]:
# map coordinates onto the barcodes
# For every sample, barcode2coord.py is invoked once per (matrix subset,
# feature folder) pair with three arguments: the barcodes.tsv.gz file, the
# spatial barcode table, and the coordinates.tsv.gz path in the same folder.

transfer_dir = "/ccb/salz7-home/avaraby1/Kristen.RNA.STARsolo/"

for processed_fname in glob.glob(seq_dir+"RNA/globus/*/*processed.fastq"):
    filtered_fname = processed_fname.split("R2_processed.fastq")[0]+"R1_filtered.fastq"
    sample = processed_fname.rsplit("/",2)[-2]
    print(sample)
    if not os.path.exists(filtered_fname):
        continue

    # Call order: raw matrices first (Gene, then GeneFull), then the
    # filtered matrices in the same feature order.
    for matrix_subset in ("raw", "filtered"):
        for feature_dir in ("soloGene", "soloGeneFull"):
            matrix_dir = transfer_dir+sample+"/"+feature_dir+"/"+matrix_subset+"/"
            cmd = [soft_dir+"barcode2coord.py",
                   matrix_dir+"barcodes.tsv.gz",
                   seq_dir+"spatial_barcodes.txt",
                   matrix_dir+"coordinates.tsv.gz"]
            subprocess.call(cmd)