In [1]:
import pathlib
import subprocess
import os

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

### Read alignment using Bowtie2

In [3]:
def build_index(ref_file, bowtie_ref):
    cmd = f"bowtie2-build {ref_file} {bowtie_ref} --threads {NUM_THREADS}"
    subprocess.check_call(cmd, shell=True)

In [4]:
def run_kallisto_quant(fastq_1, fastq_2, out_path, sra, kallisto_idx_path, strand, strand_param):
    ref_name=kallisto_idx_path.split('/')[-1].split('.idx')[0].strip()
    out_file= f"{out_path}kallisto_{sra}_{ref_name}.{strand}"
    cmd = f"kallisto quant -i {kallisto_idx_path} {fastq_1} {fastq_2} -o {out_file} {strand_param}"
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))

In [5]:
def run_kallisto_idx(ref_path, kallisto_idx_path):
    idx_name=ref_path.split('/')[-1].split('.fa')[0]+'.idx'
    kallisto_idx= kallisto_idx_path+idx_name
    cmd = f"kallisto index -i {kallisto_idx} {ref_path}"
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))

In [6]:
def workflow_paired(ref_file):
    if not os.path.isdir(PRJ_OUT_PATH):
        pathlib.Path(PRJ_OUT_PATH).mkdir(exist_ok=True)
    f = open(ref_file, "r")
    references=[line.rstrip() for line in f]
    
    for ref in references:
        print(ref)
        idx_name=ref.split('.fa')[0]+'.idx'
        kallisto_idx= KALLISTO_IDX_PATH+idx_name
        if not os.path.isfile(kallisto_idx):
            run_kallisto_idx(REF_PATH+ref, KALLISTO_IDX_PATH)
        for idx, sra in enumerate(SRAs):
            print(f'{sra}')
            fastq1=FASTQ_FILES[2*idx]
            fastq2=FASTQ_FILES[(2*idx)+1]

            f1path=BASE_PATH+fastq1
            f2path=BASE_PATH+fastq2

            align_path=PRJ_OUT_PATH+sra
            if not os.path.isdir(align_path):
                pathlib.Path(align_path).mkdir(exist_ok=True)
            kallisto_path=PRJ_OUT_PATH+sra+'/kallisto/'
            if not os.path.isdir(kallisto_path):
                pathlib.Path(kallisto_path).mkdir(exist_ok=True)
            for strand, strand_param in zip(['un','fr','rf'],['','--fr-stranded','--rf-stranded']):
                run_kallisto_quant(f1path, f2path, kallisto_path, sra, kallisto_idx, strand, strand_param)

In [7]:
PRJ='PRJNA795267'
REF_FILE=f'/mnt/1TB_0/Data/Assembly/{PRJ}/general_plots/kallisto_refs.txt'
REF_PATH='/mnt/1TB_0/Data/fasta/complete_nucleotide/'
KALLISTO_IDX_PATH=f'{REF_PATH}bwa_indexes/kallisto_index/'
PRJ_OUT_PATH=f'/mnt/1TB_0/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_0/Data/genbank/{PRJ}/'

In [8]:
SRAs=['SRR17497116','SRR17497120','SRR17509984']
FASTQ_FILES=[]
for f in SRAs:
    FASTQ_FILES.append(f+'_1_val_1.fq')
    FASTQ_FILES.append(f+'_2_val_2.fq')
workflow_paired(REF_FILE)

MG772933_1_Bat_SARS-like_coronavirus_isolate_bat-SL-CoVZC45_complete_genome.fa
SRR17497116
SRR17497120
SRR17509984
gisaid_hcov-19_2021_07_14_12_EPI_ISL_410538.fa
SRR17497116
SRR17497120
SRR17509984
NC_001457_1.fa
SRR17497116
SRR17497120
SRR17509984
NC_043426_1.fa
SRR17497116
SRR17497120
SRR17509984

SRR17497116
SRR17497120
SRR17509984


In [9]:
REF_FILE=f'/mnt/1TB_0/Data/Assembly/{PRJ}/general_plots/kallisto_refs_mitochondrion.txt'
REF_PATH='/mnt/1TB_0/Data/fasta/mitochondria/'
KALLISTO_IDX_PATH=f'{REF_PATH}/kallisto_index/'
workflow_paired(REF_FILE)

NC_050263_1_Hystrix_brachyura.fa
SRR17497116
SRR17497120
SRR17509984
NC_012920_1_Homo_sapiens.fa
SRR17497116
SRR17497120
SRR17509984
