In [1]:
import pathlib
import subprocess
import os

In [2]:
# Inject a CSS rule that sets `.container` elements to 95% width, widening the
# page in notebook front ends that use that class.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
def create_fq(contigs_path):
    """Convert an assembly's contigs FASTA to FASTQ with BBMap's ``reformat.sh``.

    The file names are selected by the module-level ``DE_NOVO`` setting:
    ``{CONTIG_MER}.contigs.fa`` -> ``{CONTIG_MER}.contigs.fq`` for ``'MEGAHIT'``
    and ``{coronaspades_name}.fasta`` -> ``{coronaspades_name}.fq`` for
    ``'coronaspades'``.

    Args:
        contigs_path: Directory prefix that is concatenated directly with the
            file name, so it must already end with a path separator.

    Raises:
        ValueError: If ``DE_NOVO`` is not ``'MEGAHIT'`` or ``'coronaspades'``.
        RuntimeError: If ``reformat.sh`` exits with a non-zero status.
    """
    if DE_NOVO == 'MEGAHIT':
        stem, fasta_ext = f"{CONTIG_MER}.contigs", "fa"
    elif DE_NOVO == 'coronaspades':
        stem, fasta_ext = f"{coronaspades_name}", "fasta"
    else:
        # An unknown assembler used to fall through and fail later with an
        # UnboundLocalError on `cmd`; fail early with a clear message instead.
        raise ValueError(
            f"Unsupported DE_NOVO assembler: {DE_NOVO!r} "
            "(expected 'MEGAHIT' or 'coronaspades')"
        )

    cmd = f"{BBMAP_PATH}reformat.sh in={contigs_path}{stem}.{fasta_ext} out={contigs_path}{stem}.fq"
    try:
        # shell=True so that a leading '~' in BBMAP_PATH is expanded by the shell.
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        # check_call does not capture the child's output, so e.output is None here;
        # the tool's own messages go straight to the notebook's stdout/stderr.
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) from e

In [4]:
def run_bwa(contigs, bwa_path, sra):
    """Align contigs to the configured reference with ``bwa mem`` and write a SAM file.

    The output name is built from the accession, an assembler-specific tag
    chosen by ``DE_NOVO``, the reference name and ``ALIGNER``.

    Args:
        contigs: Path of the contigs file passed to ``bwa mem``.
        bwa_path: Output directory prefix; concatenated directly with the file
            name, so it must already end with a path separator.
        sra: Accession identifier used as the file-name prefix.

    Returns:
        The path of the SAM file that bwa's stdout was redirected to.

    Raises:
        ValueError: If ``DE_NOVO`` is not ``'MEGAHIT'`` or ``'coronaspades'``.
        RuntimeError: If the bwa command exits with a non-zero status.
    """
    # NOTE(review): splitting on '.fa' only trims names that contain '.fa'
    # (e.g. '.fa', '.fasta'); a name ending in '.fna' is kept whole, so the
    # extension ends up inside the output file name. Left as is because
    # existing outputs may rely on these names - confirm before changing.
    ref = REF_NAME.split('.fa')[0]

    if DE_NOVO == 'MEGAHIT':
        assembly_tag = f"{CONTIGS_BASE_FOLDER}_{CONTIG_MER}"
    elif DE_NOVO == 'coronaspades':
        assembly_tag = f"coronaspades_{coronaspades_code}"
    else:
        # An unknown assembler used to leave `out_file` unbound and crash with
        # an UnboundLocalError while building the command.
        raise ValueError(
            f"Unsupported DE_NOVO assembler: {DE_NOVO!r} "
            "(expected 'MEGAHIT' or 'coronaspades')"
        )
    out_file = f"{bwa_path}{sra}_{assembly_tag}_{ref}_{ALIGNER}.sam"

    cmd = f"{BWA_MEM} mem -t {NUM_THREADS} {REF_PATH+REF_NAME} {contigs} > {out_file}"
    try:
        # shell=True is required for the '>' redirection and '~' expansion.
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        # check_call does not capture output, so e.output is None here.
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) from e
    return out_file

In [5]:
def gatk_sort(bwa_mem_out):
    """Coordinate-sort a SAM file with the SortSam tool from the GATK jar.

    The output path is the input path up to its first '.sam', followed by
    '_gatk_sorted.sam'.

    Args:
        bwa_mem_out: Path of the SAM file to sort.

    Returns:
        The path of the sorted SAM file.

    Raises:
        RuntimeError: If the java command exits with a non-zero status.
    """
    sorted_sam = bwa_mem_out.partition('.sam')[0] + '_gatk_sorted.sam'
    sort_cmd = f"java -jar {GATK_JAR} SortSam INPUT={bwa_mem_out} OUTPUT={sorted_sam} SORT_ORDER=coordinate VALIDATION_STRINGENCY=SILENT"
    try:
        subprocess.check_call(sort_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        message = "command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output)
        raise RuntimeError(message)
    return sorted_sam

In [6]:
def gatk_metrics(gatk_out):
    """Run MarkDuplicates from the GATK jar on a sorted SAM file.

    Both outputs are named after the input path up to its first '.sam':
    '<base>_marked.bam' for the alignments and '<base>_metrics.txt' for the
    duplication metrics. Java is started with a 64G maximum heap.

    Args:
        gatk_out: Path of the sorted SAM file (the command passes
            ASSUME_SORTED=true).

    Returns:
        The path of the duplicate-marked BAM file.

    Raises:
        RuntimeError: If the java command exits with a non-zero status.
    """
    base = gatk_out.partition('.sam')[0]
    marked_bam, metrics_txt = base + '_marked.bam', base + '_metrics.txt'
    mark_cmd = f"java -Xmx64G -jar {GATK_JAR} MarkDuplicates INPUT={gatk_out} OUTPUT={marked_bam} METRICS_FILE={metrics_txt} ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT"
    try:
        subprocess.check_call(mark_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        message = "command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output)
        raise RuntimeError(message)
    return marked_bam

In [7]:
def index_bam(gatk_bam_file):
    """Index a BAM file with ``samtools index``.

    Args:
        gatk_bam_file: Path of the BAM file to index.

    Raises:
        RuntimeError: If samtools exits with a non-zero status.
    """
    index_cmd = f"{SAMTOOLS_PATH}samtools index {gatk_bam_file}"
    try:
        subprocess.check_call(index_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        message = "command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output)
        raise RuntimeError(message)

In [8]:
def remove_file(file_to_del):
    """Delete a single file by running ``rm`` on it.

    The path is handed to ``rm`` as one argument rather than being spliced
    into a shell command line, so names containing spaces or shell
    metacharacters are treated literally. A leading ``~`` is still expanded;
    shell wildcards are not.

    Args:
        file_to_del: Path of the file to delete.

    Raises:
        subprocess.CalledProcessError: If ``rm`` exits with a non-zero status
            (for example when the file does not exist).
    """
    # '--' ends option parsing so a path starting with '-' is not read as a flag.
    subprocess.check_call(["rm", "--", os.path.expanduser(file_to_del)])

In [9]:
def run_bamstats(bam_file, stats_out):
    """Run the BAMStats jar on a BAM file and write its report to ``stats_out``.

    Args:
        bam_file: Path of the BAM file passed with ``-i``.
        stats_out: Path of the report file passed with ``-o``.

    Raises:
        subprocess.CalledProcessError: If the java command exits non-zero.
    """
    subprocess.check_call(
        f'java -jar {BAMSTATS_JAR} -i {bam_file} -m -q -o {stats_out}',
        shell=True,
    )

In [10]:
def create_bed(fasta_file, bed_file):
    """Write a BED file for a FASTA by running ``faidx --transform bed``.

    Args:
        fasta_file: Path of the FASTA file given to ``faidx``.
        bed_file: Path that the command's stdout is redirected to.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    bed_cmd = f'faidx --transform bed {fasta_file} > {bed_file}'
    # Runs through the shell because of the '>' redirection.
    subprocess.check_call(bed_cmd, shell=True)

In [11]:
def prep_bed():
    """Create the BED file for the configured reference and return its path.

    The file is written to ``<REF_PATH>bed/`` (created if missing) and named
    after ``REF_NAME`` up to its first '.fa', with a '.bed' suffix.

    Returns:
        The path of the BED file handed to ``create_bed``.
    """
    bed_dir = REF_PATH + 'bed/'
    reference_fasta = REF_PATH + REF_NAME
    bed_path = bed_dir + REF_NAME.partition('.fa')[0] + '.bed'
    pathlib.Path(bed_dir).mkdir(exist_ok=True)
    create_bed(reference_fasta, bed_path)
    return bed_path

In [12]:
def run_bamdst(bed_file, out_path, bam_in):
    """Run ``bamdst`` on a BAM file for the regions in a BED file.

    Args:
        bed_file: Path of the BED file passed with ``-p``.
        out_path: Output location passed with ``-o``.
        bam_in: Path of the input BAM file.

    Raises:
        subprocess.CalledProcessError: If bamdst exits non-zero.
    """
    subprocess.check_call(
        f'{BAMDST_PATH}bamdst -p {bed_file} -o {out_path} {bam_in}',
        shell=True,
    )

In [13]:
def _contig_paths(sra):
    """Return ``(data_path, contigs_fq, contigs_fa)`` for one accession, based on ``DE_NOVO``."""
    if DE_NOVO == 'MEGAHIT':
        data_path = PRJ_OUT_PATH + sra + f'/{CONTIGS_BASE_FOLDER}/'
        return (
            data_path,
            data_path + f'{CONTIG_MER}.contigs.fq',
            data_path + f'{CONTIG_MER}.contigs.fa',
        )
    if DE_NOVO == 'coronaspades':
        data_path = PRJ_OUT_PATH + sra + '/coronaspades_default/'
        return (
            data_path,
            data_path + f'{coronaspades_name}.fq',
            data_path + f'{coronaspades_name}.fasta',
        )
    # An unknown assembler used to leave the path variables unbound.
    raise ValueError(
        f"Unsupported DE_NOVO assembler: {DE_NOVO!r} "
        "(expected 'MEGAHIT' or 'coronaspades')"
    )


def workflow(bed_file=None):
    """Align each accession's contigs to the configured reference and collect stats.

    For every accession in ``SRAs`` this:
      1. locates the contigs FASTA and creates the FASTQ copy if it is missing,
      2. aligns the contigs with ``run_bwa``,
      3. sorts, marks duplicates and indexes the alignment,
      4. deletes the unsorted and sorted SAM intermediates,
      5. runs BAMStats, and bamdst as well when ``bed_file`` is given.

    Outputs go under ``<PRJ_OUT_PATH><accession>/<ALIGNER>/``.

    Args:
        bed_file: Optional BED file path; when truthy, bamdst is also run.

    Raises:
        FileNotFoundError: If an accession's contigs FASTA does not exist.
        ValueError: If ``DE_NOVO`` is not a supported assembler.
    """
    for sra in SRAs:
        print(f'{sra}')
        data_path, contigs, contigsfa = _contig_paths(sra)
        if not os.path.isfile(contigsfa):
            # A specific exception type carrying the message; FileNotFoundError
            # is still caught by any caller that catches Exception.
            raise FileNotFoundError(f'Contigs file: {contigsfa} doesnt exist')
        if not os.path.isfile(contigs):
            create_fq(data_path)

        # mkdir(exist_ok=True) is already a no-op for an existing directory, so
        # no existence check is needed (the old isfile() guards were always
        # False for directories anyway).
        bwa_path = PRJ_OUT_PATH + sra + f'/{ALIGNER}/'
        pathlib.Path(bwa_path).mkdir(parents=True, exist_ok=True)
        bwa_mem_out = run_bwa(contigs, bwa_path, sra)

        gatk_out = gatk_sort(bwa_mem_out)
        gatk_bam_file = gatk_metrics(gatk_out)
        index_bam(gatk_bam_file)
        # Only the duplicate-marked BAM is kept; drop the SAM intermediates.
        remove_file(bwa_mem_out)
        remove_file(gatk_out)

        stats_dir = bwa_path + 'bamstats/'
        pathlib.Path(stats_dir).mkdir(parents=True, exist_ok=True)
        run_bamstats(gatk_bam_file, stats_dir + sra + BAMSTATS_POST)

        if bed_file:
            bamdst_dir = bwa_path + 'bamdst/'
            pathlib.Path(bamdst_dir).mkdir(parents=True, exist_ok=True)
            run_bamdst(bed_file, bamdst_dir, gatk_bam_file)

In [14]:
# Locations of the external tools and jars used by the pipeline functions.
# The '~' prefixes are left as written: the commands are run with shell=True,
# so the shell performs the expansion.
BWA_MEM = '~/apps/bwa-0.7.17/bwa'
BBMAP_PATH = '~/apps/bbmap/'
GATK_JAR = '~/apps/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar'
SAMTOOLS_PATH = ''  # empty prefix: `samtools` is invoked by its bare name
BAMSTATS_JAR = '~/apps/BAMStats-1.25/BAMStats-1.25.jar'
BAMDST_PATH = '/mnt/1TB_0/Data/Code/external/bamdst/'
# NOTE(review): BEDTOOLS_PATH and GATK_TEMP are not referenced by the cells
# visible in this notebook - confirm whether they are still needed.
BEDTOOLS_PATH = '~/apps/bedtools/'
GATK_TEMP = '/mnt/1TB_0/temp/gatk'

In [15]:
# Aligner label; used as the per-accession output folder name and as a
# component of the SAM file name.
ALIGNER = 'bwa_mem'

In [16]:
# --- Project ---------------------------------------------------------------
PRJ = 'PRJCA002517'
PRJ_OUT_PATH = f'/mnt/8TB_2/Data/Assembly/{PRJ}/'

# --- Reference genome ------------------------------------------------------
REF_NAME = 'YNU_ManJav_2.0.fna'
REF_PATH = '/mnt/1TB_0/Data/fasta/mammal_genomes/'
ALIGN_NAME = REF_NAME.split('.fna')[0]  # reference name without the '.fna' suffix

# --- Alignment options -----------------------------------------------------
# NUM_THREADS was previously assigned twice in this cell (same value); it is
# now set once so that editing it cannot be silently overridden further down.
NUM_THREADS = 32
DESCRIPTION = '_default'
OPTIONS = ""
ALIGN_OPTS_NAME = 'default'

RUN_FASTP = False

# --- Output naming ---------------------------------------------------------
BAMSTATS_POST = f'_{ALIGN_NAME}_bamstats.txt'  # suffix of the BAMStats report file
BAM_POSTFIX = 'gatk_sorted_marked'
UNIVEC_ALIGNED = False

# --- De novo assembly whose contigs are aligned ----------------------------
DE_NOVO = 'MEGAHIT'  # the pipeline functions branch on 'MEGAHIT' / 'coronaspades'
CONTIG_MER = 'final_min_len_300'
CONTIGS_BASE_FOLDER = 'megahit_default'

In [17]:
# Accessions processed by workflow(), in this order.
SRAs = [
    'CRR477154',
    'CRR477155',
    'CRR477156',
    'CRR477157',
]

In [18]:
# Run the pipeline without a BED file, so the bamdst step inside workflow() is
# skipped. To include it, pass a BED path instead, e.g.
#   REF_PATH + 'bed/' + REF_NAME.split('.fa')[0] + '.bed'
workflow(bed_file=None)

CRR477154
CRR477155
CRR477156
CRR477157


In [19]:
# Switch the reference to GRCh38.p13 and re-derive the names that depend on it.
REF_NAME = 'GRCh38.p13.fna'
REF_PATH = '/mnt/1TB_0/Data/fasta/mammal_genomes/'
ALIGN_NAME = REF_NAME.partition('.fna')[0]

# Suffix of the BAMStats report file for this reference.
BAMSTATS_POST = f'_{ALIGN_NAME}_bamstats.txt'

In [20]:
# Re-run the pipeline against the reference configured in the previous cell
# (no BED file, so the bamdst step is skipped).
workflow(bed_file=None)

CRR477154
CRR477155
CRR477156
CRR477157


In [21]:
# Switch the reference to Vero_WHO_p1.0 and re-derive the names that depend on it.
REF_NAME = 'Vero_WHO_p1.0.fna'
REF_PATH = '/mnt/1TB_0/Data/fasta/mammal_genomes/'
ALIGN_NAME = REF_NAME.partition('.fna')[0]

# Suffix of the BAMStats report file for this reference.
BAMSTATS_POST = f'_{ALIGN_NAME}_bamstats.txt'

In [22]:
# Re-run the pipeline against the reference configured in the previous cell
# (no BED file, so the bamdst step is skipped).
workflow(bed_file=None)

CRR477154
CRR477155
CRR477156
CRR477157
