In [31]:
import pathlib
import subprocess
import os
import os.path

In [32]:
# Widen the notebook container to 95% of the browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [38]:
### BWA-MEM

In [39]:

# Path to the BWA executable used by create_index(), run_bwa() and
# run_bwa_single(); those functions run their commands with shell=True.
BWA_MEM="~/apps/bwa-0.7.17/bwa"
#BWA_MEM_OUT=PRJ_OUT_PATH+'bwa_mem2/'


In [40]:
def create_index():
    """Build the BWA index for the currently configured reference.

    Runs ``{BWA_MEM} index`` through the shell on the path obtained by
    concatenating the module-level ``REF_PATH`` and ``REF_NAME``.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            status.
    """
    reference = REF_PATH + REF_NAME
    subprocess.check_call(f"{BWA_MEM} index {reference}", shell=True)

In [41]:
def run_fastp(in_1, in_2, sra):
    """Trim a pair of raw FASTQ files with fastp.

    ``in_1`` and ``in_2`` are the paths of the *trimmed* files to produce.
    The raw inputs are derived from them by replacing the fastp suffix with
    ``_1.fq`` / ``_2.fq``. Two naming conventions are recognised:

    * ``<prefix>_f1_fastp.fq`` / ``<prefix>_r2_fastp.fq``
    * ``<prefix>_1_fastp.fq`` / ``<prefix>_2_fastp.fq`` (the names this
      notebook builds in ``FASTQ_FILES``)

    A name matching neither convention is used unchanged as the prefix.

    Args:
        in_1: output path for the trimmed mate-1 reads.
        in_2: output path for the trimmed mate-2 reads.
        sra: value passed to fastp's ``-R`` option.

    Returns:
        The tuple ``(in_1, in_2)``.

    Raises:
        subprocess.CalledProcessError: if fastp exits with a non-zero status.
    """
    def _prefix(path, suffixes):
        # Text before the first recognised suffix; the whole path if none match.
        for suffix in suffixes:
            if suffix in path:
                return path.split(suffix)[0]
        return path

    prefix_1 = _prefix(in_1, ("_f1_fastp.fq", "_1_fastp.fq"))
    prefix_2 = _prefix(in_2, ("_r2_fastp.fq", "_2_fastp.fq"))
    in_b_1 = prefix_1 + '_1.fq'
    in_b_2 = prefix_2 + '_2.fq'
    out_1 = in_1
    out_2 = in_2
    # The JSON and HTML reports share the mate-1 prefix.
    qc_file = prefix_1 + '_fastp_qc'
    cmd = f"fastp -i {in_b_1} -o {out_1} -I {in_b_2}  -O {out_2} -j {qc_file}.json -h {qc_file}.html -R {sra}"
    subprocess.check_call(cmd, shell=True)
    return out_1, out_2

In [42]:
def create_fq(contigs_path):
    """Convert a contigs FASTA file to FASTQ with BBMap's ``reformat.sh``.

    The input is ``{contigs_path}{contig_mer}.contigs.fa`` and the output is
    the same path with a ``.fq`` extension. ``contig_mer`` is read from module
    scope.

    NOTE(review): ``contig_mer`` is not assigned anywhere in the visible part
    of this notebook - confirm it is defined before this function is called.

    Args:
        contigs_path: prefix (typically a directory ending in ``/``) placed
            directly in front of ``contig_mer``.

    Raises:
        RuntimeError: if ``reformat.sh`` exits with a non-zero status.
    """
    stem = f"{contigs_path}{contig_mer}.contigs"
    command = f"~/apps/bbmap/reformat.sh in={stem}.fa out={stem}.fq"
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as err:
        raise RuntimeError(
            f"command '{err.cmd}' return with error (code {err.returncode}): {err.output}"
        )

In [43]:
def run_bwa(bwa_path, sra, fastq1, fastq2, description="", options=""):
    """Align paired-end reads to the configured reference with ``bwa mem``.

    The reference is ``REF_PATH + REF_NAME`` and the thread count is
    ``NUM_THREADS`` (both module-level). The SAM output is written to
    ``{bwa_path}{sra}_reads_fastp_{ref}{description}_bwamem.sam`` where
    ``ref`` is ``REF_NAME`` up to its first ``.fa``.

    Args:
        bwa_path: output prefix (a directory ending in ``/``).
        sra: accession used in the output file name.
        fastq1: path of the mate-1 FASTQ file.
        fastq2: path of the mate-2 FASTQ file.
        description: optional tag inserted into the output file name.
        options: extra ``bwa mem`` options, inserted verbatim.

    Returns:
        Path of the SAM file that was written.

    Raises:
        RuntimeError: if ``bwa mem`` exits with a non-zero status.
    """
    ref = REF_NAME.split('.fa')[0]
    out_file = f"{bwa_path}{sra}_reads_fastp_{ref}{description}_bwamem.sam"
    # Options are placed before the positional arguments, as in the documented
    # usage "bwa mem [options] <idxbase> <in1.fq> [in2.fq]", and the
    # redirection comes last. Passing them after the positionals only works
    # where getopt permutes its arguments.
    extra = f" {options}" if options else ""
    cmd = (
        f"{BWA_MEM} mem -t {NUM_THREADS}{extra} "
        f"{REF_PATH+REF_NAME} {fastq1} {fastq2} > {out_file}"
    )
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError(
            "command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)
        ) from e
    return out_file

In [44]:
def run_bwa_single(bwa_path, sra, fastq1, description="", options=""):
    """Align single-end reads to the configured reference with ``bwa mem``.

    Single-end counterpart of ``run_bwa``. The reference is
    ``REF_PATH + REF_NAME`` and the thread count is ``NUM_THREADS`` (both
    module-level). The SAM output is written to
    ``{bwa_path}{sra}_reads_fastp_{ref}{description}_bwamem.sam`` where
    ``ref`` is ``REF_NAME`` up to its first ``.fa``.

    Args:
        bwa_path: output prefix (a directory ending in ``/``).
        sra: accession used in the output file name.
        fastq1: path of the FASTQ file.
        description: optional tag inserted into the output file name, as in
            ``run_bwa``; with the default ``""`` the name is unchanged.
        options: extra ``bwa mem`` options, inserted verbatim.

    Returns:
        Path of the SAM file that was written.

    Raises:
        RuntimeError: if ``bwa mem`` exits with a non-zero status.
    """
    ref = REF_NAME.split('.fa')[0]
    out_file = f"{bwa_path}{sra}_reads_fastp_{ref}{description}_bwamem.sam"
    # Options precede the positional arguments, as in the documented usage
    # "bwa mem [options] <idxbase> <in1.fq>", and the redirection comes last.
    extra = f" {options}" if options else ""
    cmd = f"{BWA_MEM} mem -t {NUM_THREADS}{extra} {REF_PATH+REF_NAME} {fastq1} > {out_file}"
    # Echo the command before running it so it is visible even when bwa fails.
    print(cmd)
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError(
            "command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)
        ) from e
    return out_file

In [45]:
def gatk_sort(bwa_mem_out):
    """Coordinate-sort a SAM file with GATK ``SortSam``.

    The output path is ``bwa_mem_out`` truncated at its first ``.sam`` with
    ``_gatk_sorted.sam`` appended.

    Args:
        bwa_mem_out: path of the SAM file to sort.

    Returns:
        Path of the sorted SAM file.

    Raises:
        RuntimeError: if the GATK command exits with a non-zero status.
    """
    jar = "~/apps/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar"
    sorted_sam = bwa_mem_out.partition('.sam')[0] + '_gatk_sorted.sam'
    command = (
        f"java -jar {jar}  SortSam INPUT={bwa_mem_out} OUTPUT={sorted_sam} "
        "SORT_ORDER=coordinate VALIDATION_STRINGENCY=SILENT"
    )
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as err:
        raise RuntimeError(
            f"command '{err.cmd}' return with error (code {err.returncode}): {err.output}"
        )
    return sorted_sam

In [46]:
def gatk_metrics(gatk_out):
    """Mark duplicates in a sorted SAM file with GATK ``MarkDuplicates``.

    Both outputs are named after ``gatk_out`` truncated at its first
    ``.sam``: ``<stem>_marked.bam`` for the alignments and
    ``<stem>_metrics.txt`` for the metrics file. Java is started with
    ``-Xmx64G``.

    Args:
        gatk_out: path of the sorted SAM file.

    Returns:
        Path of the duplicate-marked BAM file.

    Raises:
        RuntimeError: if the GATK command exits with a non-zero status.
    """
    jar = "~/apps/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar"
    stem = gatk_out.partition('.sam')[0]
    marked_bam = f"{stem}_marked.bam"
    metrics_txt = f"{stem}_metrics.txt"
    command = (
        f"java -Xmx64G -jar {jar}  MarkDuplicates INPUT={gatk_out} "
        f"OUTPUT={marked_bam} METRICS_FILE={metrics_txt} "
        "ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT"
    )
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as err:
        raise RuntimeError(
            f"command '{err.cmd}' return with error (code {err.returncode}): {err.output}"
        )
    return marked_bam

In [47]:
def index_bam(gatk_bam_file):
    """Index a BAM file with ``samtools index``.

    Args:
        gatk_bam_file: path of the BAM file to index.

    Raises:
        RuntimeError: if samtools exits with a non-zero status.
    """
    command = "~/apps/samtools-1.11/samtools index {}".format(gatk_bam_file)
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as err:
        raise RuntimeError(
            f"command '{err.cmd}' return with error (code {err.returncode}): {err.output}"
        )

In [48]:
def remove_file(file_to_del):
    """Delete a file by running ``rm`` through the shell.

    Args:
        file_to_del: path handed to ``rm`` unquoted.

    Raises:
        subprocess.CalledProcessError: if ``rm`` exits with a non-zero status.
    """
    subprocess.check_call("rm {}".format(file_to_del), shell=True)

In [49]:
def run_bamstats(bam_file, stats_out):
    """Run BAMStats 1.25 on a BAM file.

    Args:
        bam_file: path passed to BAMStats' ``-i`` option.
        stats_out: path passed to BAMStats' ``-o`` option.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            status.
    """
    jar = "~/apps/BAMStats-1.25/BAMStats-1.25.jar"
    command = f"java -jar {jar} -i {bam_file} -m -q -o {stats_out}"
    subprocess.check_call(command, shell=True)

In [50]:
def create_bed(fasta_file, bed_file):
    """Write a BED file for a FASTA file using ``faidx --transform bed``.

    Args:
        fasta_file: path of the FASTA file given to ``faidx``.
        bed_file: path the command's standard output is redirected to.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero
            status.
    """
    command = "faidx --transform bed {} > {}".format(fasta_file, bed_file)
    subprocess.check_call(command, shell=True)

In [51]:
def prep_bed():
    """Create the BED file for the currently configured reference.

    Ensures the ``bed/`` directory under ``REF_PATH`` exists, then calls
    ``create_bed`` to write ``{REF_PATH}bed/<ref>.bed``, where ``<ref>`` is
    ``REF_NAME`` up to its first ``.fa``.

    Returns:
        Path of the BED file.
    """
    bed_dir = REF_PATH + 'bed/'
    bed_path = bed_dir + REF_NAME.split('.fa')[0] + '.bed'
    pathlib.Path(bed_dir).mkdir(exist_ok=True)
    create_bed(REF_PATH + REF_NAME, bed_path)
    return bed_path

In [52]:
def bam_to_bed(gatk_bam_file):
    """Convert a BAM file to BED with ``bedtools bamtobed``.

    The output path is ``gatk_bam_file`` truncated at its first ``.bam`` with
    ``.bed`` appended. Nothing is returned.

    Args:
        gatk_bam_file: path of the BAM file to convert.

    Raises:
        subprocess.CalledProcessError: if bedtools exits with a non-zero
            status.
    """
    bed_path = gatk_bam_file.partition('.bam')[0] + '.bed'
    command = "~/apps/bedtools/bedtools bamtobed -i {} >{}".format(gatk_bam_file, bed_path)
    subprocess.check_call(command, shell=True)

In [53]:
def run_bamdst(bed_file, out_path, bam_in):
    """Run ``bamdst`` on a BAM file.

    Args:
        bed_file: path passed to bamdst's ``-p`` option.
        out_path: path passed to bamdst's ``-o`` option.
        bam_in: path of the BAM file to analyse.

    Raises:
        subprocess.CalledProcessError: if bamdst exits with a non-zero status.
    """
    executable = "/mnt/1TB_0/Data/Code/external/bamdst/bamdst"
    command = f"{executable} -p {bed_file} -o {out_path} {bam_in}"
    subprocess.check_call(command, shell=True)

In [54]:
def run_samtools_stats(gatk_bam_file, ref_genome):
    """Write ``samtools stats`` output for a BAM file.

    The report goes to ``<stem>_samtools_stats.txt``, where ``<stem>`` is
    ``gatk_bam_file`` truncated at its first ``.bam``.

    Args:
        gatk_bam_file: path of the BAM file.
        ref_genome: path passed to samtools' ``--reference`` option.

    Raises:
        subprocess.CalledProcessError: if samtools exits with a non-zero
            status.
    """
    report = gatk_bam_file.partition('.bam')[0] + '_samtools_stats.txt'
    command = "~/apps/samtools-1.11/samtools stats {} --reference {} >{}".format(
        gatk_bam_file, ref_genome, report
    )
    subprocess.check_call(command, shell=True)

In [55]:
def run_flagtools(gatk_bam_file):
    """Write ``samtools flagstat`` output for a BAM file.

    The report goes to ``<stem>_flagtools.txt``, where ``<stem>`` is
    ``gatk_bam_file`` truncated at its first ``.bam``.

    Args:
        gatk_bam_file: path of the BAM file.

    Raises:
        subprocess.CalledProcessError: if samtools exits with a non-zero
            status.
    """
    report = gatk_bam_file.partition('.bam')[0] + '_flagtools.txt'
    command = "~/apps/samtools-1.11/samtools flagstat {} >{}".format(gatk_bam_file, report)
    subprocess.check_call(command, shell=True)

In [56]:
def workflow(bed_file=None):
    """Run the paired-end alignment and QC pipeline for every accession in ``SRAs``.

    For the accession at position ``idx`` the mate files are
    ``FASTQ_FILES[2*idx]`` and ``FASTQ_FILES[2*idx + 1]``, both relative to
    ``BASE_PATH``. The pairing is purely positional, so ``FASTQ_FILES`` has to
    be rebuilt whenever ``SRAs`` is changed.

    Steps per accession, with all results under
    ``{PRJ_OUT_PATH}{sra}/bwa_mem/``:

    1. ``run_bwa`` -> ``gatk_sort`` -> ``gatk_metrics`` -> ``index_bam``
    2. removal of the two intermediate SAM files
    3. ``run_samtools_stats``, ``run_flagtools``, ``bam_to_bed``
    4. ``run_bamstats`` into ``bamstats_{ALIGN_NAME}/``
    5. ``run_bamdst`` into ``bamdst_{ALIGN_NAME}/`` (only when ``bed_file``
       is given)

    If the mate-1 file does not exist it is produced with ``run_fastp`` when
    ``RUN_FASTP`` is true. Otherwise the accession is reported and skipped,
    instead of running the pipeline on a file that is known to be missing.

    Args:
        bed_file: optional BED file for bamdst; the bamdst step is skipped
            when it is falsy.

    Raises:
        RuntimeError: if bwa, GATK or samtools index fails.
        subprocess.CalledProcessError: if any of the other tools fails.
    """
    for idx, sra in enumerate(SRAs):
        fastq1 = FASTQ_FILES[2*idx]
        fastq2 = FASTQ_FILES[(2*idx)+1]
        print(f'{sra}')

        # parents=True also creates PRJ_OUT_PATH and the per-accession
        # directory, whatever their depth.
        bwa_path = PRJ_OUT_PATH+sra+'/bwa_mem/'
        pathlib.Path(bwa_path).mkdir(parents=True, exist_ok=True)

        f1path = BASE_PATH+fastq1
        f2path = BASE_PATH+fastq2
        if not os.path.isfile(f1path):
            if RUN_FASTP:
                run_fastp(f1path, f2path, sra)
            else:
                print(f'check file exists: {f1path}')
                # Nothing to align for this accession; move on to the next one.
                continue

        bwa_mem_out = run_bwa(bwa_path, sra, f1path, f2path, description=DESCRIPTION, options=OPTIONS)
        gatk_out = gatk_sort(bwa_mem_out)
        gatk_bam_file = gatk_metrics(gatk_out)
        index_bam(gatk_bam_file)
        # Only the duplicate-marked BAM is kept; the SAM intermediates are removed.
        remove_file(bwa_mem_out)
        remove_file(gatk_out)

        run_samtools_stats(gatk_bam_file, REF_PATH+REF_NAME)
        run_flagtools(gatk_bam_file)
        bam_to_bed(gatk_bam_file)

        out_path = bwa_path+f'bamstats_{ALIGN_NAME}/'
        pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)
        stats_out = out_path+sra+BAMSTATS_POST
        run_bamstats(gatk_bam_file, stats_out)

        if bed_file:
            out_path = bwa_path+f'bamdst_{ALIGN_NAME}/'
            pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)
            run_bamdst(bed_file, out_path, gatk_bam_file)

In [57]:
def workflow_single(bed_file=None):
    """Run the single-end alignment and QC pipeline for every accession in ``SRAsingle``.

    The accession at position ``idx`` is paired with
    ``FASTQ_FILES_SINGLE[idx]`` (relative to ``BASE_PATH``). In the visible
    notebook both ``SRAsingle`` and ``FASTQ_FILES_SINGLE`` exist only as
    commented-out assignments, so they must be defined before calling this.

    Steps per accession, with all results under
    ``{PRJ_OUT_PATH}{sra}/bwa_mem/``:

    1. ``run_bwa_single`` -> ``gatk_sort`` -> ``gatk_metrics`` -> ``index_bam``
    2. removal of the two intermediate SAM files
    3. ``run_bamstats`` into ``bamstats/``
    4. ``run_bamdst`` into ``bamdst/`` (only when ``bed_file`` is given)

    An accession whose input file does not exist is reported and skipped,
    instead of running the pipeline on a file that is known to be missing.

    Args:
        bed_file: optional BED file for bamdst; the bamdst step is skipped
            when it is falsy.

    Raises:
        RuntimeError: if bwa, GATK or samtools index fails.
        subprocess.CalledProcessError: if BAMStats, bamdst or ``rm`` fails.
    """
    for idx, sra in enumerate(SRAsingle):
        fastq1 = FASTQ_FILES_SINGLE[idx]
        print(f'{sra}')

        # parents=True also creates PRJ_OUT_PATH and the per-accession
        # directory, whatever their depth.
        bwa_path = PRJ_OUT_PATH+sra+'/bwa_mem/'
        pathlib.Path(bwa_path).mkdir(parents=True, exist_ok=True)

        f1path = BASE_PATH+fastq1
        if not os.path.isfile(f1path):
            print(f'error no file: {f1path}')
            # Nothing to align for this accession; move on to the next one.
            continue

        bwa_mem_out = run_bwa_single(bwa_path, sra, f1path, description=DESCRIPTION, options=OPTIONS)
        gatk_out = gatk_sort(bwa_mem_out)
        gatk_bam_file = gatk_metrics(gatk_out)
        index_bam(gatk_bam_file)
        # Only the duplicate-marked BAM is kept; the SAM intermediates are removed.
        remove_file(bwa_mem_out)
        remove_file(gatk_out)

        out_path = bwa_path+'bamstats/'
        pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)
        stats_out = out_path+sra+BAMSTATS_POST
        run_bamstats(gatk_bam_file, stats_out)

        if bed_file:
            out_path = bwa_path+'bamdst/'
            pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)
            run_bamdst(bed_file, out_path, gatk_bam_file)

In [33]:
# Project accession; it selects both the input and the output directory.
PRJ='PRJNA795267'
PRJ_OUT_PATH=f'/mnt/1TB_0/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/6TB_0/Data/genbank/{PRJ}/'

# Reference to align against: REF_PATH + REF_NAME is the FASTA handed to bwa.
REF_NAME='PRJNA795267_fast_SARS.fa'
REF_PATH='/mnt/1TB_0/Data/fasta/combined/indexed/'

#REF_NAME='GCF_000001405.39_GRCh38.p13_genomic.fna'
#REF_PATH='/mnt/1TB_0/Data/fasta/combined/indexed/'
#REF_PATH='/mnt/1TB_0/Data/fasta/mammal_genomes/GRCH38/ncbi-genomes-2021-06-12/'
NUM_THREADS=32
#DESCRIPTION='_soft_clip'
#OPTIONS="-Y"
#DESCRIPTION='_default'
#http://bio-bwa.sourceforge.net/bwa.shtml#3
#OPTIONS="-Y -B 1 -a -k 13"
OPTIONS=""
# workflow() and workflow_single() read DESCRIPTION as a module-level name, so
# it needs an active assignment or they fail with NameError on a fresh kernel.
# The empty string matches the default of run_bwa's `description` parameter.
# NOTE(review): if earlier runs used '_default' or '_soft_clip', restore that
# value to keep output file names consistent with existing results.
DESCRIPTION=""
RUN_FASTP=False
# Reference name without its ".fa" suffix; used in output directory/file names.
ALIGN_NAME=REF_NAME.split('.fa')[0]
BAMSTATS_POST=f'_{ALIGN_NAME}_bamstats.txt'

In [34]:
#done: 'SRR10168374'
#SRAs=['SRR10168373', 'SRR10168375', 'SRR10168376','SRR10168377', 'SRR10168378',\
#      'SRR10168379','SRR10168380','SRR10168381','SRR10168382','SRR10168383','SRR10168384', \
#      'SRR10168386','SRR10168387','SRR10168388','SRR10168389','SRR10168390',\
#     'SRR10168385','SRR10168391','SRR10168392','SRR10168393']

# Accessions processed by workflow(), in the order their FASTQ pairs are
# listed in FASTQ_FILES.
SRAs = [
    'SRR10168374',
    'SRR10168375',
    'SRR10168376',
    'SRR10168377',
    'SRR10168378',
]
#SRAsingle=['journal_ppat_1009664_s002']

In [35]:
# Paired-end inputs for workflow(): for each accession in SRAs the mate-1 file
# name is immediately followed by the mate-2 file name.
FASTQ_FILES = []
for f in SRAs:
    FASTQ_FILES.extend((f+'_1_fastp.fq', f+'_2_fastp.fq'))
    
#FASTQ_FILES=['MGISEQ_Merge_L04_1_fastp.fq','MGISEQ_Merge_L04_2_fastp.fq', 'SRR11092059_1_fastp.fq','SRR11092059_2_fastp.fq',\
#             'SRR11092060_1_fastp.fq','SRR11092060_2_fastp.fq','SRR11092061_1_fastp.fq','SRR11092061_2_fastp.fq',\
#             'SRR11092062L04_1_fastp.fq','SRR11092062L04_2_fastp.fq','SRR11092063L04_1_fastp.fq','SRR11092063L04_2_fastp.fq']    

In [36]:
#FASTQ_FILES_SINGLE=['journal_ppat_1009664_s002.fq']

In [37]:
#assert len(FASTQ_FILES)==2*len(SRAs)

In [58]:
# Path of the BED file for the current reference, built with the same
# expression prep_bed() uses. This cell only computes the path; it does not
# create the file.
bed_file=REF_PATH+'bed/'+REF_NAME.split('.fa')[0]+'.bed'
# Run the paired-end pipeline for every accession in SRAs; because a BED file
# is passed, workflow() also runs the bamdst step.
workflow(bed_file)

SRR10168374
SRR10168375
SRR10168376
SRR10168377
SRR10168378


### Human

In [35]:
# Switch the module-level reference settings to the GRCh38 assembly.
REF_NAME='GCF_000001405.39_GRCh38.p13_genomic.fna'
REF_PATH='/mnt/1TB_0/Data/fasta/mammal_genomes/GRCH38/ncbi-genomes-2021-06-12/'
# This REF_NAME contains no '.fa' substring (its extension is '.fna'), so the
# split leaves it unchanged and ALIGN_NAME keeps the '.fna' suffix.
ALIGN_NAME=REF_NAME.split('.fa')[0]
BAMSTATS_POST=f'_{ALIGN_NAME}_bamstats.txt'
NUM_THREADS=32
# FASTQ_FILES is not rebuilt in this cell and workflow() pairs files with
# accessions by position, so FASTQ_FILES[0] and FASTQ_FILES[1] must belong to
# this accession when the cell runs.
SRAs=['SRR10168374']
#bed_file=REF_PATH+'bed/'+REF_NAME.split('.fa')[0]+'.bed'
#create_bed(REF_PATH+REF_NAME, bed_file)
#create_index()
# No BED file is passed, so workflow() skips the bamdst step.
workflow()

SRR10168374


### Mouse

In [29]:
# Switch the module-level reference settings to the GRCm39 assembly.
REF_NAME='GCF_000001635.27_GRCm39_genomic.fna'
REF_PATH='/mnt/1TB_0/Data/fasta/mammal_genomes/GRCm39/'
# This REF_NAME contains no '.fa' substring (its extension is '.fna'), so the
# split leaves it unchanged and ALIGN_NAME keeps the '.fna' suffix.
ALIGN_NAME=REF_NAME.split('.fa')[0]
BAMSTATS_POST=f'_{ALIGN_NAME}_bamstats.txt'

In [30]:
#create_index()
# Run the paired-end pipeline against the current REF_PATH/REF_NAME for every
# accession in SRAs; no BED file is passed, so the bamdst step is skipped.
workflow()

SRR10168373
SRR10168375
SRR10168376
SRR10168377
SRR10168378
SRR10168379
SRR10168380
SRR10168381
SRR10168382
SRR10168383
SRR10168384
SRR10168386
SRR10168387
SRR10168388
SRR10168389
SRR10168390
SRR10168385
SRR10168391
SRR10168392
SRR10168393


### ManJav (*Manis javanica*, Malayan pangolin)

In [31]:
# Switch the module-level reference settings to the YNU_ManJav_2.0 assembly.
REF_NAME='GCF_014570535.1_YNU_ManJav_2.0_genomic.fna'
REF_PATH='/mnt/1TB_0/Data/fasta/mammal_genomes/YNU_ManJav_2.0/ncbi-genomes-2021-10-24/'
# This REF_NAME contains no '.fa' substring (its extension is '.fna'), so the
# split leaves it unchanged and ALIGN_NAME keeps the '.fna' suffix.
ALIGN_NAME=REF_NAME.split('.fa')[0]
BAMSTATS_POST=f'_{ALIGN_NAME}_bamstats.txt'

In [32]:
#create_index()
# Run the paired-end pipeline against the current REF_PATH/REF_NAME for every
# accession in SRAs; no BED file is passed, so the bamdst step is skipped.
workflow()

SRR10168373
SRR10168375
SRR10168376
SRR10168377
SRR10168378
SRR10168379
SRR10168380
SRR10168381
SRR10168382
SRR10168383
SRR10168384
SRR10168386
SRR10168387
SRR10168388
SRR10168389
SRR10168390
SRR10168385
SRR10168391
SRR10168392
SRR10168393


In [33]:
# Process one accession only. FASTQ_FILES is not rebuilt in this cell and
# workflow() pairs files with accessions by position, so FASTQ_FILES[0] and
# FASTQ_FILES[1] must belong to SRR10168374 when this runs.
SRAs=['SRR10168374']
workflow()

SRR10168374


In [34]:
#temp
# Point the module-level reference settings back at the GRCm39 assembly and
# run a single accession against it.
REF_NAME='GCF_000001635.27_GRCm39_genomic.fna'
REF_PATH='/mnt/1TB_0/Data/fasta/mammal_genomes/GRCm39/'
# This REF_NAME contains no '.fa' substring (its extension is '.fna'), so the
# split leaves it unchanged and ALIGN_NAME keeps the '.fna' suffix.
ALIGN_NAME=REF_NAME.split('.fa')[0]
BAMSTATS_POST=f'_{ALIGN_NAME}_bamstats.txt'
# FASTQ_FILES is not rebuilt here; workflow() reads FASTQ_FILES[0] and
# FASTQ_FILES[1] for this accession.
SRAs=['SRR10168374']
workflow()

SRR10168374
