In [1]:
import pathlib
import subprocess
import os
import os.path

In [2]:
# Widen the notebook's main container to 95% of the browser window.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
### BWA-MEM

In [4]:
def create_index():
    """Build the BWA index for the configured reference FASTA.

    Reads the notebook-level ``BWA_MEM``, ``REF_PATH`` and ``REF_NAME``
    settings and runs ``<bwa> index <reference>`` through the shell.
    ``subprocess.CalledProcessError`` propagates if the command fails.
    """
    reference_fasta = REF_PATH + REF_NAME
    index_command = "{} index {}".format(BWA_MEM, reference_fasta)
    subprocess.check_call(index_command, shell=True)

In [5]:
def run_fastp(in_1, in_2, sra):
    """Run fastp on one read pair and return the two output paths.

    ``in_1`` / ``in_2`` are the *output* paths (``..._f1_fastp.fq`` and
    ``..._r2_fastp.fq``). The raw inputs are derived from them by replacing
    those suffixes with ``_1.fq`` and ``_2.fq``. The JSON and HTML reports
    are written next to the first file as ``..._fastp_qc.json`` / ``.html``,
    and ``sra`` is passed to fastp's ``-R`` option.

    Returns:
        The tuple ``(in_1, in_2)``.

    Raises:
        subprocess.CalledProcessError: fastp exited with a non-zero status.
    """
    stem_1 = in_1.split("_f1_fastp.fq")[0]
    stem_2 = in_2.split("_r2_fastp.fq")[0]
    raw_1 = stem_1 + '_1.fq'
    raw_2 = stem_2 + '_2.fq'
    report = stem_1 + '_fastp_qc'
    fastp_cmd = (
        f"fastp -i {raw_1} -o {in_1} -I {raw_2}  -O {in_2} "
        f"-j {report}.json -h {report}.html -R {sra}"
    )
    subprocess.check_call(fastp_cmd, shell=True)
    return in_1, in_2

In [6]:
def create_fq(contigs_path, contig_mer=None):
    """Convert ``<contigs_path><contig_mer>.contigs.fa`` to FASTQ with BBMap ``reformat.sh``.

    Args:
        contigs_path: Directory prefix (concatenated as-is) of the contigs FASTA.
        contig_mer: File-name stem placed before ``.contigs.fa`` / ``.contigs.fq``.
            When omitted, the notebook-level variable ``contig_mer`` is used,
            which is what this function relied on before the parameter existed.

    Raises:
        NameError: ``contig_mer`` was not passed and no notebook-level
            ``contig_mer`` is defined.
        RuntimeError: ``reformat.sh`` exited with a non-zero status.
    """
    if contig_mer is None:
        # Backward-compatible fallback to the (previously implicit) global.
        try:
            contig_mer = globals()["contig_mer"]
        except KeyError:
            raise NameError(
                "name 'contig_mer' is not defined; pass contig_mer=... to create_fq()"
            ) from None
    cmd = f"{BBMAP_PATH}reformat.sh in={contigs_path}{contig_mer}.contigs.fa out={contigs_path}{contig_mer}.contigs.fq"
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) from e

In [7]:
def run_bwa(bwa_path, sra, fastq1, fastq2, description="", options=""):
    """Align one read pair with ``bwa mem`` and return the SAM path.

    Args:
        bwa_path: Output directory prefix (concatenated as-is with the file name).
        sra: Sample accession, used as the output file-name prefix.
        fastq1, fastq2: Paths of the two FASTQ files of the pair.
        description: Free-text tag inserted into the output file name.
        options: Extra ``bwa mem`` options, passed verbatim.

    Returns:
        ``<bwa_path><sra>_reads_fastp_<ref><description>_<ALIGNER>.sam``.

    Raises:
        RuntimeError: ``bwa mem`` exited with a non-zero status.

    NOTE: ``REF_NAME.split('.fa')[0]`` only trims names that contain the
    literal ``.fa``; a name ending in ``.fna`` is left untouched, so the
    extension stays in the output name. Kept as-is so existing output file
    names do not change.
    """
    ref = REF_NAME.split('.fa')[0]
    out_file = f"{bwa_path}{sra}_reads_fastp_{ref}{description}_{ALIGNER}.sam"
    # bwa's usage is `bwa mem [options] <idxbase> <in1.fq> [in2.fq]`: keep every
    # option in front of the positional arguments and the redirect at the end,
    # rather than relying on getopt permuting trailing options.
    args = [BWA_MEM, "mem", "-t", str(NUM_THREADS)]
    if options:
        args.append(options)
    args += [REF_PATH + REF_NAME, fastq1, fastq2]
    cmd = " ".join(str(arg) for arg in args) + f" > {out_file}"
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) from e
    return out_file

In [8]:
def run_bwa_single(bwa_path, sra, fastq1, description="", options=""):
    """Align one single-end FASTQ with ``bwa mem`` and return the SAM path.

    Args:
        bwa_path: Output directory prefix (concatenated as-is with the file name).
        sra: Sample accession, used as the output file-name prefix.
        fastq1: Path of the FASTQ file to align.
        description: Free-text tag inserted into the output file name.
        options: Extra ``bwa mem`` options, passed verbatim.

    Returns:
        ``<bwa_path><sra>_reads_fastp_<ref><description>_<ALIGNER>.sam``.

    Raises:
        RuntimeError: ``bwa mem`` exited with a non-zero status.

    NOTE: ``REF_NAME.split('.fa')[0]`` leaves a name ending in ``.fna``
    untouched, so the extension stays in the output name. Kept as-is so
    existing output file names do not change.
    """
    ref = REF_NAME.split('.fa')[0]
    out_file = f"{bwa_path}{sra}_reads_fastp_{ref}{description}_{ALIGNER}.sam"
    # Options go before the positional arguments (`bwa mem [options] <idxbase>
    # <in1.fq>`), and the shell redirect goes last.
    args = [BWA_MEM, "mem", "-t", str(NUM_THREADS)]
    if options:
        args.append(options)
    args += [REF_PATH + REF_NAME, fastq1]
    cmd = " ".join(str(arg) for arg in args) + f" > {out_file}"
    print(cmd)
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) from e
    return out_file

In [9]:
def gatk_sort(bwa_mem_out):
    """Coordinate-sort a SAM file with the GATK ``SortSam`` tool.

    The output path is the input path up to its first ``.sam`` with
    ``_gatk_sorted.sam`` appended. ``GATK_JAR`` and ``GATK_TEMP`` are read
    from the notebook configuration.

    Returns:
        Path of the sorted SAM file.

    Raises:
        RuntimeError: the java command exited with a non-zero status.
    """
    sorted_sam = bwa_mem_out.split('.sam')[0] + '_gatk_sorted.sam'
    sort_cmd = (
        f"java -jar {GATK_JAR}  SortSam INPUT={bwa_mem_out} OUTPUT={sorted_sam} "
        f"SORT_ORDER=coordinate VALIDATION_STRINGENCY=SILENT TMP_DIR={GATK_TEMP}"
    )
    try:
        subprocess.check_call(sort_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        failure = "command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output)
        raise RuntimeError(failure)
    return sorted_sam

In [10]:
def gatk_metrics(gatk_out):
    """Run GATK ``MarkDuplicates`` on a sorted SAM and return the BAM path.

    Both output names are derived from the input path up to its first
    ``.sam``: ``<stem>_marked.bam`` for the alignments and
    ``<stem>_metrics.txt`` for the duplication metrics.

    Returns:
        Path of the duplicate-marked BAM file.

    Raises:
        RuntimeError: the java command exited with a non-zero status.
    """
    stem = gatk_out.split('.sam')[0]
    marked_bam = stem + '_marked.bam'
    metrics_txt = stem + '_metrics.txt'
    mark_cmd = (
        f"java -Xmx64G -jar {GATK_JAR}  MarkDuplicates INPUT={gatk_out} "
        f"OUTPUT={marked_bam} METRICS_FILE={metrics_txt} ASSUME_SORTED=true "
        f"VALIDATION_STRINGENCY=SILENT TMP_DIR={GATK_TEMP}"
    )
    try:
        subprocess.check_call(mark_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        failure = "command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output)
        raise RuntimeError(failure)
    return marked_bam

In [11]:
def index_bam(gatk_bam_file):
    """Index ``gatk_bam_file`` with ``samtools index``.

    The executable is ``SAMTOOLS_PATH`` + ``samtools``.

    Raises:
        RuntimeError: samtools exited with a non-zero status.
    """
    index_cmd = "{}samtools index {}".format(SAMTOOLS_PATH, gatk_bam_file)
    try:
        subprocess.check_call(index_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        failure = "command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output)
        raise RuntimeError(failure)

In [12]:
def remove_file(file_to_del):
    """Delete ``file_to_del`` by running ``rm`` through the shell.

    The path is interpolated into the command unquoted. A non-zero exit
    status from ``rm`` surfaces as ``subprocess.CalledProcessError``.
    """
    rm_command = "rm {}".format(file_to_del)
    subprocess.check_call(rm_command, shell=True)

In [13]:
def run_bamstats(bam_file, stats_out):
    """Run BAMStats on ``bam_file`` and write its report to ``stats_out``.

    Best-effort step: when the java command exits with a non-zero status the
    failure is printed and the function returns normally.
    """
    stats_cmd = (
        f'java -Xmx48g -Djava.awt.headless=true -jar {BAMSTATS_JAR} '
        f'-i {bam_file} -m -q -o {stats_out}'
    )
    try:
        subprocess.check_call(stats_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        print(f"command '{err.cmd}' return with error (code {err.returncode}): {err.output}")

In [14]:
def run_bamdst(bed_file, out_path, bam_in):
    """Run ``bamdst`` for ``bam_in`` over the regions in ``bed_file``.

    Output goes to the directory ``out_path``. The executable is
    ``BAMDST_PATH`` + ``bamdst``. Best-effort step: a non-zero exit status is
    printed rather than raised.
    """
    bamdst_cmd = "{}bamdst -p {} -o {} {}".format(BAMDST_PATH, bed_file, out_path, bam_in)
    try:
        subprocess.check_call(bamdst_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        print(f"command '{err.cmd}' return with error (code {err.returncode}): {err.output}")

In [15]:
def create_bed(fasta_file, bed_file):
    """Write a BED description of ``fasta_file`` to ``bed_file``.

    Runs ``faidx --transform bed`` through the shell and redirects its
    standard output to ``bed_file``. ``subprocess.CalledProcessError``
    propagates on a non-zero exit status.
    """
    faidx_cmd = 'faidx --transform bed {} > {}'.format(fasta_file, bed_file)
    subprocess.check_call(faidx_cmd, shell=True)

In [16]:
def run_idxstats(gatk_bam_file, flagstat_file):
    """Write ``samtools idxstats`` output for ``gatk_bam_file`` to a file.

    ``flagstat_file`` is the destination path that the idxstats output is
    redirected to. Best-effort step: a non-zero exit status is printed
    rather than raised.
    """
    idxstats_cmd = "{}samtools idxstats {} >{}".format(SAMTOOLS_PATH, gatk_bam_file, flagstat_file)
    try:
        subprocess.check_call(idxstats_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        print(f"command '{err.cmd}' return with error (code {err.returncode}): {err.output}")

In [17]:
def bam_to_bed(gatk_bam_file):
    """Convert a BAM file to BED with ``bedtools bamtobed``.

    The BED file is written next to the input: the path up to its first
    ``.bam`` with ``.bed`` appended. ``subprocess.CalledProcessError``
    propagates on a non-zero exit status.
    """
    bed_target = gatk_bam_file.split('.bam')[0] + '.bed'
    convert_cmd = '{}bedtools bamtobed -i {} >{}'.format(BEDTOOLS_PATH, gatk_bam_file, bed_target)
    subprocess.check_call(convert_cmd, shell=True)

In [18]:
def run_flagtools(gatk_bam_file):
    """Write ``samtools flagstat`` output for a BAM file.

    The report goes to ``<bam path up to first '.bam'>_flagtools.txt``.
    ``subprocess.CalledProcessError`` propagates on a non-zero exit status.
    """
    report_path = gatk_bam_file.split('.bam')[0] + '_flagtools.txt'
    flagstat_cmd = '{}samtools flagstat {} >{}'.format(SAMTOOLS_PATH, gatk_bam_file, report_path)
    subprocess.check_call(flagstat_cmd, shell=True)

In [19]:
def run_samtools_stats(gatk_bam_file, ref_genome):
    """Write ``samtools stats`` output for a BAM file.

    ``ref_genome`` is passed as ``--reference``. The report goes to
    ``<bam path up to first '.bam'>_samtools_stats.txt``.
    ``subprocess.CalledProcessError`` propagates on a non-zero exit status.
    """
    report_path = gatk_bam_file.split('.bam')[0] + '_samtools_stats.txt'
    stats_cmd = '{}samtools stats {} --reference {} >{}'.format(
        SAMTOOLS_PATH, gatk_bam_file, ref_genome, report_path
    )
    subprocess.check_call(stats_cmd, shell=True)

In [20]:
def prep_bed(create=False):
    """Return the BED path for the configured reference, optionally building it.

    The BED file lives in ``<REF_PATH>bed/`` and is named after ``REF_NAME``
    up to its first ``.fa``, plus ``.bed``. The ``bed/`` directory is created
    if missing. When ``create`` is true, ``create_bed`` is called to generate
    the file from the reference FASTA.
    """
    bed_dir = REF_PATH + 'bed/'
    bed_file = bed_dir + REF_NAME.split('.fa')[0] + '.bed'
    pathlib.Path(bed_dir).mkdir(exist_ok=True)
    if not create:
        return bed_file
    create_bed(REF_PATH + REF_NAME, bed_file)
    return bed_file

In [21]:
def bam_to_fastq():
    """Convert each sample's duplicate-marked BAM back to FASTQ with bedtools.

    For every accession in ``SRAs`` the input is
    ``<PRJ_OUT_PATH><sra>/bowtie2/<sra>_reads_<ref>_<ALIGN_OPTS_NAME>_<ALIGNER>_gatk_sorted_marked.bam``
    and the output is the same stem with ``_bedtools.fq`` appended. The
    output path is printed before each conversion.

    The bedtools executable is taken from ``BEDTOOLS_PATH``, as in
    ``bam_to_bed``, instead of a hard-coded copy of that path.

    NOTE(review): the ``bowtie2`` sub-directory and the file-name pattern
    (no ``fastp_`` segment) differ from what ``run_bwa`` writes under
    ``<ALIGNER>/``. Presumably this targets BAMs produced by a separate
    bowtie2 workflow; confirm before using it on output from this notebook.

    Raises:
        subprocess.CalledProcessError: bedtools exited with a non-zero status.
    """
    ref = REF_NAME.split('.fa')[0]
    for sra in SRAs:
        bwa_path = PRJ_OUT_PATH + sra + '/bowtie2/'
        # e.g. SRR10168373_reads_mitochondria_mamals_1_local_G_298_0_bowtie2_gatk_sorted_marked.bam
        stem = f"{bwa_path}{sra}_reads_{ref}_{ALIGN_OPTS_NAME}_{ALIGNER}_gatk_sorted_marked"
        bam_file = f"{stem}.bam"
        fastq_file = f"{stem}_bedtools.fq"
        print(fastq_file)
        cmd = f'{BEDTOOLS_PATH}bedtools bamtofastq -i {bam_file} -fq {fastq_file}'
        subprocess.check_call(cmd, shell=True)

In [22]:
def align_idx_bamdst(idxstat_file, bamdst_report):
    """Merge two idxstats columns into a bamdst per-chromosome report.

    The first line of ``bamdst_report`` is treated as the header: its first
    field becomes ``Accession`` and ``ref_length`` and ``N`` are inserted as
    the second and third columns. Every following row is kept only if its
    second or fifth tab-separated field is greater than zero. For kept rows
    whose first field matches a name in ``idxstat_file``, that idxstats
    line's second and third fields are inserted as the new second and third
    columns. Rows with no match are written without the extra columns. When
    ``UNIVEC_ALIGNED`` is true, the ``gnl_pipe_uv_pipe_`` prefix is removed
    from the name and ``_colon_`` is replaced by ``:``.

    Blank lines in the report are skipped.

    Args:
        idxstat_file: Path of the tab-separated idxstats file.
        bamdst_report: Path of the tab-separated bamdst report.

    Returns:
        Path of the written file: ``bamdst_report`` up to its first ``.txt``
        (or the whole path if there is none) followed by ``_N.txt``.
    """
    # name -> (second field, third field). Built once so each report row is a
    # dictionary lookup instead of a scan over the whole idxstats file.
    idx_by_name = {}
    with open(idxstat_file, mode='r') as f_idx:
        for i_line in f_idx:
            i_words = i_line.split('\t')
            if len(i_words) < 3:
                continue
            idx_by_name.setdefault(i_words[0].strip(), (i_words[1], i_words[2]))

    with open(bamdst_report, mode='r') as f_bamdst:
        bamdst_lines = f_bamdst.readlines()

    fixed_lines = []
    for count, b_line in enumerate(bamdst_lines):
        b_words = b_line.split('\t')
        if count == 0:
            b_words[0] = 'Accession'
            b_words.insert(1, 'ref_length')
            b_words.insert(2, 'N')
            fixed_lines.append('\t'.join(b_words))
            continue
        if not b_line.strip():
            # A blank line has no numeric fields to test.
            continue
        if float(b_words[1]) > 0.0 or float(b_words[4]) > 0.0:
            idx_fields = idx_by_name.get(b_words[0].strip())
            if idx_fields is not None:
                # Slice assignment inserts both values at positions 1 and 2.
                b_words[1:1] = idx_fields
            if UNIVEC_ALIGNED:
                b_words[0] = b_words[0].replace('gnl_pipe_uv_pipe_', '')
                b_words[0] = b_words[0].replace('_colon_', ':')
            fixed_lines.append('\t'.join(b_words))

    new_report = bamdst_report.split('.txt')[0] + '_N.txt'
    with open(new_report, "w") as f_out:
        f_out.writelines(fixed_lines)
    return new_report

In [23]:
def workflow(bed_file=None, sra_dir=False):
    """Run the paired-end alignment pipeline for every accession in ``SRAs``.

    Per sample: ``run_bwa`` -> ``gatk_sort`` -> ``gatk_metrics`` ->
    ``index_bam`` -> delete the two intermediate SAM files ->
    ``run_bamstats`` -> (only if ``bed_file`` is given) ``run_bamdst``.
    Output is written under ``<PRJ_OUT_PATH><sra>/<ALIGNER>/``; missing
    directories, including parents, are created.

    Args:
        bed_file: BED file passed to ``run_bamdst``; when falsy that step is
            skipped.
        sra_dir: When true the FASTQ files are read from
            ``<BASE_PATH><sra>/``, otherwise directly from ``<BASE_PATH>``.

    Raises:
        FileNotFoundError: the first FASTQ of a pair is missing and
            ``RUN_FASTP`` is false.
        RuntimeError: propagated from ``run_bwa`` when alignment fails.

    Failures in the steps after alignment are printed and the loop moves on
    to the next sample.
    """
    for idx, sra in enumerate(SRAs):
        # FASTQ_FILES holds two consecutive entries per accession.
        fastq1 = FASTQ_FILES[2 * idx]
        fastq2 = FASTQ_FILES[(2 * idx) + 1]
        print(f'{sra}')

        # One call creates PRJ_OUT_PATH, the sample directory and the aligner
        # directory; parents=True also covers a missing project root.
        bwa_path = PRJ_OUT_PATH + sra + f'/{ALIGNER}/'
        pathlib.Path(bwa_path).mkdir(parents=True, exist_ok=True)

        if sra_dir:
            f1path = BASE_PATH + sra + '/' + fastq1
            f2path = BASE_PATH + sra + '/' + fastq2
        else:
            f1path = BASE_PATH + fastq1
            f2path = BASE_PATH + fastq2

        if not os.path.isfile(f1path):
            if RUN_FASTP:
                run_fastp(f1path, f2path, sra)
            else:
                # FileNotFoundError subclasses Exception, so handlers written
                # for the former bare `Exception` still match.
                raise FileNotFoundError(f'check file exists: {f1path}')

        bwa_mem_out = run_bwa(bwa_path, sra, f1path, f2path, description=DESCRIPTION, options=OPTIONS)
        try:
            gatk_out = gatk_sort(bwa_mem_out)
            gatk_bam_file = gatk_metrics(gatk_out)
            index_bam(gatk_bam_file)
            # Only the duplicate-marked BAM is kept.
            remove_file(bwa_mem_out)
            remove_file(gatk_out)

            out_path = bwa_path + f'bamstats_{ALIGN_NAME}/'
            pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)
            stats_out = out_path + sra + BAMSTATS_POST
            run_bamstats(gatk_bam_file, stats_out)
            if bed_file:
                out_path = bwa_path + f'bamdst_{ALIGN_NAME}/'
                pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)
                run_bamdst(bed_file, out_path, gatk_bam_file)
        except Exception as e:
            # Deliberate best-effort: report and continue with the next sample.
            print(e)

In [24]:
def workflow_single(bed_file=None):
    """Run the single-end alignment pipeline for every accession in ``SRAsingle``.

    Per sample: ``run_bwa_single`` -> ``gatk_sort`` -> ``gatk_metrics`` ->
    ``index_bam`` -> delete the two intermediate SAM files ->
    ``run_samtools_stats`` -> ``run_flagtools`` -> ``run_idxstats`` ->
    ``run_bamstats`` -> (only if ``bed_file`` is given) ``run_bamdst`` and
    ``align_idx_bamdst`` on its ``chromosomes.report``.
    Output is written under ``<PRJ_OUT_PATH><sra>/<ALIGNER>/``; missing
    directories, including parents, are created.

    Args:
        bed_file: BED file passed to ``run_bamdst``; when falsy the bamdst
            steps are skipped.

    Raises:
        FileNotFoundError: the sample's FASTQ file does not exist. Raised
            before the aligner is started on a file known to be missing.
        RuntimeError: propagated from the alignment / GATK / indexing helpers.
    """
    for idx, sra in enumerate(SRAsingle):
        fastq1 = FASTQ_FILES_SINGLE[idx]
        print(f'{sra}')

        # One call creates PRJ_OUT_PATH, the sample directory and the aligner
        # directory; parents=True also covers a missing project root.
        bwa_path = PRJ_OUT_PATH + sra + f'/{ALIGNER}/'
        pathlib.Path(bwa_path).mkdir(parents=True, exist_ok=True)

        f1path = BASE_PATH + fastq1
        if not os.path.isfile(f1path):
            raise FileNotFoundError(f'error no file: {f1path}')

        bwa_mem_out = run_bwa_single(bwa_path, sra, f1path, description=DESCRIPTION, options=OPTIONS)
        gatk_out = gatk_sort(bwa_mem_out)
        gatk_bam_file = gatk_metrics(gatk_out)
        index_bam(gatk_bam_file)
        # Only the duplicate-marked BAM is kept.
        remove_file(bwa_mem_out)
        remove_file(gatk_out)

        run_samtools_stats(gatk_bam_file, REF_PATH + REF_NAME)
        run_flagtools(gatk_bam_file)
        idxstat_file = gatk_bam_file.split(f'{BAM_POSTFIX}.bam')[0] + 'idxstats.txt'
        run_idxstats(gatk_bam_file, idxstat_file)

        out_path = bwa_path + f'bamstats_reads_{ALIGN_NAME}_{ALIGN_OPTS_NAME}/'
        pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)
        stats_out = out_path + sra + BAMSTATS_POST
        run_bamstats(gatk_bam_file, stats_out)
        if bed_file:
            out_path = bwa_path + f'bamdst_reads_{ALIGN_NAME}_{ALIGN_OPTS_NAME}/'
            pathlib.Path(out_path).mkdir(parents=True, exist_ok=True)
            run_bamdst(bed_file, out_path, gatk_bam_file)
            bamdst_data = out_path + 'chromosomes.report'
            # Writes the merged report next to chromosomes.report.
            align_idx_bamdst(idxstat_file, bamdst_data)

In [25]:
# Locations of the external tools invoked through the shell.
# BWA_MEM, GATK_JAR and BAMSTATS_JAR are full paths to an executable / jar.
# The *_PATH values are prefixes concatenated directly with the executable
# name (e.g. SAMTOOLS_PATH + 'samtools'), so a non-empty value must end in '/'.
# An empty prefix leaves the bare command name for the shell to resolve.
BWA_MEM="~/apps/bwa-0.7.17/bwa"
BBMAP_PATH='~/apps/bbmap/'
GATK_JAR='~/apps/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar'
SAMTOOLS_PATH=''
BAMSTATS_JAR='~/apps/BAMStats-1.25/BAMStats-1.25.jar'
BAMDST_PATH='/mnt/1TB_0/Data/Code/external/bamdst/'
BEDTOOLS_PATH='~/apps/bedtools/'
# Passed as TMP_DIR to the GATK SortSam / MarkDuplicates commands.
GATK_TEMP='/mnt/1TB_0/temp/gatk'

In [26]:
# Project and data locations for this run.
PRJ='PRJCA002517'
# Per-sample output directories are created under PRJ_OUT_PATH by workflow().
PRJ_OUT_PATH=f'/mnt/8TB_2/Data/Assembly/{PRJ}/'
# Directory prefix that workflow() prepends to each FASTQ file name.
BASE_PATH=f'/mnt/8TB_2/Data/bigd/{PRJ}/'

# Reference FASTA (REF_PATH + REF_NAME) that is indexed and aligned against.
REF_NAME='YNU_ManJav_2.0.fna'
REF_PATH='/mnt/1TB_0/Data/Code/code/PRJNA901878/'
# Reference name without the '.fna' extension; used in stats directory and file names.
ALIGN_NAME=REF_NAME.split('.fna')[0]

# bwa mem thread count (-t), plus the file-name tag and extra options given to run_bwa().
NUM_THREADS=32
DESCRIPTION='_default'
OPTIONS=""
ALIGN_OPTS_NAME='default'

# When False, workflow() raises instead of running fastp if the first FASTQ is missing.
RUN_FASTP=False

# BAMSTATS_POST is built from ALIGN_NAME when this line runs, so re-run the
# whole cell after changing REF_NAME.
BAMSTATS_POST=f'_{ALIGN_NAME}_bamstats.txt'
BAM_POSTFIX='gatk_sorted_marked'
# When True, align_idx_bamdst() rewrites the UniVec-style sequence names.
UNIVEC_ALIGNED=False
# Used as the output sub-directory name and as a tag in output file names.
ALIGNER='bwamem'

In [27]:

# Accessions of the paired-end runs to process.
SRAs=['CRR477154','CRR477155','CRR477156','CRR477157']

# Two file names per accession, '_f1_fastp.fq' then '_r2_fastp.fq', in SRAs
# order. workflow() reads them as FASTQ_FILES[2*i] and FASTQ_FILES[2*i+1].
FASTQ_FILES=[
    accession+suffix
    for accession in SRAs
    for suffix in ('_f1_fastp.fq', '_r2_fastp.fq')
]

In [28]:
#SRAsingle=['SRR11093270','SRR11093271']
#SRAsingle=['journal_ppat_1009664_s002']

#FASTQ_FILES_SINGLE=[]
#for f in SRAsingle:
#    FASTQ_FILES_SINGLE.append(f+'.fq')
#FASTQ_FILES_SINGLE=['GD-P2S_NDX550382_RUO.fq', 'GD-P2S_NB501248AR.fq']
#FASTQ_FILES_SINGLE=['GD-P2S.fq']

In [29]:
#assert len(FASTQ_FILES)==2*len(SRAs)
# Build the BWA index for the reference currently set in REF_PATH / REF_NAME.
create_index()

In [30]:

#bed_file=prep_bed(create=True)
# Run the paired-end pipeline for every accession in SRAs. No BED file is
# passed, so the bamdst step inside workflow() is skipped.
workflow()

CRR477154
CRR477155
CRR477156
CRR477157


In [31]:
# Full configuration for the next run. Compared with the earlier configuration
# cell only REF_NAME changes (now Vero_WHO_p1.0.fna); the derived names below
# are recomputed from it.
PRJ='PRJCA002517'
PRJ_OUT_PATH=f'/mnt/8TB_2/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_2/Data/bigd/{PRJ}/'

# Reference FASTA (REF_PATH + REF_NAME) that is indexed and aligned against.
REF_NAME='Vero_WHO_p1.0.fna'
REF_PATH='/mnt/1TB_0/Data/Code/code/PRJNA901878/'
# Reference name without the '.fna' extension; used in stats directory and file names.
ALIGN_NAME=REF_NAME.split('.fna')[0]

NUM_THREADS=32
DESCRIPTION='_default'
OPTIONS=""
ALIGN_OPTS_NAME='default'

RUN_FASTP=False

# Recomputed here because it embeds ALIGN_NAME.
BAMSTATS_POST=f'_{ALIGN_NAME}_bamstats.txt'
BAM_POSTFIX='gatk_sorted_marked'
UNIVEC_ALIGNED=False
ALIGNER='bwamem'

In [32]:
# Index the reference currently set in REF_PATH / REF_NAME, then run the
# paired-end pipeline for every accession in SRAs (no BED file, so bamdst is skipped).
create_index()
#bed_file=prep_bed(create=True)
workflow()

CRR477154
CRR477155
CRR477156
CRR477157


In [33]:
# Full configuration for the next run. Compared with the earlier configuration
# cells only REF_NAME changes (now GRCh38.p13.fna); the derived names below
# are recomputed from it.
PRJ='PRJCA002517'
PRJ_OUT_PATH=f'/mnt/8TB_2/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_2/Data/bigd/{PRJ}/'

# Reference FASTA (REF_PATH + REF_NAME) that is indexed and aligned against.
REF_NAME='GRCh38.p13.fna'
REF_PATH='/mnt/1TB_0/Data/Code/code/PRJNA901878/'
# Reference name without the '.fna' extension; used in stats directory and file names.
ALIGN_NAME=REF_NAME.split('.fna')[0]

NUM_THREADS=32
DESCRIPTION='_default'
OPTIONS=""
ALIGN_OPTS_NAME='default'

RUN_FASTP=False

# Recomputed here because it embeds ALIGN_NAME.
BAMSTATS_POST=f'_{ALIGN_NAME}_bamstats.txt'
BAM_POSTFIX='gatk_sorted_marked'
UNIVEC_ALIGNED=False
ALIGNER='bwamem'

In [34]:
# Index the reference currently set in REF_PATH / REF_NAME, then run the
# paired-end pipeline for every accession in SRAs (no BED file, so bamdst is skipped).
create_index()
#bed_file=prep_bed(create=True)
workflow()

CRR477154
CRR477155
CRR477156
CRR477157
