In [69]:
import pathlib
import subprocess
import os
import pandas as pd
from Bio import Entrez
from Bio import SeqIO

De novo assembled contig alignment using Minimap2

https://lh3.github.io/minimap2/

Used for Concatref and aligned inputs to XenoFilteR workflows

In [70]:
# Widen the notebook container to 95% of the window so long command lines and
# wide tables stay readable. `display` and `HTML` come from the public
# `IPython.display` module; importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [71]:
def log_cmd(log_file, cmd, file_opt='a'):
    """Write a command string, followed by a newline, to a log file.

    Args:
        log_file: path of the log file to write.
        cmd: command text to record.
        file_opt: mode passed to ``open`` ('a' appends, 'w' overwrites).

    Raises:
        RuntimeError: if the log file cannot be opened or written.
    """
    try:
        # The context manager closes the handle even when a write fails.
        with open(log_file, file_opt) as f:
            f.write(cmd)
            f.write('\n')
    except (OSError, TypeError, ValueError) as e:
        # These exceptions carry no cmd/returncode/output attributes, so report
        # the log path and the underlying error instead.
        raise RuntimeError("could not write command to log file '{}': {}".format(log_file, e)) from e

In [72]:
def create_fq(contigs_path):
    """Convert the assembler's contig FASTA into FASTQ with BBMap ``reformat.sh``.

    The input and output names depend on the module-level `de_novo` setting:
    ``{CONTIG_MER}.contigs.fa`` for 'MEGAHIT', ``{coronaspades_name}.fasta`` for
    'coronaspades' and ``{spades_name}.fasta`` for 'spades'. The FASTQ is written
    next to the FASTA with a ``.fq`` extension.

    Args:
        contigs_path: directory holding the contigs; it is prepended directly to
            the file name, so it must end with a path separator.

    Raises:
        ValueError: if `de_novo` is not one of the supported assemblers.
        RuntimeError: if ``reformat.sh`` exits with a non-zero status.
    """
    if de_novo=='MEGAHIT':
        cmd= f"{BBMAP_PATH}reformat.sh in={contigs_path}{CONTIG_MER}.contigs.fa out={contigs_path}{CONTIG_MER}.contigs.fq"
    elif de_novo=='coronaspades':
        cmd= f"{BBMAP_PATH}reformat.sh in={contigs_path}{coronaspades_name}.fasta out={contigs_path}{coronaspades_name}.fq"
    elif de_novo=='spades':
        cmd= f"{BBMAP_PATH}reformat.sh in={contigs_path}{spades_name}.fasta out={contigs_path}{spades_name}.fq"
    else:
        # Without this branch `cmd` would be unbound and fail with UnboundLocalError.
        raise ValueError(f"unsupported de_novo assembler: {de_novo!r}")
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))

In [73]:
def run_minimap2(contigs, bwa_path, sra):
    """Align contigs to the configured reference with minimap2 and return the SAM path.

    The output name is built from the run id, the preprocessing label, the
    assembler-specific labels, the reference stem and `MINI_OPTS_NAME`. After a
    successful run the exact command line is written to ``<sam stem>_log.txt``.

    Args:
        contigs: path of the contigs file to align.
        bwa_path: output directory prefix; prepended directly to the file name,
            so it must end with a path separator.
        sra: run identifier used as the first component of the output name.

    Returns:
        Path of the SAM file written by minimap2.

    Raises:
        ValueError: if `de_novo` is not one of the supported assemblers.
        RuntimeError: if minimap2 exits with a non-zero status.
    """
    ref=REF_NAME.split(REF_EXT)[0]
    if de_novo=='MEGAHIT':
        out_file= f"{bwa_path}{sra}_{PREPROC}_{CONTIGS_NAME}_{CONTIG_MER}_{ref}_{ALIGN_OPTS_NAME}_{MINI_OPTS_NAME}_minimap2.sam"
    elif de_novo=='coronaspades':
        out_file= f"{bwa_path}{sra}_{PREPROC}_coronaspades_{coronaspades_code}_{ref}_{MINI_OPTS_NAME}_minimap2.sam"
    elif de_novo=='spades':
        out_file= f"{bwa_path}{sra}_{PREPROC}_spades_{spades_code}_{ref}_{MINI_OPTS_NAME}_minimap2.sam"
    else:
        # Without this branch `out_file` would be unbound further down.
        raise ValueError(f"unsupported de_novo assembler: {de_novo!r}")
    # Options go before the positional arguments and the redirect comes last
    # (minimap2 usage: minimap2 [options] <target> <query>).
    cmd = f"{MINIMAP2_PATH}minimap2 -a {MINIMAP_PARAMS} {REF_PATH+REF_NAME} {contigs} > {out_file}"
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
    log_file=out_file.split('.sam')[0]+'_log.txt'
    log_cmd(log_file, cmd, file_opt='w')
    return out_file

In [74]:
def samtools_sort(file_to_sort, out_type='BAM'):
    """Sort an alignment file with ``samtools sort`` and return the sorted path.

    The output name is the input up to its first '.sam', plus '_sorted.bam' when
    `out_type` is 'BAM' and '_sorted.sam' for any other value. `out_type` is also
    passed to samtools as the ``-O`` output format.

    Raises:
        RuntimeError: if samtools exits with a non-zero status.
    """
    stem = file_to_sort.split('.sam')[0]
    suffix = '_sorted.bam' if out_type == 'BAM' else '_sorted.sam'
    sorted_path = stem + suffix
    sort_cmd = f'{SAMTOOLS_PATH}samtools sort -O {out_type} -o {sorted_path} {file_to_sort}'
    try:
        subprocess.check_call(sort_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output))
    return sorted_path

In [75]:
def run_samtools_stats(gatk_bam_file, ref_genome):
    """Run ``samtools stats`` on a BAM file against a reference genome.

    The report is redirected to ``<prefix>_samtools_stats.txt``, where
    ``<prefix>`` is the BAM path up to its first '.bam'. A non-zero exit status
    propagates as ``subprocess.CalledProcessError``.
    """
    stats_path = '{}_samtools_stats.txt'.format(gatk_bam_file.split('.bam')[0])
    stats_cmd = f'{SAMTOOLS_PATH}samtools stats {gatk_bam_file} --reference {ref_genome} >{stats_path}'
    subprocess.check_call(stats_cmd, shell=True)

In [76]:
def samtools_coverage(sam_input, in_type='.sam'):
    """Run ``samtools coverage`` on an alignment file and return the report path.

    The report is written to ``<prefix>_coverage.out``, where ``<prefix>`` is
    everything in `sam_input` before the first occurrence of `in_type`.

    Raises:
        RuntimeError: if samtools exits with a non-zero status.
    """
    coverage_path = '{}_coverage.out'.format(sam_input.split(in_type)[0])
    coverage_cmd = f"{SAMTOOLS_PATH}samtools coverage {sam_input} -o {coverage_path}"
    try:
        subprocess.check_call(coverage_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output))
    return coverage_path

In [77]:
def gatk_sort(bwa_mem_out):
    """Coordinate-sort a SAM file with the SortSam tool of the GATK jar.

    The output is written to ``<prefix>_gatk_sorted.sam``, where ``<prefix>`` is
    the input path up to its first '.sam'.

    Returns:
        Path of the sorted SAM file.

    Raises:
        RuntimeError: if the java command exits with a non-zero status.
    """
    sorted_sam = '{}_gatk_sorted.sam'.format(bwa_mem_out.split('.sam')[0])
    sort_cmd = (
        f"java -jar {GATK_JAR} SortSam INPUT={bwa_mem_out} OUTPUT={sorted_sam} "
        "SORT_ORDER=coordinate VALIDATION_STRINGENCY=SILENT"
    )
    try:
        subprocess.check_call(sort_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output))
    return sorted_sam

In [78]:
def gatk_metrics(gatk_out):
    """Run the MarkDuplicates tool of the GATK jar on a sorted SAM file.

    With ``<prefix>`` being the input path up to its first '.sam', the marked
    alignments go to ``<prefix>_marked.bam`` and the metrics to
    ``<prefix>_metrics.txt``. The JVM is started with ``-Xmx64G``.

    Returns:
        Path of the marked BAM file.

    Raises:
        RuntimeError: if the java command exits with a non-zero status.
    """
    prefix = gatk_out.split('.sam')[0]
    marked_bam = prefix + '_marked.bam'
    metrics_path = prefix + '_metrics.txt'
    mark_cmd = (
        f"java -Xmx64G -jar {GATK_JAR} MarkDuplicates INPUT={gatk_out} OUTPUT={marked_bam} "
        f"METRICS_FILE={metrics_path} ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT"
    )
    try:
        subprocess.check_call(mark_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output))
    return marked_bam

In [79]:
def index_bam(gatk_bam_file):
    """Index a BAM file with ``samtools index``.

    Raises:
        RuntimeError: if samtools exits with a non-zero status.
    """
    index_cmd = "{}samtools index {}".format(SAMTOOLS_PATH, gatk_bam_file)
    try:
        subprocess.check_call(index_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output))

In [80]:
def remove_file(file_to_del):
    """Delete a file by running ``rm`` through the shell (best effort).

    A failing ``rm`` is reported by printing the ``CalledProcessError``; no
    exception is raised to the caller.
    """
    rm_cmd = "rm {}".format(file_to_del)
    try:
        subprocess.check_call(rm_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        print(err)

In [81]:
def run_bamstats(bam_file, stats_out):
    """Run the BAMStats jar on a BAM file, writing its report to `stats_out`.

    The JVM is started headless with ``-Xmx48g``. A non-zero exit status is
    printed rather than raised, so callers continue after a failure.
    """
    stats_cmd = (
        f'java -Xmx48g -Djava.awt.headless=true -jar {BAMSTATS_JAR} '
        f'-i {bam_file} -m -q -o {stats_out}'
    )
    try:
        subprocess.check_call(stats_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        print("command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output))

In [82]:
def create_bed(fasta_file, bed_file):
    """Create a BED file for `fasta_file` by running ``faidx --transform bed``.

    The command's standard output is redirected by the shell into `bed_file`.
    A non-zero exit status propagates as ``subprocess.CalledProcessError``.
    """
    bed_cmd = 'faidx --transform bed {} > {}'.format(fasta_file, bed_file)
    subprocess.check_call(bed_cmd, shell=True)

In [83]:
def run_bamdst(bed_file, out_path, bam_in):
    """Run ``bamdst`` on a BAM file for the regions listed in a BED file.

    Trailing '/' characters are stripped from `out_path` before it is passed to
    ``-o``. A non-zero exit status is printed rather than raised.
    """
    out_dir = out_path.rstrip("/")
    bamdst_cmd = '{}bamdst -p {} -o {} {}'.format(BAMDST_PATH, bed_file, out_dir, bam_in)
    try:
        subprocess.check_call(bamdst_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        print("command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output))

In [84]:
def match_taxa(cov_file, taxa_file):
    """Attach sequence descriptions to covered references and write them to CSV.

    Reads the tab-separated coverage table `cov_file`, keeps rows whose
    'coverage' is greater than 0 (sorted by descending coverage) and joins them
    on '#rname' with the accession/description pairs parsed from `taxa_file`.
    The result is written to ``<cov_file>_taxa.csv``.

    Args:
        cov_file: tab-separated table with a header row containing at least
            '#rname' and 'coverage' columns.
        taxa_file: text file with one ``<accession> <description>`` entry per
            line; the first space separates the two fields.
    """
    df = pd.read_csv(cov_file, sep='\t', header=0)
    df = df.sort_values(by='coverage', ascending=False)
    df = df[df['coverage'] > 0]

    records = []
    # The context manager closes the taxa file once parsing is done.
    with open(taxa_file, "r") as f:
        for line in f:
            # Drop the line terminator so descriptions do not end in a newline.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue  # blank lines carry no accession
            # partition yields an empty description when the line has no space,
            # where indexing the result of split would raise IndexError.
            accession, _, descr = line.partition(" ")
            records.append((accession, descr))
    df_taxa = pd.DataFrame(records, columns=['#rname', 'description'])

    # Inner join: only references present in both tables are written out.
    df_merge = pd.merge(df, df_taxa, on=['#rname'])
    # NOTE(review): the name keeps the full coverage file name, including its
    # '.out' suffix — confirm whether '<stem>_taxa.csv' was intended.
    out_file = cov_file+'_taxa.csv'
    df_merge.to_csv(out_file)

In [85]:
def run_flagtools(bam_file):
    """Run ``samtools flagstat`` on a BAM file.

    The report is redirected to ``<prefix>_flagtools.txt``, where ``<prefix>``
    is the BAM path up to its first '.bam'. A non-zero exit status propagates as
    ``subprocess.CalledProcessError``.
    """
    report_path = '{}_flagtools.txt'.format(bam_file.split('.bam')[0])
    flagstat_cmd = '{}samtools flagstat {} >{}'.format(SAMTOOLS_PATH, bam_file, report_path)
    subprocess.check_call(flagstat_cmd, shell=True)

In [86]:
def run_idxstats(bam_file, flagstat_file):
    """Run ``samtools idxstats`` on a BAM file, redirecting output to a file.

    Despite its name, `flagstat_file` is simply the path that receives the
    idxstats report. A non-zero exit status is printed rather than raised.
    """
    idxstats_cmd = "{}samtools idxstats {} >{}".format(SAMTOOLS_PATH, bam_file, flagstat_file)
    try:
        subprocess.check_call(idxstats_cmd, shell=True)
    except subprocess.CalledProcessError as err:
        print("command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output))

In [87]:
def setup_files(sra):
    """Locate the contigs for a run and prepare its minimap2 output directory.

    The contig locations depend on the module-level `de_novo` setting. If the
    FASTQ version of the contigs is missing it is generated with `create_fq`.
    The directory ``{PRJ_OUT_PATH}{sra}/minimap2/`` is created when absent.

    Args:
        sra: run identifier; its directory is expected under `PRJ_OUT_PATH`.

    Returns:
        Tuple ``(contigs, contigsfa, alignment_path)``: FASTQ contigs path,
        FASTA contigs path and the alignment output directory (with a trailing '/').

    Raises:
        ValueError: if `de_novo` is not one of the supported assemblers.
        FileNotFoundError: if the FASTA contigs file does not exist.
    """
    if de_novo=='MEGAHIT':
        #data_path=PRJ_OUT_PATH+sra+f'/{contigs_base_folder}/intermediate_contigs/'
        data_path=PRJ_OUT_PATH+sra+f'/{contigs_base_folder}/'
        contigs=data_path+f'{CONTIG_MER}.contigs.fq'
        contigsfa=data_path+f'{CONTIG_MER}.contigs.fa'
    elif de_novo=='coronaspades':
        data_path=PRJ_OUT_PATH+sra+'/coronaspades_default/'
        contigs=data_path+f'{coronaspades_name}.fq'
        contigsfa=data_path+f'{coronaspades_name}.fasta'
    elif de_novo=='spades':
        data_path=PRJ_OUT_PATH+sra+'/spades_careful/'
        contigs=data_path+f'{spades_name}.fq'
        contigsfa=data_path+f'{spades_name}.fasta'
    else:
        # Without this branch the path variables would be unbound below.
        raise ValueError(f"unsupported de_novo assembler: {de_novo!r}")
    if not os.path.isfile(contigsfa):
        # FileNotFoundError is still an Exception, and it carries the message
        # that was previously only printed.
        raise FileNotFoundError(f'Contigs file: {contigsfa} does not exist')
    if not os.path.isfile(contigs):
        create_fq(data_path)
    alignment_path=PRJ_OUT_PATH+sra+'/minimap2/'
    # exist_ok makes a separate existence check unnecessary.
    pathlib.Path(alignment_path).mkdir(exist_ok=True)
    return contigs, contigsfa, alignment_path

In [88]:
def prep_bed(create=True):
    """Return the BED path for the configured reference, optionally creating it.

    The BED file lives in ``{REF_PATH}bed/`` and is named after `REF_NAME` with
    `REF_EXT` removed. The ``bed/`` directory is created if missing; the BED
    file itself is only generated (via `create_bed`) when `create` is true.
    """
    reference_fasta = f'{REF_PATH}{REF_NAME}'
    bed_dir = f'{REF_PATH}bed/'
    bed_path = bed_dir + REF_NAME.split(REF_EXT)[0] + '.bed'
    pathlib.Path(bed_dir).mkdir(exist_ok=True)
    if not create:
        return bed_path
    create_bed(reference_fasta, bed_path)
    return bed_path

In [89]:
def workflow(bed_file=None, run_bamstats=False, run_coverage=False, output_taxa=False):
    """Align, sort, index and summarise the contigs of every run in `SRAs`.

    For each run: locate the contigs, align them with minimap2, sort the result
    (samtools when `SORT_TOOL` is 'samtools', otherwise GATK SortSam followed by
    MarkDuplicates), index the BAM and write samtools stats, flagstat and
    idxstats reports. Optional steps are controlled by the arguments.

    Args:
        bed_file: BED file for bamdst; when given, bamdst is run and its
            ``depth.tsv.gz`` and ``region.tsv.gz`` outputs are deleted afterwards.
        run_bamstats: when true, run BAMStats into a
            ``bamstats_reads_{ALIGN_NAME}_{ALIGN_OPTS_NAME}/`` directory.
        run_coverage: when true, run ``samtools coverage`` on the sorted BAM.
        output_taxa: with `run_coverage`, also write the coverage/taxa CSV.
    """
    for sra in SRAs:
        print(f'{sra}')
        contigs, contigsfa, alignment_path=setup_files(sra)
        sam_out=run_minimap2(contigs, alignment_path, sra)

        if SORT_TOOL=='samtools':
            bam_sorted=samtools_sort(sam_out, out_type='BAM')
            # The unsorted SAM is no longer needed once the sorted BAM exists.
            remove_file(sam_out)
        else:
            sam_out=gatk_sort(sam_out)
            bam_sorted=gatk_metrics(sam_out)
        index_bam(bam_sorted)

        run_samtools_stats(bam_sorted, REF_PATH+REF_NAME)
        run_flagtools(bam_sorted)

        idxstat_file=bam_sorted.split('.bam')[0]+'_idxstats.txt'
        run_idxstats(bam_sorted, idxstat_file)

        if run_bamstats:
            out_path=alignment_path+f'bamstats_reads_{ALIGN_NAME}_{ALIGN_OPTS_NAME}/'
            pathlib.Path(out_path).mkdir(exist_ok=True)
            # The boolean parameter `run_bamstats` shadows the module-level helper
            # of the same name in this scope, so calling it directly would fail
            # with "'bool' object is not callable". Fetch the helper explicitly.
            bamstats_func=globals()['run_bamstats']
            bamstats_func(bam_sorted, out_path)
        if run_coverage:
            cov_file=samtools_coverage(bam_sorted, in_type='.bam')
            if output_taxa:
                match_taxa(cov_file, REF_PATH+TAXA_TAB_DEL)
        if bed_file:
            out_path=alignment_path+f'bamdst_{de_novo}_{ALIGN_NAME}_{ALIGN_OPTS_NAME}/'
            pathlib.Path(out_path).mkdir(exist_ok=True)
            run_bamdst(bed_file, out_path, bam_sorted)
            remove_file(out_path+'depth.tsv.gz')
            remove_file(out_path+'region.tsv.gz')
        

In [90]:
### General settings

In [91]:
# Locations of the external tools. Each *_PATH value is prepended directly to the
# executable name when a command is built, so it must end with '/' or be empty
# (empty means the executable is resolved through the shell's PATH).
MINIMAP2_PATH = '~/apps/minimap2-2.24_x64-linux/'
#using conda installed samtools
SAMTOOLS_PATH = ''
BAMDST_PATH = '/mnt/1TB_0/Data/Code/external/bamdst/'
BBMAP_PATH = '~/apps/bbmap/'

# Jar files launched with `java -jar`.
GATK_JAR = '~/apps/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar'
BAMSTATS_JAR = '~/apps/BAMStats-1.25/BAMStats-1.25.jar'

# Thread count handed to minimap2 through MINIMAP_PARAMS.
NUM_THREADS = 32

In [92]:
#https://lh3.github.io/minimap2/minimap2.html#5 -2 for 2 IO threads, use -I30g for long reference files
# --MD and --eqx are long options and need two dashes. With a single dash minimap2
# parses `-MD` as `-M D` and `-eqx` as `-e qx` (both short options take a value),
# so the MD tag and =/X CIGAR operators were not actually requested.
# NOTE(review): output file names do not change with this fix, so re-running
# overwrites alignments produced with the old flags.
MINIMAP_PARAMS=f'--MD -c --eqx -2 -t {NUM_THREADS} -I30g --sam-hit-only --secondary=no'
# Label embedded in output file names to identify this option set.
MINI_OPTS_NAME='secondary_no'

In [93]:
# Reference FASTA: directory, file name, and the extension stripped to form names.
REF_PATH = '/mnt/1TB_0/Data/fasta/mammal_genomes/'
REF_NAME = 'YNU_ManJav_2.0.fna'
REF_EXT = '.fna'
#TAXA_TAB_DEL=''

# Derived values: reference stem, full FASTA path, and the BED path under REF_PATH/bed/.
ALIGN_NAME = REF_NAME.split(REF_EXT)[0]
fasta_file = f'{REF_PATH}{REF_NAME}'
bed_file = f'{REF_PATH}bed/{ALIGN_NAME}.bed'

# Active assembler configuration; `de_novo` selects the branch taken by the
# helper functions and the labels below are embedded in output file names.
de_novo = 'MEGAHIT'
CONTIGS_NAME = de_novo
CONTIG_MER = 'final_min_len_300'
contigs_base_folder = 'megahit_default'
ALIGN_OPTS_NAME = 'default'

# Alternative configuration: coronaSPAdes contigs (uncomment to use).
#de_novo='coronaspades'
#contig_mer='final'
#CONTIGS_NAME=de_novo
#coronaspades_name='contigs'
#coronaspades_code='coronaspades_default'
#contigs_base_folder='coronaspades_default'
#ALIGN_OPTS_NAME='default'

# Alternative configuration: SPAdes contigs (uncomment to use).
#de_novo='spades'
#contig_mer='final'
#CONTIGS_NAME=de_novo
#spades_name='contigs'
#spades_code='spades_careful'
#contigs_base_folder='spades_careful'
#ALIGN_OPTS_NAME='careful'

# Preprocessing label used in output names, and the sorting backend.
PREPROC = 'fastp'
SORT_TOOL = 'samtools' #or gatk

In [94]:
#bed_file=prep_bed(create=True)

In [95]:
# Project identifier and the root directory holding one sub-directory per run.
PRJ = 'PRJNA901878'
PRJ_OUT_PATH = f'/mnt/8TB_2/Data/Assembly/{PRJ}/'

# Runs processed by workflow().
SRAs = [
    'SRR22936419', 'SRR22936420', 'SRR22936421', 'SRR22936422',
    'SRR22936541', 'SRR22936544', 'SRR22936770', 'SRR22936773',
]

#workflow(bed_file=None)

In [96]:
# Reference FASTA: directory, file name, and the extension stripped to form names.
REF_PATH = '/mnt/1TB_0/Data/fasta/mammal_genomes/'
REF_NAME = 'Sscrofa11.1.fna'
REF_EXT = '.fna'
#TAXA_TAB_DEL=''

# Derived values: reference stem, full FASTA path, and the BED path under REF_PATH/bed/.
ALIGN_NAME = REF_NAME.split(REF_EXT)[0]
fasta_file = f'{REF_PATH}{REF_NAME}'
bed_file = f'{REF_PATH}bed/{ALIGN_NAME}.bed'

In [97]:
# Project identifier and the root directory holding one sub-directory per run.
PRJ = 'PRJNA901878'
PRJ_OUT_PATH = f'/mnt/8TB_2/Data/Assembly/{PRJ}/'

# Runs processed by workflow().
SRAs = [
    'SRR22936419', 'SRR22936420', 'SRR22936421', 'SRR22936422',
    'SRR22936541', 'SRR22936544', 'SRR22936770', 'SRR22936773',
]

#workflow(bed_file=None)

#### Bamboo rat/Sus scrofa

In [98]:
# Reference FASTA: directory, file name, and the extension stripped to form names.
REF_PATH = '/mnt/1TB_0/Data/fasta/mammal_genomes/'
REF_NAME = 'Sscrofa11.1.fna'
REF_EXT = '.fna'
#TAXA_TAB_DEL=''

# Derived values: reference stem, full FASTA path, and the BED path under REF_PATH/bed/.
ALIGN_NAME = REF_NAME.split(REF_EXT)[0]
fasta_file = f'{REF_PATH}{REF_NAME}'
bed_file = f'{REF_PATH}bed/{ALIGN_NAME}.bed'

In [99]:
#bed_file=prep_bed(create=True)

In [100]:
# Project identifier and the root directory holding one sub-directory per run.
PRJ = 'PRJNA901878'
PRJ_OUT_PATH = f'/mnt/8TB_2/Data/Assembly/{PRJ}/'

# Single run processed by workflow().
SRAs = [
    'SRR22936497',
]

#workflow(bed_file=None)

In [101]:
# Reference FASTA: directory, file name, and the extension stripped to form names.
REF_PATH = '/mnt/1TB_0/Data/fasta/mammal_genomes/'
REF_NAME = 'RhiPru_1.0.fna'
REF_EXT = '.fna'
#TAXA_TAB_DEL=''

# Derived values: reference stem, full FASTA path, and the BED path under REF_PATH/bed/.
ALIGN_NAME = REF_NAME.split(REF_EXT)[0]
fasta_file = f'{REF_PATH}{REF_NAME}'
bed_file = f'{REF_PATH}bed/{ALIGN_NAME}.bed'

In [102]:
# Project identifier and the root directory holding one sub-directory per run.
PRJ = 'PRJNA901878'
PRJ_OUT_PATH = f'/mnt/8TB_2/Data/Assembly/{PRJ}/'

# Single run processed by workflow().
SRAs = [
    'SRR22936497',
]

#workflow(bed_file=None)

In [103]:
#### Concatenated

In [104]:
# Reference FASTA: directory, file name, and the extension stripped to form names.
REF_PATH = '/mnt/1TB_0/Data/fasta/mammal_genomes/'
REF_NAME = 'rhipru_sus.fna'
REF_EXT = '.fna'
#TAXA_TAB_DEL=''

# Derived values: reference stem, full FASTA path, and the BED path under REF_PATH/bed/.
ALIGN_NAME = REF_NAME.split(REF_EXT)[0]
fasta_file = f'{REF_PATH}{REF_NAME}'
bed_file = f'{REF_PATH}bed/{ALIGN_NAME}.bed'

In [105]:
# Project identifier and the root directory holding one sub-directory per run.
PRJ = 'PRJNA901878'
PRJ_OUT_PATH = f'/mnt/8TB_2/Data/Assembly/{PRJ}/'

# Single run processed by workflow().
SRAs = [
    'SRR22936497',
]

# Run the pipeline without the bamdst step (no BED file supplied).
workflow(bed_file=None)

SRR22936497


### PRJCA002517

Manis javanica with variable Homo sapiens and Chlorocebus sabaeus content

In [32]:
# Reference FASTA: directory, file name, and the extension stripped to form names.
REF_PATH = '/mnt/1TB_0/Data/fasta/mammal_genomes/'
REF_NAME = 'manjav_human_vero.fna'
REF_EXT = '.fna'
#TAXA_TAB_DEL=''

# Derived values: reference stem, full FASTA path, and the BED path under REF_PATH/bed/.
ALIGN_NAME = REF_NAME.split(REF_EXT)[0]
fasta_file = f'{REF_PATH}{REF_NAME}'
bed_file = f'{REF_PATH}bed/{ALIGN_NAME}.bed'

In [33]:
# Project identifier and the root directory holding one sub-directory per run.
PRJ = 'PRJCA002517'
PRJ_OUT_PATH = f'/mnt/8TB_2/Data/Assembly/{PRJ}/'

# Runs processed by workflow().
SRAs = [
    'CRR477154', 'CRR477155', 'CRR477156', 'CRR477157',
]

# Run the pipeline without the bamdst step (no BED file supplied).
workflow(bed_file=None)

CRR477154
CRR477155
CRR477156
CRR477157
