## Seal kmer matching

Runs Seal ("Sequence Expression AnaLyzer", part of BBTools) to k-mer-match reads or contigs against a set of reference genomes.

See Seal guide here:

https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/seal-guide/


In [1]:
import pathlib
import subprocess
import os

In [2]:
# Widen the notebook container to 95% of the window so long command lines
# stay readable. `display` and `HTML` come from the public `IPython.display`
# module; importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
import sys
import os
import time
# Reference genomes need to be in the current working directory (seal.sh is
# given bare reference file names), so put the notebooks directory first on
# sys.path and change into it.
sys.path.insert(0, '/mnt/1TB_0/Data/Code/code/Novel_HKU4r-CoV_single_cell/notebooks')
os.chdir(sys.path[0])

In [4]:
# Thread count passed to seal.sh as threads=
NUM_THREADS=32
# Read layout; run_align() only runs Seal when this is 'PAIRED'
LAYOUT='PAIRED'
# Not referenced by any code visible in this notebook
CREATE_BAM=False
# Forwarded to seal.sh as minkmerfraction= / minkmerhits= by run_seal_abundance()
# and embedded in its output file names
minkmerfraction=0.3
minkmerhits=3

In [5]:
def build_index(ref_file):
    """Run ``bbmap.sh ref=<ref_file>`` through the shell.

    Args:
        ref_file: Reference file name handed to bbmap.sh as ``ref=``.

    Raises:
        subprocess.CalledProcessError: If bbmap.sh exits with a non-zero status.
    """
    subprocess.check_call(f"{BBMAP_PATH}bbmap.sh ref={ref_file}", shell=True)

In [6]:
def run_seal_split(fastq_1, fastq_2, out_path, ref_list, sra):
    """Run seal.sh on a read pair, splitting reads into per-reference files.

    Args:
        fastq_1: First-mate FASTQ path (``in=``).
        fastq_2: Second-mate FASTQ path (``in2=``).
        out_path: Prefix for the unmapped/stats/refstats output files.
        ref_list: Reference file names; joined with commas for ``ref=``.
        sra: Unused by this function.

    Raises:
        RuntimeError: If seal.sh exits with a non-zero status.
    """
    refs = ','.join(ref_list)
    # One seal.sh argument per entry; joined with single spaces below.
    seal_args = [
        f'{BBMAP_PATH}seal.sh',
        f'in={fastq_1}',
        f'in2={fastq_2}',
        f'ref={refs}',
        'pattern=out_%.fq',
        f'outu={out_path}{REF_UID}_unmapped.fq',
        f'ambig={AMBIG}',
        f'stats={out_path}{REF_UID}_stats.txt',
        f'refstats={out_path}{REF_UID}_refstats.txt',
        'nzo=t',
        f'threads={NUM_THREADS}',
    ]
    cmd = ' '.join(seal_args)
    print(f'cmd: {cmd}')
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as err:
        message = "command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output)
        raise RuntimeError(message)

In [7]:
def run_seal_abundance(fastq_1, fastq_2, out_path, ref_list, sra):
    """Run seal.sh on a read pair and write stats, refstats and rpkm reports.

    Output file names embed REF_UID, the minkmerfraction/minkmerhits settings
    and the AMBIG mode, so runs with different settings do not collide.

    Args:
        fastq_1: First-mate FASTQ path (``in=``).
        fastq_2: Second-mate FASTQ path (``in2=``).
        out_path: Prefix for the report files.
        ref_list: Reference file names; joined with commas for ``ref=``.
        sra: Unused by this function.

    Raises:
        RuntimeError: If seal.sh exits with a non-zero status.
    """
    refs = ','.join(ref_list)
    # One seal.sh argument per entry; joined with single spaces below.
    seal_args = [
        f'{BBMAP_PATH}seal.sh',
        f'in={fastq_1}',
        f'in2={fastq_2}',
        f'ref={refs}',
        f'ambig={AMBIG}',
        f'stats={out_path}{REF_UID}_MKF{minkmerfraction}_MKH{minkmerhits}_{AMBIG}_stats.txt',
        f'refstats={out_path}{REF_UID}_MKF{minkmerfraction}_MKH{minkmerhits}_{AMBIG}_refstats.txt',
        f'rpkm={out_path}{REF_UID}_MKF{minkmerfraction}_MKH{minkmerhits}_{AMBIG}_rpkm.txt',
        f'minkmerfraction={minkmerfraction}',
        f'minkmerhits={minkmerhits}',
        'nzo=t',
        f'threads={NUM_THREADS}',
        f'{XMX}',
        'overwrite=true',
    ]
    cmd = ' '.join(seal_args)
    print(f'cmd: {cmd}')
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as err:
        message = "command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output)
        raise RuntimeError(message)

In [8]:
def run_seal_contigs(fasta, out_path, ref_list, sra):
    """Run seal.sh on a contigs FASTA and write stats, refstats and rpkm reports.

    Output file names embed REF_UID, RUN_ID and the AMBIG mode.

    Args:
        fasta: Contigs FASTA path (``in=``).
        out_path: Prefix for the report files.
        ref_list: Reference file names; joined with commas for ``ref=``.
        sra: Unused by this function.

    Raises:
        RuntimeError: If seal.sh exits with a non-zero status.
    """
    refs = ','.join(ref_list)
    # One seal.sh argument per entry; joined with single spaces below.
    seal_args = [
        f'{BBMAP_PATH}seal.sh',
        f'in={fasta}',
        f'ref={refs}',
        f'ambig={AMBIG}',
        f'stats={out_path}{REF_UID}_{RUN_ID}_{AMBIG}_contigs_stats.txt',
        f'refstats={out_path}{REF_UID}_{RUN_ID}_{AMBIG}_contigs_refstats.txt',
        f'rpkm={out_path}{REF_UID}_{RUN_ID}_{AMBIG}_contigs_rpkm.txt',
        'nzo=t',
        f'threads={NUM_THREADS}',
        f'{XMX}',
        'overwrite=true',
    ]
    cmd = ' '.join(seal_args)
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as err:
        message = "command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output)
        raise RuntimeError(message)

In [9]:
def samtools_sort(file_to_sort):
    """Sort a SAM file with ``samtools sort``, writing ``<stem>_sorted.sam``.

    ``<stem>`` is ``file_to_sort`` with a trailing ``.sam`` removed; a name
    without that suffix is used unchanged.

    Args:
        file_to_sort: Path of the SAM file to sort.

    Raises:
        RuntimeError: If samtools exits with a non-zero status.
    """
    suffix = '.sam'
    # Strip only a trailing '.sam'. Splitting on the first '.sam' truncated
    # paths that contain it earlier, e.g. '/data/run.samples/x.sam'.
    if file_to_sort.endswith(suffix):
        stem = file_to_sort[:-len(suffix)]
    else:
        stem = file_to_sort
    out_file = stem + '_sorted.sam'
    cmd = f'{SAMTOOLS_PATH}samtools sort -O SAM -o {out_file} {file_to_sort}'
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) from e

In [10]:
def samtoos_coverage(in_file):
    """Run ``samtools coverage`` on ``in_file``, writing ``<stem>_coverage.txt``.

    ``<stem>`` is ``in_file`` with a trailing ``.sam`` removed; a name without
    that suffix is used unchanged. The function name is misspelled
    ("samtoos") and is kept as-is so existing callers keep working.

    Args:
        in_file: Path of the alignment file to summarise.

    Raises:
        RuntimeError: If samtools exits with a non-zero status.
    """
    suffix = '.sam'
    # Strip only a trailing '.sam'. Splitting on the first '.sam' truncated
    # paths that contain it earlier, e.g. '/data/run.samples/x.sam'.
    if in_file.endswith(suffix):
        stem = in_file[:-len(suffix)]
    else:
        stem = in_file
    out_file = stem + '_coverage.txt'
    cmd = f'{SAMTOOLS_PATH}samtools coverage {in_file} -o {out_file}'
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) from e


In [11]:
def gatk_sort(bwa_mem_out):
    """Coordinate-sort a SAM file with GATK ``SortSam``.

    Args:
        bwa_mem_out: Path of the SAM file to sort.

    Returns:
        Path of the sorted file: the input with a trailing ``.sam`` replaced
        by ``_gatk_sorted.sam`` (appended as-is if the suffix is absent).

    Raises:
        RuntimeError: If the GATK command exits with a non-zero status.
    """
    suffix = '.sam'
    # Strip only a trailing '.sam'. Splitting on the first '.sam' truncated
    # paths that contain it earlier, e.g. '/data/run.samples/x.sam'.
    if bwa_mem_out.endswith(suffix):
        stem = bwa_mem_out[:-len(suffix)]
    else:
        stem = bwa_mem_out
    out_file = stem + '_gatk_sorted.sam'
    cmd = f"java -jar ~/apps/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar SortSam INPUT={bwa_mem_out} OUTPUT={out_file} SORT_ORDER=coordinate VALIDATION_STRINGENCY=SILENT"
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) from e
    return out_file

In [12]:
def gatk_metrics(gatk_out):
    """Run GATK ``MarkDuplicates`` on an alignment file assumed to be sorted.

    Writes ``<stem>_marked.bam`` and ``<stem>_metrics.txt``, where ``<stem>``
    is ``gatk_out`` with a trailing ``.sam`` removed (unchanged if absent).

    Args:
        gatk_out: Path of the sorted alignment file (passed as ``INPUT=``).

    Returns:
        Path of the duplicate-marked BAM file.

    Raises:
        RuntimeError: If the GATK command exits with a non-zero status.
    """
    suffix = '.sam'
    # Strip only a trailing '.sam'. Splitting on the first '.sam' truncated
    # paths that contain it earlier, e.g. '/data/run.samples/x.sam'.
    if gatk_out.endswith(suffix):
        gatk_file = gatk_out[:-len(suffix)]
    else:
        gatk_file = gatk_out
    out_file = gatk_file + '_marked.bam'
    metrics_file = gatk_file + '_metrics.txt'
    cmd = f"java -Xmx64G -jar ~/apps/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar  MarkDuplicates INPUT={gatk_out} OUTPUT={out_file} METRICS_FILE={metrics_file} ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT"
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) from e
    return out_file

In [13]:
def index_bam(gatk_bam_file):
    """Run ``samtools index`` on ``gatk_bam_file`` through the shell.

    Args:
        gatk_bam_file: Path of the BAM file to index.

    Raises:
        RuntimeError: If samtools exits with a non-zero status.
    """
    try:
        subprocess.check_call(f"{SAMTOOLS_PATH}samtools index {gatk_bam_file}", shell=True)
    except subprocess.CalledProcessError as err:
        message = "command '{}' return with error (code {}): {}".format(err.cmd, err.returncode, err.output)
        raise RuntimeError(message)

In [14]:
def del_file(f_del):
    """Delete ``f_del`` by running ``rm <f_del>`` through the shell.

    Args:
        f_del: Path text appended to ``rm`` as-is (the shell interprets it).

    Raises:
        subprocess.CalledProcessError: If ``rm`` exits with a non-zero status.
    """
    subprocess.check_call(f"rm {f_del}", shell=True)

In [15]:
def convert_sam_fastq(infile):
    """Convert a SAM file to FASTQ with GATK ``SamToFastq``.

    Args:
        infile: Path of the SAM file to convert.

    Returns:
        Path of the FASTQ file: the input with a trailing ``.sam`` replaced
        by ``.fq`` (``.fq`` is appended as-is if the suffix is absent).

    Raises:
        subprocess.CalledProcessError: If the GATK command exits with a
            non-zero status.
    """
    suffix = '.sam'
    # Strip only a trailing '.sam'. Splitting on the first '.sam' truncated
    # paths that contain it earlier, e.g. '/data/run.samples/x.sam'.
    if infile.endswith(suffix):
        stem = infile[:-len(suffix)]
    else:
        stem = infile
    outfile = stem + '.fq'
    cmd = f"java -jar ~/apps/gatk-4.1.9.0/gatk-package-4.1.9.0-local.jar SamToFastq I={infile} FASTQ={outfile}"
    subprocess.check_call(cmd, shell=True)
    return outfile

In [16]:
def run_align():
    """Run Seal on the reads of every run listed in ``SRAs``.

    For each run, ``<PRJ_OUT_PATH><run>/seal/`` is created (missing parent
    directories included) and, when ``LAYOUT`` is ``'PAIRED'``,
    ``run_seal_abundance`` is called on the run's two FASTQ files under
    ``BASE_PATH``. ``FASTQ_FILES`` must hold mate 1 and mate 2 of each run,
    interleaved in ``SRAs`` order. Any other layout is not implemented yet and
    is reported and skipped. The elapsed wall-clock time per run is printed.
    """
    for idx, sra in enumerate(SRAs):
        start = time.time()

        seal_path = PRJ_OUT_PATH + sra + '/seal/'
        # parents=True also creates <PRJ_OUT_PATH><run>/ and any missing
        # ancestors, so a project's first run does not fail on a missing
        # output directory.
        pathlib.Path(seal_path).mkdir(parents=True, exist_ok=True)

        print(f'{sra} alignment to {REF_UID}') 
        if LAYOUT == 'PAIRED':
            f1path = BASE_PATH + FASTQ_FILES[2 * idx]
            f2path = BASE_PATH + FASTQ_FILES[(2 * idx) + 1]
            run_seal_abundance(f1path, f2path, seal_path, REF_LIST, sra)
        else:
            # TODO: single-end support. Say so instead of skipping silently.
            print(f'{sra} skipped: LAYOUT={LAYOUT!r} is not supported yet (only PAIRED)')
        elapsed = time.time() - start
        print(f'{sra} elapsed: {elapsed}')



In [17]:
def run_align_contigs():
    """Run Seal on the assembled contigs of every run listed in ``SRAs``.

    For each run the contigs file is read from
    ``<BASE_PATH><run>/megahit_default/<CONTIGS_NAME>`` and the reports are
    written under ``<BASE_PATH><run>/seal/``, which is created if needed.
    The elapsed wall-clock time per run is printed.
    """
    for sra in SRAs:
        started = time.time()
        run_dir = BASE_PATH + sra
        fasta = run_dir + f'/megahit_default/{CONTIGS_NAME}'
        seal_path = run_dir + '/seal/'
        seal_is_file = os.path.isfile(seal_path)
        if not seal_is_file:
            pathlib.Path(seal_path).mkdir(exist_ok=True)
        print(f'{sra} alignment to {REF_UID}') 
        run_seal_contigs(fasta, seal_path, REF_LIST, sra)
        elapsed = time.time() - started
        print(f'{sra} elapsed: {elapsed}')

In [18]:
# Prefix prepended to the `samtools` executable name in the samtools commands
# (empty here, so the shell resolves it; presumably via PATH)
SAMTOOLS_PATH=''
# Directory prefix for the BBTools scripts (seal.sh, bbmap.sh); the commands
# run with shell=True, so the shell expands the leading '~'
BBMAP_PATH='~/apps/bbmap/'

In [19]:
# Not referenced by any code visible in this notebook
MATCH='unique'
# Passed to seal.sh as ambig= and embedded in the report file names
AMBIG='toss'
#REF_UID="rhipru_sus"
#RUN_ID='megahit_default'
# JVM heap flag appended to the seal.sh command line by
# run_seal_abundance() and run_seal_contigs()
XMX='-Xmx280g'

In [20]:
#human
# Reference genome FASTA file names. These are bare names that the REF_LIST
# cells combine and pass to seal.sh as ref=.
REF_NAME_H='GRCh38.p13.fna'

#pig
REF_NAME_S='Sscrofa11.1.fna'

#Vero cell
REF_NAME_V='Vero_WHO_p1.0.fna'

#YNU_ManJav_2.0 (Pangolin)
REF_NAME_MJ='YNU_ManJav_2.0.fna'

# NOTE(review): presumably a second pangolin assembly (YNU_ManPten_2.0); confirm species
REF_NAME_MP='YNU_ManPten_2.0.fna'

#Bamboo rat
REF_NAME_R='RhiPru_1.0.fna'

#cat
REF_NAME_C='Fca126_mat1.0.fna'

#goat
REF_NAME_G='CapAeg_1.0.fna'

#Choloepus didactylus
REF_NAME_XS="mChoDid1.pri.fna"


## Reads

In [21]:
# Project accession; interpolated into the two directories below
PRJ='PRJNA901878'
# Root under which run_align() creates <run>/seal/ output directories
PRJ_OUT_PATH=f'/mnt/8TB_2/Data/Assembly/{PRJ}/'
# Directory holding the input FASTQ files read by run_align()
BASE_PATH=f'/mnt/8TB_0/Data/genbank/{PRJ}/'


# Run accessions of this project (the bamboo rat cell below reassigns SRAs)
SRAs=['SRR22936419','SRR22936420','SRR22936421','SRR22936422', \
      'SRR22936541','SRR22936544','SRR22936770','SRR22936773']

#### bamboo rat

In [24]:
# References matched against: bamboo rat and pig
REF_LIST=[REF_NAME_R, REF_NAME_S]
# Label embedded in the Seal report file names
REF_UID='RhiPru_1.0_Sscrofa11.1'

# Restrict this pass to a single run
SRAs=['SRR22936497']

# Two '_fastp' FASTQ file names per run, in SRAs order: mate 1, then mate 2.
# run_align() relies on this interleaving (indices 2*i and 2*i+1).
FASTQ_FILES=[]
for f in SRAs:
    FASTQ_FILES.extend([f+'_1_fastp.fastq', f+'_2_fastp.fastq'])

In [25]:
# Run Seal on the paired reads of every run currently listed in SRAs
run_align()

SRR22936497 alignment to RhiPru_1.0_Sscrofa11.1
cmd: ~/apps/bbmap/seal.sh in=/mnt/8TB_0/Data/genbank/PRJNA901878/SRR22936497_1_fastp.fastq in2=/mnt/8TB_0/Data/genbank/PRJNA901878/SRR22936497_2_fastp.fastq ref=RhiPru_1.0.fna,Sscrofa11.1.fna ambig=toss stats=/mnt/8TB_2/Data/Assembly/PRJNA901878/SRR22936497/seal/RhiPru_1.0_Sscrofa11.1_MKF0.3_MKH3_toss_stats.txt refstats=/mnt/8TB_2/Data/Assembly/PRJNA901878/SRR22936497/seal/RhiPru_1.0_Sscrofa11.1_MKF0.3_MKH3_toss_refstats.txt rpkm=/mnt/8TB_2/Data/Assembly/PRJNA901878/SRR22936497/seal/RhiPru_1.0_Sscrofa11.1_MKF0.3_MKH3_toss_rpkm.txt minkmerfraction=0.3 minkmerhits=3 nzo=t threads=32 -Xmx280g overwrite=true
SRR22936497 elapsed: 1412.308405637741


In [23]:
# Alternative project configuration for the reads pass. The following cells
# reassign the same names, so only the last executed cell takes effect.
PRJ='PRJCA002517'
PRJ_OUT_PATH=f'/mnt/8TB_2/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_2/Data/bigd/{PRJ}/'

SRAs=['CRR477154','CRR477155','CRR477156','CRR477157']

In [24]:
# Alternative project configuration for the reads pass. The following cell
# reassigns the same names, so only the last executed cell takes effect.
PRJ='PRJNA845961'
PRJ_OUT_PATH=f'/mnt/8TB_1/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_1/Data/genbank/{PRJ}/'

SRAs=['SRR19632971','SRR19633001', 'SRR19632976','SRR19632991','SRR19632995','SRR19632997','SRR19633000']

In [25]:
# Project configuration for the reads pass: output root for run_align() and
# the directory holding the input FASTQ files.
PRJ='PRJNA747757'
PRJ_OUT_PATH=f'/mnt/8TB_2/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_1/Data/genbank/{PRJ}/'

# One accession per element. A missing comma between adjacent string literals
# silently concatenates them into one bogus accession, so every entry must be
# followed by a comma.
SRAs=['SRR15199660', 'SRR15199665', 'SRR15199666', 'SRR15199667',
      'SRR15199668', 'SRR15199670', 'SRR15199671', 'SRR15199672',
      'SRR15199675', 'SRR15199659', 'SRR15199661', 'SRR15199664',
      'SRR15199676', 'SRR15199663', 'SRR15199669']

In [26]:
# Two '_fastp' FASTQ file names per run, in SRAs order: mate 1, then mate 2.
# run_align() relies on this interleaving (indices 2*i and 2*i+1).
FASTQ_FILES=[]
for f in SRAs:
    FASTQ_FILES.extend([f+'_1_fastp.fastq', f+'_2_fastp.fastq'])

In [27]:
#run_align()

## Contigs


In [28]:
# Project configuration for the contigs pass. BASE_PATH points at the
# assembly tree because run_align_contigs() reads
# <BASE_PATH><run>/megahit_default/<CONTIGS_NAME> and writes <BASE_PATH><run>/seal/.
PRJ='PRJCA002517'
PRJ_OUT_PATH=f'/mnt/8TB_2/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_2/Data/Assembly/{PRJ}/'

SRAs=['CRR477154','CRR477155','CRR477156','CRR477157']

In [29]:
# Two '_fastp' FASTQ file names per run, in SRAs order: mate 1, then mate 2.
# run_align_contigs() does not read this list; only run_align() does.
FASTQ_FILES=[]
for f in SRAs:
    FASTQ_FILES.extend([f+'_1_fastp.fastq', f+'_2_fastp.fastq'])

In [30]:
# References matched against: pangolin (ManJav), human and Vero cell
REF_LIST=[REF_NAME_MJ, REF_NAME_H, REF_NAME_V]
# REF_UID and RUN_ID are embedded in the report file names written by run_seal_contigs()
REF_UID='manjav_human_vero'
RUN_ID='megahit_default'
# Contigs file name read from <BASE_PATH><run>/megahit_default/
CONTIGS_NAME='final.contigs.fa'

In [31]:
#run_align_contigs()

In [32]:
# Switch to the 'min_len_300' contigs file; RUN_ID keeps the report file
# names distinct from the previous contigs run.
RUN_ID='min_len_300_megahit_default'
CONTIGS_NAME='final_min_len_300.contigs.fa'

In [33]:
#run_align_contigs()

#### PRJNA901878

In [34]:
# References matched against: pangolin (ManJav) and pig
REF_LIST=[REF_NAME_MJ, REF_NAME_S]
# REF_UID and RUN_ID are embedded in the report file names written by run_seal_contigs()
REF_UID='manjav_sus'
RUN_ID='min_len_300_megahit_default'
# Contigs file name read from <BASE_PATH><run>/megahit_default/
CONTIGS_NAME='final_min_len_300.contigs.fa'

In [35]:
# Project configuration for the contigs pass. run_align_contigs() reads and
# writes under BASE_PATH, so it points at the assembly tree.
PRJ='PRJNA901878'
PRJ_OUT_PATH=f'/mnt/8TB_2/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_2/Data/Assembly/{PRJ}/'

#done 'SRR22936419','SRR22936420','SRR22936421', 'SRR22936422'
# Remaining runs of this project still to be processed
SRAs=['SRR22936541','SRR22936544','SRR22936770','SRR22936773']

In [36]:
#run_align_contigs()

SRR22936541 alignment to manjav_sus
SRR22936541 elapsed: 535.3784248828888
SRR22936544 alignment to manjav_sus
SRR22936544 elapsed: 447.8898649215698
SRR22936770 alignment to manjav_sus
SRR22936770 elapsed: 432.3830244541168
SRR22936773 alignment to manjav_sus
SRR22936773 elapsed: 418.08904337882996


##### Bamboo rat/Sus scrofa

In [37]:
# References matched against: bamboo rat and pig
REF_LIST=[REF_NAME_R, REF_NAME_S]
# REF_UID and RUN_ID are embedded in the report file names written by run_seal_contigs()
REF_UID='rhipru_sus'
RUN_ID='min_len_300_megahit_default'
# Contigs file name read from <BASE_PATH><run>/megahit_default/
CONTIGS_NAME='final_min_len_300.contigs.fa'

In [38]:
# Project configuration for the bamboo rat contigs pass. run_align_contigs()
# reads and writes under BASE_PATH, so it points at the assembly tree.
PRJ='PRJNA901878'
PRJ_OUT_PATH=f'/mnt/8TB_2/Data/Assembly/{PRJ}/'
BASE_PATH=f'/mnt/8TB_2/Data/Assembly/{PRJ}/'

# Single run processed in this pass
SRAs=['SRR22936497']

In [39]:
#run_align_contigs()

SRR22936497 alignment to rhipru_sus
SRR22936497 elapsed: 512.7499215602875
