In [1]:
import pathlib
import subprocess
import os
import numpy as np
import csv
import pysam
import deeptools.countReadsPerBin as crpb
import deeptools.mapReduce as MapReduce
import pysamstats
import matplotlib.pyplot as plt
import re

### SAM format

https://samtools.github.io/hts-specs/SAMv1.pdf

In [2]:
# Inject a CSS rule that sets elements of class `container` to 95% width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
# Project-level configuration shared by the cells below.
PRJ = 'PRJNA607174'  # project accession; embedded in both data paths
PRJ_OUT_PATH = f'/mnt/1TB_0/Data/Assembly/{PRJ}/'  # root of the per-run alignment folders (BAM input, FASTA output)
BASE_PATH = f'/mnt/6TB_0/Data/genbank/{PRJ}/'  # second project root; not used by the cells visible in this section
ALIGN_METHOD = 'bwa_mem2'  # name of the aligner sub-folder inside each run folder
MIN_LENGTH = 15  # soft clips shorter than this many bases are ignored by get_S_ends

In [4]:
# Reference sequence whose alignments are inspected.
REF = 'hCoV-19/pangolin/Guangdong/1/2019|EPI_ISL_410721|2019'  # reference name handed to samfile.fetch()
REF_CODE = 'GD_1'  # short label for REF, used in output file names and the summary line

In [5]:
# Run-level configuration: which run to read and how its files are named.
SRA = 'SRR13053879'  # run identifier; names the run folder and prefixes every file name
BAM_POST = '_gd1_amplicon_seq_GD_1_soft_clip_bwamem2_gatk_sorted_marked.bam'  # BAM file name after the SRA prefix
FA_POST = f'_reads_{REF_CODE}_soft_clip_bwamem2_gatk_sorted_marked.fa'  # tail of the FASTA output file names

### bwa mem2

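Extract the soft-clipped ends of reads aligned with bwa-mem2 (clips shorter than `MIN_LENGTH` bases are ignored) and write them to FASTA files.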

In [6]:
def read_bam(in_path, align_method, sra, file_postfix):
    '''Open the BAM file of one sequencing run for reading.

    The file is looked up at ``<in_path>/<sra>/<align_method>/<sra><file_postfix>``.

    Parameters
    ----------
    in_path : str or os.PathLike
        Project output directory. A trailing path separator is optional.
    align_method : str
        Name of the aligner sub-directory inside the run directory.
    sra : str
        Run identifier; names the run directory and prefixes the file name.
    file_postfix : str
        Remainder of the BAM file name after ``sra``, extension included.

    Returns
    -------
    pysam.AlignmentFile
        Handle opened in ``"rb"`` mode. It is not closed here; the caller
        is responsible for closing it.
    '''
    # os.path.join only inserts a separator where one is missing, so the
    # result is the same whether or not in_path ends with '/'. Plain string
    # concatenation produced '<in_path><sra>/...' when the slash was absent.
    bam_file = os.path.join(in_path, sra, align_method, f'{sra}{file_postfix}')
    return pysam.AlignmentFile(bam_file, "rb")

In [7]:
def _terminal_soft_clip(ops):
    '''Return the soft-clip length at the read end from which ``ops`` is iterated, or 0.'''
    soft_clip, hard_clip = 4, 5  # BAM CIGAR operation codes for S and H
    for op, length in ops:
        if op == hard_clip:
            # A hard clip may sit between the read end and its soft clip
            # (e.g. 5H20S80M), so look past it.
            continue
        return int(length) if op == soft_clip else 0
    return 0


def get_S_ends(read, min_length=None):
    '''Return the soft-clipped lengths at the start and end of an aligned read.

    Clips shorter than the minimum length are reported as 0. Hard clips
    outside a soft clip are skipped, so ``5H20S80M`` reports a start clip
    of 20.

    Parameters
    ----------
    read : pysam.AlignedSegment
        Alignment record; only ``read.cigartuples`` is consulted.
    min_length : int, optional
        Smallest clip length that is kept. Defaults to the notebook-level
        ``MIN_LENGTH`` constant, looked up at call time.

    Returns
    -------
    tuple of (int, int)
        ``(start_soft_clipped, end_soft_clipped)``. A side is 0 when it has
        no soft clip or its clip is shorter than ``min_length``. Reads whose
        CIGAR is missing or has a single operation always give ``(0, 0)``.
    '''
    if min_length is None:
        min_length = MIN_LENGTH
    # cigartuples is None when the record carries no CIGAR.
    cig = read.cigartuples or []
    if len(cig) <= 1:
        return 0, 0
    start_soft_clipped = _terminal_soft_clip(cig)
    end_soft_clipped = _terminal_soft_clip(reversed(cig))
    if start_soft_clipped < min_length:
        start_soft_clipped = 0
    if end_soft_clipped < min_length:
        end_soft_clipped = 0
    return start_soft_clipped, end_soft_clipped

In [8]:
def get_soft_clipped(samfile, refseq):
    '''Collect the soft-clipped ends of every read aligned to one reference.

    Each read returned by ``samfile.fetch(refseq)`` is passed to
    ``get_S_ends``; for every end with a non-zero clip length, a FASTA-style
    header and the clipped bases are recorded. Reads that carry no stored
    sequence are skipped, because there are no bases to extract.

    Parameters
    ----------
    samfile : pysam.AlignmentFile
        Open alignment file; must support ``fetch(refseq)``.
    refseq : str
        Reference name handed to ``samfile.fetch``.

    Returns
    -------
    tuple of seven lists
        ``(start_clip_heads, start_clip_seqs, end_clip_heads, end_clip_seqs,
        both_ends, start_only, end_only)``. The ``*_heads``/``*_seqs`` pairs
        are parallel lists; the last three hold read names grouped by which
        ends are clipped.
    '''
    start_clip_heads, start_clip_seqs = [], []
    end_clip_heads, end_clip_seqs = [], []
    both_ends, start_only, end_only = [], [], []
    for read in samfile.fetch(refseq):
        seq = read.query_sequence
        if seq is None:
            # No bases stored for this record, so slicing would raise
            # TypeError; skip it instead of aborting the whole scan.
            continue
        start_soft_clipped, end_soft_clipped = get_S_ends(read)
        qname = read.query_name
        has_start = start_soft_clipped > 0
        has_end = end_soft_clipped > 0
        if has_start:
            start_clip_heads.append(f'{qname} cigar:{read.cigarstring} start:{start_soft_clipped}')
            start_clip_seqs.append(seq[:start_soft_clipped])
        if has_end:
            end_clip_heads.append(f'{qname} cigar:{read.cigarstring} end:{end_soft_clipped}')
            # Slice from an explicit index: seq[-n:] would be wrong for n == 0.
            end_clip_seqs.append(seq[len(seq) - end_soft_clipped:])
        if has_start and has_end:
            both_ends.append(qname)
        elif has_start:
            start_only.append(qname)
        elif has_end:
            end_only.append(qname)
    return (start_clip_heads, start_clip_seqs, end_clip_heads, end_clip_seqs,
            both_ends, start_only, end_only)

In [9]:
def write_fasta(clip_heads, clip_seqs, fname):
    '''Write parallel header/sequence lists to a FASTA file.

    Every record is written as a ``>header`` line followed by one sequence
    line. An existing file at ``fname`` is overwritten.

    Parameters
    ----------
    clip_heads : sequence of str
        Header text for each record, without the leading ``>``.
    clip_seqs : sequence of str
        Sequence for each record, in the same order as ``clip_heads``.
    fname : str or os.PathLike
        Destination file path.

    Raises
    ------
    ValueError
        If the two lists differ in length. The check runs before the file
        is opened, so a mismatch never truncates or half-writes ``fname``.
    '''
    if len(clip_heads) != len(clip_seqs):
        raise ValueError(
            f'clip_heads has {len(clip_heads)} entries but clip_seqs has {len(clip_seqs)}'
        )
    with open(fname, 'w') as fa:
        for head, seq in zip(clip_heads, clip_seqs):
            fa.write(f'>{head}\n')
            fa.write(f'{seq}\n')

In [10]:

# Open the BAM for the configured run and make sure the output folder exists.
# The handle stays open at module level after this cell.
samfile = read_bam(PRJ_OUT_PATH, ALIGN_METHOD, SRA, BAM_POST)
spth = PRJ_OUT_PATH + SRA + f'/{ALIGN_METHOD}/soft_clip/'
pathlib.Path(spth).mkdir(exist_ok=True)

# Scan every read aligned to REF and split the results by clipped end.
(start_clip_heads, start_clip_seqs,
 end_clip_heads, end_clip_seqs,
 both_ends, start_only, end_only) = get_soft_clipped(samfile, REF)
print(f'{REF_CODE} for min len. {MIN_LENGTH} start clipped: {len(start_clip_heads)}, end clipped: {len(end_clip_heads)}, both_ends: {len(both_ends)}, start_only {len(start_only)}, end_only: {len(end_only)}')

# NOTE: fasta_postfix already begins with '_', so the file names below contain
# a double underscore after the 'ml<MIN_LENGTH>' part. Left unchanged so that
# files written by earlier runs keep matching.

# Soft clips at the start of reads.
fasta_postfix = '_start_soft' + FA_POST
start_out = spth + f'{SRA}_{REF_CODE}_ml{MIN_LENGTH}_{fasta_postfix}'
write_fasta(start_clip_heads, start_clip_seqs, start_out)

# Soft clips at the end of reads.
fasta_postfix = '_end_soft' + FA_POST
end_out = spth + f'{SRA}_{REF_CODE}_ml{MIN_LENGTH}_{fasta_postfix}'
write_fasta(end_clip_heads, end_clip_seqs, end_out)

GD_1 for min len. 15 start clipped: 149, end clipped: 146, both_ends: 79, start_only 70, end_only: 67
