In [1]:
import sys
sys.path.insert(0, '/private/groups/brookslab/gabai/tools/seqUtils/src/')
import time
from seqUtil import *
from bamUtil import *
from nanoUtil import *
from nntUtil import *
from modPredict import *
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

In [12]:
# Input files for the control Add-seq data set (absolute paths).
# pos_* use the "unique.500" files and neg_* the "unique.0" files
# (presumably the treated / untreated controls -- TODO confirm what 500 and 0 denote).
pos_bam = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/mapping/unique.500.pass.sorted.bam'
neg_bam = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/mapping/unique.0.pass.sorted.bam'
# Reference genome FASTA (sacCer3, per the file name).
ref = '/private/groups/brookslab/gabai/projects/Add-seq/data/ref/sacCer3.fa'
# eventalign tables carrying the same unique.500 / unique.0 prefixes as the BAM files above.
pos_evt = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/eventalign/unique.500.eventalign.tsv'
neg_evt = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/eventalign/unique.0.eventalign.tsv'
# randomPosition is not defined in this part of the notebook (presumably it comes from the
# star-imported project modules and samples n windows of windowSize bp -- TODO confirm).
# `positions` is not referenced again in the cells visible here.
positions = randomPosition(n=2, genome = ref, windowSize=80)

In [13]:
# Signal-alignment tables for chrII:429000-435000 (labelled PHO5 in the file names),
# one for the "pos" and one for the "neg" control sample.
# The "siganlAlign" spelling is part of the literal file names used here; changing it
# would point at different files.
PHO5_pos_sigAlign= '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/eventalign/PHO5_pos_chrII:429000-435000siganlAlign.tsv'
PHO5_neg_sigAlign= '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/eventalign/PHO5_neg_chrII:429000-435000siganlAlign.tsv'

In [28]:
def modelScores(refSeq, sigList, siglenList, sigStart,
                device, model, weights_path, outfile = '',
                kmerWindow = 80, sigWindow = 400, modbase = ''):

    '''
    Score every position of refSeq with the neural-network model and write one
    tab-separated row per position to outfile.

    Position 0 of refSeq aligns to siglenList[sigStart]. For reference position
    `pos` the k-mer is refSeq[pos:pos+kmerWindow] and the signal chunk handed to
    the model is
        sigList[siglenList[sigStart+pos-1] : siglenList[sigStart+pos] - 1 + sigWindow]
    (the chunk starts at index 0 when sigStart+pos-1 is negative).

    Worked example (sigStart = 0, sigWindow = 400):
    012345678
    accGTCGAa
    sigList = [00,01,02,03,04,05,..]
    siglenList = [12,32,51,71,96,26,136]
    GTCGA (idx = 3)-> sigList[51:70+400] -> (siglenList[idx-1], siglenList[idx]-1 + 400)
    TCGAA (idx = 4)-> sigList[71:95+400] -> (siglenList[idx-1], siglenList[idx]-1 + 400)

    Parameters
    ----------
    refSeq : sliceable sequence (string) to scan; one output row per element.
    sigList : sequence of signal values; each sliced value is converted with float().
    siglenList : sequence of signal offsets into sigList; each used value is
        converted with int().
    sigStart : index into siglenList that corresponds to refSeq position 0.
    device, model, weights_path : forwarded unchanged to nntPredict.
    outfile : path of the table to write; it is opened in 'w' mode (truncated).
        The default '' cannot be opened, so callers must supply a path.
    kmerWindow : length of the sequence window reported for each position.
    sigWindow : number of signal values appended past the end of the aligned
        chunk (default 400).
    modbase : base forwarded to baseCount and basePos for the frequency and
        position columns.

    Output columns: seq, prob (nntPredict result), freq (baseCount / len(seq)),
    and the comma-joined values returned by basePos.

    Indexing siglenList fails (IndexError for a list) when it has fewer than
    sigStart + len(refSeq) entries; rows written before that point are kept
    and the file is still closed.
    '''

    # Context manager: the handle is closed even when prediction or indexing
    # raises part-way through the scan.
    with open(outfile, 'w') as outFh:

        for pos in range(len(refSeq)):
            seq = refSeq[pos:pos+kmerWindow]

            # Fetch list of signals that are aligned to seq
            pStart_sigLenList = sigStart+pos-1
            if pStart_sigLenList<0:
                # No previous offset exists: start from the first signal value.
                start=0
            else:
                start = int(siglenList[pStart_sigLenList])
            # Honour sigWindow (this used to be a hard-coded 400 that ignored
            # the parameter; the default keeps the old behaviour).
            end = int(siglenList[sigStart+pos])-1+sigWindow
            signals = [float(s) for s in sigList[start:end]]

            # seq is never empty here because pos < len(refSeq).
            freq = baseCount(seq=seq, base = modbase)/len(seq)
            base_pos = basePos(seq, base = modbase)
            modBasePos = ','.join([str(b) for b in base_pos])

            prob = nntPredict(signals,device = device, model = model, weights_path = weights_path)
            # Progress report every 500 positions.
            if pos%500 == 0:
                print('prob:', prob)
                print('Predicitng modification at position: ', pos)
            out = '{seq}\t{prob}\t{freq}\t{base_pos}\n'.format(seq = seq, prob = prob, freq = freq, base_pos = modBasePos)
            outFh.write(out)
        print('Writing output to outfile')

In [4]:
def tune_siganl(sigList, min_val=50, max_val=130):
    '''
    Clamp every signal value into the range [min_val, max_val].

    Each element of sigList is converted with float() first; values above
    max_val are replaced by max_val and values below min_val by min_val.
    Returns a new list and leaves sigList untouched.
    '''
    clipped = []
    for raw in sigList:
        # Cap from above first, then lift anything still below the floor.
        capped = min(max_val, float(raw))
        clipped.append(max(min_val, capped))
    return clipped

In [33]:
# Run configuration for the per-read prediction loop further down:
# the signal-alignment table to read, the coordinate range handed to
# parseSigAlign, and the prefix used when building output file names.
sigAlignF = PHO5_pos_sigAlign
pStart = 429000
pEnd = 435000
prefix = '231005_PHO5_pos'

In [8]:
# Load the reads of the positive-control BAM for chrII:429000-435000.
# Later cells index the result by read ID (element [1] of each value is compared
# against -1 as the strand) and by the key 'ref' (used as the reference sequence);
# the return structure is inferred from those uses -- TODO confirm against getAlignedReads.
alignment = getAlignedReads(sam = pos_bam, region = 'chrII:429000-435000', genome=ref, print_name=False)

In [6]:
# Map every key of `alignment` to element [1] of its value; the prediction
# loop compares this value to -1 to detect reverse-strand reads.
rstrand = dict((read_id, record[1]) for read_id, record in alignment.items())

In [21]:
# Model weights and output location for the prediction loop.
# ADDSEQ_FN and myweights hold the same path; only myweights is referenced by
# the loop in this part of the notebook.
ADDSEQ_FN = '/private/groups/brookslab/gabai/tools/seqUtils/src/nanopore_classification/best_models/addseq_resnet1d.pt'
# Output directory; it must end with '/' because file names are appended by
# plain string concatenation.
outPath = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/eventalign/'
# Region label embedded in output file names (same coordinates as pStart/pEnd).
myregion = 'chrII:429000-435000'
myweights = '/private/groups/brookslab/gabai/tools/seqUtils/src/nanopore_classification/best_models/addseq_resnet1d.pt'

In [None]:
# Score each read yielded by parseSigAlign for the configured region and write
# one modelScores table per read under outPath.
for readID, sigList, siglenList, sigStart in parseSigAlign(sigAlign=sigAlignF, pStart=pStart, pEnd=pEnd):
    print('Making modification predictions on read: ', readID)
    # Clamp the raw signal values before they reach the model.
    sigList = tune_siganl(sigList)
    # NOTE(review): reads flagged -1 take their target base from ntDict['T'];
    # if ntDict is a complement table this yields 'A' again -- confirm that
    # 'T' (i.e. ntDict['A']) was not intended.
    mbase = ntDict['T'] if rstrand[readID] == -1 else 'A'
    # No separator is inserted between the region label and 'modelScores.tsv'.
    output_file = ''.join([outPath, prefix, '_', readID, '_', myregion, 'modelScores.tsv'])
    modelScores(
        refSeq=alignment['ref'],
        sigList=sigList,
        siglenList=siglenList,
        sigStart=sigStart,
        outfile=output_file,
        device='cpu',
        model=resnet1D,
        weights_path=myweights,
        modbase=mbase,
    )

Making modification predictions on read:  a5ec334f-09d4-428c-9bbb-326aed5f8279
prob: 0.4395545721054077
Predicitng modification at position:  0
prob: 0.4895962178707123
Predicitng modification at position:  500
prob: 0.37250691652297974
Predicitng modification at position:  1000
prob: 0.3945353329181671
Predicitng modification at position:  1500
prob: 0.47208234667778015
Predicitng modification at position:  2000
prob: 0.4961753785610199
Predicitng modification at position:  2500
prob: 0.4657871276140213
Predicitng modification at position:  3000
prob: 0.43315139412879944
Predicitng modification at position:  3500
prob: 0.461865097284317
Predicitng modification at position:  4000
prob: 0.4752632975578308
Predicitng modification at position:  4500
