In [1]:
import sys
sys.path.insert(0, '/private/groups/brookslab/gabai/tools/seqUtils/src/')
import time
import numpy as np
from seqUtil import *
from bamUtil import *
from nanoUtil import *
from nntUtil import *
from modPredict import *
import matplotlib.pyplot as plt
import matplotlib.patches as mplpatches
from collections import defaultdict 

Device type:  cpu


In [5]:
from plotUtil import *

In [2]:
# Locus name -> genomic region string in 'chrom:start-end' form.
nuc_regions = dict(
    PHO5='chrII:429000-435000',
    CLN2='chrXVI:66000-67550',
    HMR='chrIII:290000-299000',
    AUA1='chrVI:114000-116000',
    EMW1='chrXIV:45000-50000',
    NRG2='chrII:370000-379000',
    RDN37='chrXII:450300-459300',
)

In [3]:
# Region analysed in this notebook, split into its chromosome name and
# integer start/end coordinates.
myregion = nuc_regions['CLN2']
reg = myregion.split(':')
chrom = reg[0]
pStart, pEnd = (int(coord) for coord in reg[1].split('-')[:2])

# Reference genome FASTA.
genome = '/private/groups/brookslab/gabai/projects/Add-seq/data/ref/sacCer3.fa'

# Alignment (BAM) files: the chromatin sample and the two control samples.
# NOTE(review): "pos"/"neg" presumably mean positive/negative controls — confirm.
chrom_bam = '/private/groups/brookslab/gabai/projects/Add-seq/data/chrom/mapping/all_read.bam'
pos_bam = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/mapping/unique.500.pass.sorted.bam'
neg_bam = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/mapping/unique.0.pass.sorted.bam'

In [4]:
# NOTE(review): `method` is not referenced by any cell visible here — confirm it is still needed.
method = 'median'

# Registry of available model constructors, keyed by name, and the one used below.
models = dict(resnet1D=resnet1D)
mymodel = models['resnet1D']

# Path to the trained weights loaded for `mymodel`.
myweight = (
    '/private/groups/brookslab/gabai/tools/seqUtils/src/'
    'nanopore_classification/best_models/addseq_resnet1d.pt'
)

In [6]:
# Signal-alignment TSV files for the CLN2 region (chrXVI:66000-67550),
# one per sample: chromatin, negative control, positive control.
sigAlign_CLN2_chrom = (
    '/private/groups/brookslab/gabai/projects/Add-seq/data/chrom/modPredict/'
    '231024_CLN2_chrom_meanScore_medianPos_chrXVI:66000-67550siganlAlign.tsv'
)
sigAlign_CLN2_neg = (
    '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/modPredict/'
    '231024_CLN2_neg_meanScore_medianPos_chrXVI:66000-67550siganlAlign.tsv'
)
sigAlign_CLN2_pos = (
    '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/modPredict/'
    '231024_CLN2_pos_meanScore_medianPos_chrXVI:66000-67550siganlAlign.tsv'
)

In [18]:
def exportBedGraphBase(region, sam, sigAlign, kmerWindow=80, signalWindow=400, binSize = 75, threshold = 0.65,
                       modBase = ['AT', 'TA'], genome = genome, model = mymodel, weight = myweight,
                       maxPredictions = 2):
    """Count per-bin model predictions over a region, one reference position at a time.

    For every read yielded by ``parseSigAlign(sigAlign)`` the function walks
    the reference sequence of ``region``, fetches the signal window for each
    position with ``fetchSignal``, scores it with ``nntPredict`` and tallies
    the result into fixed-width genomic bins.

    Parameters
    ----------
    region : str
        Region string of the form ``'chrom:start-end'``.
    sam : str
        Alignment file passed to ``getAlignedReads``.
    sigAlign : str
        Signal-alignment file passed to ``parseSigAlign``.
    kmerWindow : int
        Accepted for interface compatibility; not used by this implementation.
    signalWindow : int
        Window size forwarded to ``fetchSignal``.
    binSize : int
        Width of each bin; bin starts are ``np.arange(start, end, binSize)``.
    threshold : float
        A prediction counts towards ``binScores`` when its score is strictly
        greater than this value.
    modBase : list of str
        Accepted for interface compatibility; not used by this implementation.
    genome : str
        Reference genome passed to ``getAlignedReads``.
    model, weight :
        Model and weights path forwarded to ``nntPredict``.
    maxPredictions : int or None
        After each read, stop processing further reads once the cumulative
        number of predictions exceeds this value. The default of ``2``
        reproduces the previous hard-coded cut-off (in practice: stop after
        the first read that yields more than two predictions). Pass ``None``
        to process every read.

    Returns
    -------
    (binScores, binCounts) : tuple of dict
        Both are keyed by bin start coordinate. ``binCounts`` holds the number
        of positions scored in the bin; ``binScores`` holds how many of those
        scored above ``threshold``.

    Notes
    -----
    Relies on a module-level ``device`` that is not defined in this function
    (presumably supplied by one of the notebook's star imports — confirm).
    """
    alignment = getAlignedReads(sam = sam, region = region, genome=genome, print_name=False)
    refSeq = alignment['ref']

    reg = region.split(':')
    pStart, pEnd = int(reg[1].split('-')[0]), int(reg[1].split('-')[1])

    bins = np.arange(pStart, pEnd, binSize)
    binScores = {binStart:0 for binStart in bins}
    binCounts = {binStart:0 for binStart in bins}

    nPredictions = 0
    for readID, eventStart, sigList, siglenList in parseSigAlign(sigAlign):
        print(readID)
        print('Start processing ', readID)
        # The strand is not used below; the lookup is kept so that a read
        # missing from the alignment still fails here, as it did before.
        strand = alignment[readID][1]

        sigLenList_init = pStart-eventStart-1
        if sigLenList_init > len(siglenList):
            continue
        for pos in range(len(refSeq)):
            if pos % 500 == 0:
                print('Predicting at position:', pos)

            # Fetch signals with signal window size.
            signals = fetchSignal(pos, sigLenList_init, siglenList, sigList, signalWindow)
            # fetchSignal reports 'del' / 'end' as string sentinels. Only compare
            # when the value really is a string, so that an array result is never
            # compared elementwise against a string and then truth-tested.
            if isinstance(signals, str):
                if signals == 'del':
                    continue
                if signals == 'end':
                    break

            # Get predicted probability score from machine learning model.
            prob = nntPredict(signals, device = device, model = model, weights_path = weight)

            # Bin whose start is the largest bin start <= the genomic position.
            binStart = bins[np.searchsorted(bins, pStart+pos, side='right') - 1]
            if prob > threshold:
                print(prob)
                binScores[binStart] +=1
            binCounts[binStart] +=1

            nPredictions +=1
        if maxPredictions is not None and nPredictions > maxPredictions:
            break
    return binScores, binCounts

In [20]:
# Bin predictions for the chromatin sample over `myregion`, then write them out.
# NOTE(review): this calls `exportBedGraph`, not the `exportBedGraphBase` defined
# in this notebook; `exportBedGraph`, `writeBedGraph` and `bedGraphHeader` are
# presumably supplied by the star imports — confirm which function is intended.
chrom_binScores, chrom_binCounts = exportBedGraph(
    region = myregion,
    sam = chrom_bam,
    sigAlign = sigAlign_CLN2_chrom,
    genome = genome,
    model = mymodel,
    weight = myweight,
    binSize = 75,
)
# The binSize passed here matches the binSize used for the binning call above.
writeBedGraph(
    bedGraphHeader = bedGraphHeader,
    binScores = chrom_binScores,
    binCounts = chrom_binCounts,
    binSize = 75,
    chrom = chrom,
    outfile = '../data/chrom/modPredict/231102_CLN2_sub5.bedgraph',
)

c22b0e4d-d5f7-45db-96dc-ae2e46248e2a
Start processing  c22b0e4d-d5f7-45db-96dc-ae2e46248e2a
Predicting at position: 0
0.7059680118280298
0.7021181881427765
0.6572225913405418
0.6809073463082314
0.6559441540692303
0.6512668473379952
0.6532499194145203
0.6878413387707302
0.6517612040042877
0.6501070976257324
0.6742518088396858
0.6519956913861361
0.664443626999855
0.6732080349555383
0.6763992071151733
0.6997683569788933
0.714580462737517
0.6799063713927018
0.6855859955151876
0.6559280753135681
0.7620420098304749
0.7553351095744542
0.7485526502132416
0.7672503803457532
0.7602008921759469
0.7384185261196561
0.6504851520061493
Predicting at position: 500
0.659270649154981
0.6665297746658325
0.6570153459906578
0.7032953053712845
0.6878595935801665
0.6790269679493375
0.6615497680271373
0.6567607588238187
0.6889681432928357
0.7865404188632965
0.7702659144997597
0.7741486579179764
0.7327289352050195
0.7549794912338257
0.6988308802247047
0.7121949593226115
0.7460262945720127
0.693126251300176
0.7