In [None]:
import sys
sys.path.insert(0, '/private/groups/brookslab/gabai/tools/seqUtils/src/')
import time
import numpy as np
from seqUtil import *
from bamUtil import *
from nanoUtil import *
from nntUtil import *
from modPredict import *
import matplotlib.pyplot as plt
import matplotlib.patches as mplpatches
from collections import defaultdict 

In [8]:
# Named loci, each given as a 'chrom:start-end' region string.
nuc_regions = dict(
    PHO5='chrII:429000-435000',
    CLN2='chrXVI:66000-67550',
    HMR='chrIII:290000-299000',
    AUA1='chrVI:114000-116000',
    EMW1='chrXIV:45000-50000',
    NRG2='chrII:370000-379000',
    RDN37='chrXII:450300-459300',
)

In [9]:
# Select the locus to analyse and unpack 'chrom:start-end' into its parts.
myregion = nuc_regions['CLN2']
reg = myregion.split(':')
chrom = reg[0]
# Only the first two '-'-separated fields are used, converted to integers.
pStart, pEnd = map(int, reg[1].split('-')[:2])

In [10]:
# Input files (absolute paths on the lab file system).
# genome: reference FASTA, used as the default `genome` argument of exportBedGraph.
genome = '/private/groups/brookslab/gabai/projects/Add-seq/data/ref/sacCer3.fa'
# chrom_bam: BAM passed as `sam` to exportBedGraph in the driver cell below.
chrom_bam = '/private/groups/brookslab/gabai/projects/Add-seq/data/chrom/mapping/all_read.bam'
# pos_bam / neg_bam: presumably positive- and negative-control alignments (judging by
# the names only) -- neither is referenced by the cells visible in this notebook.
pos_bam = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/mapping/unique.500.pass.sorted.bam'
neg_bam = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/mapping/unique.0.pass.sorted.bam'

In [11]:
# `method` is not referenced by the cells visible in this notebook.
method = 'median'

# Registry of available model classes, keyed by name.
models = dict(resnet1D=resnet1D)

# Model and weights used as the defaults of exportBedGraph.
mymodel = models['resnet1D']
myweight = '/private/groups/brookslab/gabai/tools/seqUtils/src/nanopore_classification/best_models/addseq_resnet1d.pt'

In [12]:
# Signal-alignment TSV files for the CLN2 region (chrXVI:66000-67550).
# NOTE: 'siganlAlign' (sic) is part of the file names as written here; do not
# correct the spelling unless the files on disk are renamed as well.
# Only sigAlign_CLN2_chrom is consumed by the driver cell visible in this notebook.
sigAlign_CLN2_chrom = '/private/groups/brookslab/gabai/projects/Add-seq/data/chrom/modPredict/231024_CLN2_chrom_meanScore_medianPos_chrXVI:66000-67550siganlAlign.tsv'
sigAlign_CLN2_neg = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/modPredict/231024_CLN2_neg_meanScore_medianPos_chrXVI:66000-67550siganlAlign.tsv'
sigAlign_CLN2_pos = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/modPredict/231024_CLN2_pos_meanScore_medianPos_chrXVI:66000-67550siganlAlign.tsv'

In [19]:
def fetchSignal(pos, sigLenList_init, siglenList, sigList, signalWindow):
    '''
    fetchSignal returns the list of signals that are aligned to the given position.
    input:
        pos: genome position relative to pStart
        sigLenList_init: index in siglenList that aligns to pStart
        siglenList: a list of signal offsets into sigList, one per genomic position;
            the signals of a position span siglenList[i] .. siglenList[i+1],
            where i = sigLenList_init + pos (offset 0 is used when i is -1)
        sigList: a list of signals generated from one read.
        signalWindow: length of signals to feed into prediction model
    output:
        'del'  if no signal is aligned to this position, including positions
               that lie before the first entry of siglenList
        'end'  if the position lies beyond the last entry of siglenList, or fewer
               than signalWindow signals remain after the start offset
        otherwise sigList[start:end+signalWindow] converted to a list of floats
    '''

    start_idx = int(sigLenList_init) + pos
    end_idx = start_idx + 1

    # The position precedes the first entry of siglenList. Without this guard the
    # negative end index would wrap around and pick an offset from the tail of
    # siglenList, returning signals that belong to an unrelated position.
    if end_idx < 0:
        return 'del'

    # The position lies past the last entry of siglenList: report the end of the
    # read instead of raising IndexError.
    if end_idx >= len(siglenList):
        return 'end'

    # start_idx can only be -1 here; the first position starts at signal offset 0.
    start = int(siglenList[start_idx]) if start_idx >= 0 else 0
    end = int(siglenList[end_idx])

    # if no signals aligned to this position
    if start == end:
        return 'del'

    # reached the end of the signal list
    # sigList = [0,1,2,3,...,11,12,13,14,15], signalWindow = 5, start = 12, len(sigList) = 16
    # an end+signalWindow beyond len(sigList) is harmless because Python clips slices
    if len(sigList) - start < signalWindow:
        return 'end'

    return [float(s) for s in sigList[start:end + signalWindow]]

In [108]:
def exportBedGraph(region, sam, sigAlign, kmerWindow=80, signalWindow=400, binSize = 75,
                       modBase = ['AT', 'TA'], genome = genome, model = mymodel, weight = myweight):
    '''
    Accumulate per-bin prediction scores for every read in a signal-alignment file.
    input:
        region: genomic region as 'chrom:start-end'
        sam: alignment file passed to getAlignedReads
        sigAlign: signal-alignment file passed to parseSigAlign
        kmerWindow: currently unused; kept so existing callers keep working
        signalWindow: length of signals to feed into prediction model
        binSize: width of each bin, in genomic positions
        modBase: currently unused; kept so existing callers keep working
        genome: reference genome passed to getAlignedReads
        model, weight: model and weights path passed to nntPredict
    output:
        binScores: dict, bin start coordinate -> sum of predicted scores in the bin
        binCounts: dict, bin start coordinate -> number of predictions in the bin

    NOTE: the genome/model/weight defaults are evaluated once, when this cell is
    run; re-run the cell after changing the corresponding notebook globals.
    '''

    alignment = getAlignedReads(sam = sam, region = region, genome=genome, print_name=False)
    refSeq = alignment['ref']

    coords = region.split(':')[1].split('-')
    pStart, pEnd = int(coords[0]), int(coords[1])

    bins = np.arange(pStart, pEnd, binSize)
    binScores = {binStart: 0 for binStart in bins}
    binCounts = {binStart: 0 for binStart in bins}

    for readID, eventStart, sigList, siglenList in parseSigAlign(sigAlign):
        print(readID)
        print('Start processing ', readID)
        # strand is not consumed below; the lookup is kept because it presumably
        # fails fast when the read is absent from the alignment -- TODO confirm.
        strand = alignment[readID][1]

        sigLenList_init = pStart-eventStart-1
        first_idx = int(sigLenList_init)
        n_offsets = len(siglenList)

        # fetchSignal reads siglenList[first_idx+pos] and siglenList[first_idx+pos+1];
        # skip reads for which even pos = 0 would fall outside siglenList.
        if first_idx + 1 >= n_offsets:
            continue

        for pos in range(len(refSeq)):
            if pos % 500 == 0:
                print('Predicting at position:', pos)

            end_idx = first_idx + pos + 1
            # Position precedes the first entry of siglenList: nothing to score yet.
            if end_idx < 0:
                continue
            # Position lies past the last entry of siglenList: done with this read.
            if end_idx >= n_offsets:
                break

            # Fetch signals with signal window size
            signals = fetchSignal(pos, sigLenList_init, siglenList, sigList, signalWindow)
            if signals == 'del':
                continue
            elif signals == 'end':
                break

            # Get predicted probability score from machine learning model.
            # NOTE(review): `device` is not defined in this notebook; presumably it
            # is provided by one of the star-imports -- confirm.
            prob = nntPredict(signals, device = device, model = model, weights_path = weight)

            # side='right' makes idx-1 the bin whose start is <= the current coordinate
            idx = np.searchsorted(bins, pStart+pos, side='right')
            binScores[bins[idx-1]] += prob
            binCounts[bins[idx-1]] += 1
    return binScores, binCounts

def writeBedGraph(bedGraphHeader, binScores, binCounts, binSize, chrom, outfile):
    '''
    Write the mean score of every bin to a bedGraph file.
    input:
        bedGraphHeader: dict of track-line attributes; entries with an empty
            value are left out of the header
        binScores: dict, bin start coordinate -> summed score
        binCounts: dict, bin start coordinate -> number of scores summed
        binSize: width of each bin; the end coordinate is start + binSize
        chrom: chromosome name written in the first column
        outfile: path of the file to create (overwritten if it exists)
    Bins without any score (count of 0) are not written, since no mean can be
    computed for them.
    '''
    header_fields = []
    for key, value in bedGraphHeader.items():
        if not value:
            continue
        value = str(value)
        # Track-line values that contain spaces must be quoted, otherwise the text
        # after the first space is parsed as a separate attribute.
        already_quoted = value.startswith('"') and value.endswith('"')
        if any(ch.isspace() for ch in value) and not already_quoted:
            value = '"' + value + '"'
        header_fields.append(key + '=' + value)

    # The context manager closes the file even if formatting a line fails.
    with open(outfile, 'w') as outFh:
        outFh.write(' '.join(header_fields))
        outFh.write('\n')
        # Iterate over the binScores argument, not a notebook-level global.
        for chrStart in binScores.keys():
            count = binCounts.get(chrStart, 0)
            if not count:
                continue
            chrEnd = chrStart + binSize
            score = "%.3f" % (binScores[chrStart]/count)
            line = '{chr}\t{start}\t{end}\t{score}\n'.format(chr = chrom, start = chrStart,  end = chrEnd, score = score)
            outFh.write(line)

In [106]:
# Track-line attributes for the bedGraph output, in the order they are written.
# Attributes whose value is an empty string are skipped by writeBedGraph.
bedGraphHeader = dict([
    ('track type', 'bedGraph'),
    ('name', 'chrom bin=75bp'),
    ('description', 'addseq'),
    ('visibility', ''),
    ('color', 'r'),
    ('altColor', 'r'),
    ('priority', ''),
    ('autoScale', 'off'),
    ('alwaysZero', 'off'),
    ('gridDefault', 'off'),
    ('maxHeightPixels', 'default'),
    ('graphType', 'bar'),
    ('viewLimits', 'upper'),
    ('yLineMark', ''),
    ('yLineOnOff', 'on'),
    ('windowingFunction', 'mean'),
    ('smoothingWindow', 'on'),
])

In [None]:
# Score the reads over the selected region in 75-position bins, then write the
# per-bin mean score (sum / count) to a bedGraph file.
chrom_binScores, chrom_binCounts = exportBedGraph(
    region = myregion,
    sam = chrom_bam,
    sigAlign = sigAlign_CLN2_chrom,
    binSize = 75,
)
writeBedGraph(
    bedGraphHeader = bedGraphHeader,
    binScores = chrom_binScores,
    binCounts = chrom_binCounts,
    binSize = 75,
    chrom = chrom,
    outfile = '../data/chrom/modPredict/231102_CLN2.bedgraph',
)

c22b0e4d-d5f7-45db-96dc-ae2e46248e2a
Start processing  c22b0e4d-d5f7-45db-96dc-ae2e46248e2a
Predicting at position: 0
Predicting at position: 500
Predicting at position: 1000
Predicting at position: 1500
cb52cb95-f5e4-4f15-be1a-aa93f7f80dd0
Start processing  cb52cb95-f5e4-4f15-be1a-aa93f7f80dd0
Predicting at position: 0
Predicting at position: 500
Predicting at position: 1000
Predicting at position: 1500
04adb81d-a602-44eb-a4c1-ab3308fff401
Start processing  04adb81d-a602-44eb-a4c1-ab3308fff401
Predicting at position: 0
Predicting at position: 500
