In [1]:
import sys
sys.path.insert(0, '/private/groups/brookslab/gabai/tools/seqUtils/src/')
import time
import numpy as np
from seqUtil import *
from bamUtil import *
from nanoUtil import *
from nntUtil import *
from modPredict import *
import matplotlib.pyplot as plt
import matplotlib.patches as mplpatches
from collections import defaultdict
from plotUtil import *

Device type:  cpu


In [2]:
nuc_regions = {
    'PHO5': 'chrII:429000-435000',
    'CLN2': 'chrXVI:66000-67550',
    'HMR': 'chrIII:290000-299000',
    'AUA1': 'chrVI:114000-116000',
    'EMW1': 'chrXIV:45000-50000',
    'NRG2': 'chrII:370000-379000',
    'RDN37': 'chrXII:450300-459300'}

In [3]:
myregion = nuc_regions['AUA1']
reg = myregion.split(':')
chrom, pStart, pEnd = reg[0], int(reg[1].split('-')[0]), int(reg[1].split('-')[1])

genome = '/private/groups/brookslab/gabai/projects/Add-seq/data/ref/sacCer3.fa'
chrom_bam = '/private/groups/brookslab/gabai/projects/Add-seq/data/chrom/mapping/all_read.bam'
pos_bam = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/mapping/unique.500.pass.sorted.bam'
neg_bam = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/mapping/unique.0.pass.sorted.bam'

In [4]:
method = 'median'

models = {
    'resnet1D':resnet1D
}
mymodel = models['resnet1D']
myweight =  '/private/groups/brookslab/gabai/tools/seqUtils/src/nanopore_classification/best_models/addseq_resnet1d.pt'

In [5]:
sigAlign_AUA1_chrom = '/private/groups/brookslab/gabai/projects/Add-seq/data/chrom/modPredict/231024_AUA1_chrom_meanScore_medianPos_chrVI:114000-116000siganlAlign.tsv'
sigAlign_AUA1_neg = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/modPredict/231024_AUA1_neg_meanScore_medianPos_chrVI:114000-116000siganlAlign.tsv'
sigAlign_AUA1_pos = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/modPredict/231024_AUA1_pos_meanScore_medianPos_chrVI:114000-116000siganlAlign.tsv'


In [7]:
def aggregate_scors(scores, method):
    if method == 'mean':
        return np.nanmean(scores)
    elif method == 'median':
        return np.nanmedian(scores)
    elif method == 'min':
        return np.nanmin(scores)
    elif method == 'max':
        return np.nanmax(scores)

In [8]:
def nntPredict(signals, device, model, weights_path, sigWindow = 400, method = 'mean'):
    '''
    Given a list of signals, return predicted modification scores.
    '''
    
    model.load_state_dict(torch.load(weights_path, map_location=torch.device(device)))
    model.to(device)
    
    # set input data
    input_tensor = torch.zeros((1, 1, 400)).to(device)
    sequence_tensor = torch.tensor(signals)
    
    probs = []
    
    #Here I omit +1 from len(signals)-sigWindow+1 because len(signals) already has one extra signal that corresponds to next kmer window.
    for sigIdx in range(len(signals)-sigWindow):
        input_tensor[:, :, :] = sequence_tensor[sigIdx:sigIdx+sigWindow]
        prob = model(input_tensor).sigmoid().item()
        probs.append(prob)
    
    return aggregate_scors(probs, method = method)

In [46]:
def exportBedGraph(region, sam, sigAlign, kmerWindow=80, signalWindow=400, threshold = 0.7,
                       modBase = ['AT', 'TA'], genome = genome, model = mymodel, weight = myweight):

    r = 0
    alignment = getAlignedReads(sam = sam, region = region, genome=genome, print_name=False)
    refSeq = alignment['ref']
    all_scores, modCounts, modVars = defaultdict(list), defaultdict(list), defaultdict(list)
    modPositions = basePos(refSeq, base = modBase)
    count = baseCount(refSeq, base = modBase)
    
    reg = region.split(':')
    chrom, pStart, pEnd = reg[0], int(reg[1].split('-')[0]), int(reg[1].split('-')[1])
    
    bins = np.arange(pStart, pEnd, kmerWindow)
    binScores = {bin:0 for bin in bins}
    binCounts = {bin:0 for bin in bins}

    for readID, eventStart, sigList, sigLenList in parseSigAlign(sigAlign):
        start_time = time.time()
        print('Start processing ', readID)
        # strand = alignment[readID][1]
        
        sigLenList_init = pStart-eventStart-1
        if sigLenList_init > len(sigLenList):
            continue
        
        for bin in bins:
            # print('Predicting at position:', bin, '-', bin+kmerWindow)
            
            # 1. Fetch sequences with kmer window size, this step is optional
            seq = refSeq[bin:bin+kmerWindow]
            
            # 2. Fetch signals with signal window size 
            signals = fetchSignal(bin-pStart, sigLenList_init, sigLenList, sigList, kmerWindow, signalWindow)
            if signals == 'del':
                continue
            if signals == 'end':
                break
            # print('fetched signal length', len(signals))
            
            # 3. Get predicted probability score from machine learning model
            prob = nntPredict(signals, device = device, model = model, weights_path = weight)
            if prob > 0.7:
                binScores[bin] +=1
            binCounts[bin] +=1
    return binScores, binCounts

In [47]:
def fetchSignal(pos, sigLenList_init, sigLenList, sigList, kmerWindow, signalWindow):
    '''
    fetchSignal return a list of signals that are aligned to the givnen position.
    input:
        pos: genome position relative to pStart
        sigLenList_init: index in signlenList that aligns to pStart
        sigLenList: a list with length of signals aligned to each genomic position
        sigList: a list of signals generated from one read.
        kmerWindow: length of kmers to fetch signals.
    '''
    #### Explanation of how signals are fetched according to genomic position of kmer ####
    ## genome: ACccgttagctaTAAACGTA, siglenList = [4,10,12,19,29,69, 110, 129, 140, 168], kmerWindow = 5 ##
    ## sigLenList_init = 2 (c), pos = 3 (t), kmer = ttagc ##
    ## sigLenList_startIdx = sigLenList_init+pos-1 = 4, sigLenList_endIdx = pos_sigLenList_start+kmerWindow = 9 ##
    ## sigList_startIdx = int(siglenList[sigLenList_startIdx]) = 29, sigList_endIdx = int(siglenList[sigLenList_endIdx]) = 168 ##
    
    sigLenList_startIdx = int(sigLenList_init)+pos-1
    sigLenList_endIdx = sigLenList_startIdx+kmerWindow
    
    if sigLenList_startIdx<0:
        sigList_startIdx = 0
    else:
        sigList_startIdx = int(sigLenList[sigLenList_startIdx])
    
    if sigLenList_endIdx > len(sigLenList):
        sigLenList_endIdx = -1
    sigList_endIdx = int(sigLenList[sigLenList_endIdx])

    # if no signals aligned to this kmer
    if sigList_startIdx == sigList_endIdx:
        print('del')
        return 'del'
    
    # reached the end of the signal list
    # sigList = [0,1,2,3,...,11,12,13,14,15], signalWindow = 5, start = 12, len(sigList) = 16
    # len(sigList)-end < signalWindow does not matter because python automatically clipps
    if len(sigList)-sigList_startIdx < signalWindow:
        print('end')
        return 'end'
    
    signals = [float(s) for s in sigList[sigList_startIdx:sigList_endIdx]]
    
    return signals

In [None]:
chrom_binScores, chrom_binCounts = exportBedGraph(region = myregion, sam = chrom_bam, sigAlign = sigAlign_AUA1_chrom, genome = genome, 
                                                  model= mymodel, weight = myweight, kmerWindow=75)

Start processing  908f9524-d405-46ca-a596-c4e613de196f
Start processing  9c17f73c-0929-43d6-9925-680bb8336e3b
Start processing  b8c85d5f-6412-4d19-a35e-5968d666ea4b
Start processing  c661c150-cfe9-4051-9e7a-0fdafd3c6648
Start processing  7e999093-192e-402a-9d6c-7897fbe59a58
Start processing  c6860b25-f09f-4414-a55b-760d2fddfffb
Start processing  310bb002-6da6-489c-82a1-ec409985fac4
Start processing  7adc3f78-586b-4067-b5db-8bf8d7f02868
Start processing  80c0673a-a6a4-48b6-9bf9-d8105a895005
Start processing  d8161174-9ff7-45d4-89df-43c637a9d5cc
Start processing  de476561-751c-4603-aaf7-d4037d67f726


In [None]:
out = open('', 'w')
postoscore = {}
for line in open('angelicin_pos_chrX_eventalign_thresh80_fullchr_nokmernum_nosignalmean_normalizedbyreadcount.wig'): #'angelicin_chr_chrX_eventalign_thresh80_nokmernum_normalizedbyreadcount.wig'):
    if line[:3] != 'var':
        line = line.rstrip().split('\t')
        postoscore[int(line[0])] = float(line[1])

chrend = max(postoscore.keys())
for i in range(0, chrend, 10):
    windowscore = 0
    for j in range(i, i+50, 10):
        if j in postoscore: windowscore += postoscore[j]
    if windowscore > 5:
        out.write('chrX\t' + str(i) + '\t' + str(i+50) + '\n')
out.close()

In [None]:
def writeBedGraph(bedGraphHeader, binScores, binCounts, binSize, chrom, outfile, normalize = True):
    outFh = open(outfile, 'w')
    for k,v in bedGraphHeader.items():
        if v:
            line = k + '=' + v + ' '
            outFh.write(line)
    outFh.write('\n')
    for chrStart in binScores.keys():
        chrEnd = chrStart + binSize
        if normalize:
            score = "%.3f" % (binScores[chrStart]/binCounts[chrStart])
        else:
            score = "%.3f" % (binScores[chrStart])
        line = '{chr}\t{start}\t{end}\t{score}\n'.format(chr = chrom, start = chrStart,  end = chrEnd, score = score)
        print(line)
        outFh.write(line)
    outFh.close()

bedGraphHeader = {'track type':'bedGraph', 
                  'name':'chrom_AUA1', 
                  'description':'addseq',
                  'visibility':'', 
                  'color':'r', 
                  'altColor':'r', 
                  'priority':'', 
                  'autoScale':'off', 
                  'alwaysZero':'off', 
                  'gridDefault':'off', 
                  'maxHeightPixels':'default', 
                  'graphType':'bar',
                  'viewLimits':'upper',
                  'yLineMark':'',
                  'yLineOnOff':'on',
                  'windowingFunction':'mean',
                  'smoothingWindow':'on'
                 }

In [None]:
writeBedGraph(bedGraphHeader = bedGraphHeader, binScores = chrom_binScores, binCounts = chrom_binCounts, 
              binSize =75, chrom = chrom, outfile = '../data/chrom/modPredict/231106_AUA1_binLevelPrediction.bedgraph')

In [64]:
chrom_binScores.values()

dict_values([2, 1, 6, 1, 6, 6, 2, 0, 1, 0, 5, 1, 3, 1, 2, 0, 0, 1, 1, 2, 0, 1, 1, 0, 6, 4, 0])

In [65]:
','.join(map(str, chrom_binScores.values()))

'2,1,6,1,6,6,2,0,1,0,5,1,3,1,2,0,0,1,1,2,0,1,1,0,6,4,0'

In [53]:
','.join([v for k,v in chrom_binScores.items()])

TypeError: sequence item 0: expected str instance, int found