In [9]:
from multiprocessing.pool import ThreadPool as Pool
from multiprocessing import Pool
import sys
sys.path.insert(0, '/private/groups/brookslab/gabai/tools/seqUtils/src/')
import time
import numpy as np
from seqUtil import *
from bamUtil import *
from nanoUtil import *
from nntUtil import *
from modPredict import *
import matplotlib.pyplot as plt
import matplotlib.patches as mplpatches
from collections import defaultdict
from plotUtil import *
import threading

In [10]:
nuc_regions = {
    'PHO5': 'chrII:429000-435000',
    'CLN2': 'chrXVI:66000-67550',
    'HMR': 'chrIII:290000-299000',
    'AUA1': 'chrVI:114000-116000',
    'EMW1': 'chrXIV:45000-50000',
    'NRG2': 'chrII:370000-379000',
    'RDN37': 'chrXII:450300-459300'}

In [11]:
myregion = nuc_regions['CLN2']
reg = myregion.split(':')
chrom, pStart, pEnd = reg[0], int(reg[1].split('-')[0]), int(reg[1].split('-')[1])

genome = '/private/groups/brookslab/gabai/projects/Add-seq/data/ref/sacCer3.fa'
chrom_bam = '/private/groups/brookslab/gabai/projects/Add-seq/data/chrom/mapping/all_read.bam'
pos_bam = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/mapping/unique.500.pass.sorted.bam'
neg_bam = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/mapping/unique.0.pass.sorted.bam'

In [12]:
method = 'median'

models = {
    'resnet1D':resnet1D
}
mymodel = models['resnet1D']
myweight =  '/private/groups/brookslab/gabai/tools/seqUtils/src/nanopore_classification/best_models/addseq_resnet1d.pt'

In [13]:
sigAlign_CLN2_chrom = '/private/groups/brookslab/gabai/projects/Add-seq/data/chrom/modPredict/231024_CLN2_chrom_meanScore_medianPos_chrXVI:66000-67550siganlAlign.tsv'
sigAlign_CLN2_neg = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/modPredict/231024_CLN2_neg_meanScore_medianPos_chrXVI:66000-67550siganlAlign.tsv'
sigAlign_CLN2_pos = '/private/groups/brookslab/gabai/projects/Add-seq/data/ctrl/modPredict/231024_CLN2_pos_meanScore_medianPos_chrXVI:66000-67550siganlAlign.tsv'

In [19]:
def nucPredict(readID, eventStart, sigList, sigLenList, pStart, bins, refSeq, kmerWindow, signalWindow, threshold,
               device, model, weight):
    print('Start processing ', readID)
    
    start_time = time.time()
    sigLenList_init = pStart-eventStart-1
    binScores = {bin:0 for bin in bins}
    binCounts = {bin:0 for bin in bins}
    
    for bin in bins:
        # print('Predicting at position:', bin, '-', bin+kmerWindow)
        # 1. Fetch sequences with kmer window size, this step is optional
        seq = refSeq[bin:bin+kmerWindow]
        print(seq[:10])
        # 2. Fetch signals with signal window size 
        signals = fetchSignal(bin-pStart, sigLenList_init, sigLenList, sigList, kmerWindow, signalWindow)
        if signals == 'del':
            continue
        if signals == 'end':
            break
        # 3. Get predicted probability score from machine learning model
        prob = nntPredict(signals, device = device, model = model, weights_path = weight)
        if prob > threshold:
            binScores[bin]+=1
        binCounts[bin] +=1

    total_time = "%.2f" % (start_time-time.time())
    
    print('finished processing ', readID, ' in ', total_time, 's.')
    
    return binScores, binCounts

In [14]:
def process_read(worker_input):
    (readID, eventStart, sigList, sigLenList, pStart, bins, refSeq, 
     kmerWindow, signalWindow, threshold, device, model, weight) = worker_input
    
    binScores, binCounts = nucPredict(readID, eventStart, sigList, sigLenList, pStart, bins, refSeq, kmerWindow, signalWindow, threshold, device, model, weight)

    return binScores, binCounts

In [17]:
def exportBedGraph(region, sam, sigAlign, kmerWindow=75, signalWindow=400, threshold = 0.7, modBase = ['AT', 'TA'], 
                   genome = genome, model = mymodel, weight = myweight, device = device, pool_size = 3):

    alignment = getAlignedReads(sam = sam, region = region, genome=genome, print_name=False)
    refSeq = alignment['ref']    
    reg = region.split(':')
    chrom, pStart, pEnd = reg[0], int(reg[1].split('-')[0]), int(reg[1].split('-')[1])
    
    bins = np.arange(pStart, pEnd, kmerWindow)

   
    num_processes = min(pool_size, multiprocessing.cpu_count())  # Use the specified pool size or maximum available CPU cores
    pool = multiprocessing.Pool(processes=num_processes)

    # Prepare worker input data for parallel processing
    worker_inputs = [(readID, eventStart, sigList, sigLenList, pStart, bins, refSeq, kmerWindow, signalWindow, 
                      threshold, device, model, weight) for readID, eventStart, sigList, sigLenList in parseSigAlign(sigAlign)]

    # Use the pool.map() function to process reads in parallel
    results = pool.map(process_read, worker_inputs)

    # Close the pool to release resources
    pool.close()
    pool.join()

    return results

In [15]:
results = exportBedGraph(region = myregion, sam = chrom_bam, sigAlign = sigAlign_CLN2_chrom, genome = genome, 
                                                  model= mymodel, weight = myweight, kmerWindow=75)




Start processing  c22b0e4d-d5f7-45db-96dc-ae2e46248e2a

Start processing  cb52cb95-f5e4-4f15-be1a-aa93f7f80dd0

Start processing  04adb81d-a602-44eb-a4c1-ab3308fff401

Start processing  5efe244d-f0e8-4d5d-a795-bdb90dc93647

Start processing  1a91bbd8-08a4-4b67-9f08-36b760c4fb08

Start processing  298e1adf-29ad-49fb-a560-30dc63c83e2f

finished processing  298e1adf-29ad-49fb-a560-30dc63c83e2f  in  -0.00 s.
Start processing  1ca6d93d-4a00-4024-9768-4c71c4ba43a8

Start processing  c87409f3-7200-43eb-a045-721f62fdb97d

Start processing  f3c2d464-6808-40e9-a4ab-aa7d2cb71d41

Start processing  5c6bca55-e254-492b-8011-9ed0bba1abea

Start processing  62428c75-fc1c-41ad-a2b8-7a665227b7c1

Start processing  6c41591e-6437-4239-9fb2-c858e9bb1af9

Start processing  cb957840-9eab-4204-bfea-55328302a96f

Start processing  1ed9b5ca-03c0-4a28-805a-516024d770b4

Start processing  c09ba610-bed7-44f7-8b51-201e43084b4c

Start processing  1df6b859-227e-47c2-990d-2292a7d7e27c

Start processing  f81df8cb-02

Exception in thread Exception in threading.excepthook:
Exception ignored in thread started by: <bound method Thread._bootstrap of <Thread(Thread-5, stopped 140595888404032)>>
Traceback (most recent call last):
  File "/private/groups/brookslab/gabai/miniconda3/envs/addseq/lib/python3.9/threading.py", line 937, in _bootstrap
    self._bootstrap_inner()
  File "/private/groups/brookslab/gabai/miniconda3/envs/addseq/lib/python3.9/threading.py", line 982, in _bootstrap_inner
    self._invoke_excepthook(self)
  File "/private/groups/brookslab/gabai/miniconda3/envs/addseq/lib/python3.9/threading.py", line 1264, in invoke_excepthook
    local_print("Exception in threading.excepthook:",
  File "/private/groups/brookslab/gabai/miniconda3/envs/addseq/lib/python3.9/site-packages/ipykernel/iostream.py", line 575, in flush
    self.pub_thread.schedule(self._flush)
  File "/private/groups/brookslab/gabai/miniconda3/envs/addseq/lib/python3.9/site-packages/ipykernel/iostream.py", line 267, in schedule

















































































































































































KeyboardInterrupt: 































































In [128]:
def writeBedGraph(bedGraphHeader, binScores, binCounts, binSize, chrom, outfile, normalize = False):
    outFh = open(outfile, 'w')
    for k,v in bedGraphHeader.items():
        if v:
            line = k + '=' + v + ' '
            outFh.write(line)
    outFh.write('\n')
    for chrStart in binScores.keys():
        chrEnd = chrStart + binSize
        if normalize:
            score = "%.3f" % (binScores[chrStart]/binCounts[chrStart])
        else:
            score = "%.3f" % (binScores[chrStart])
        line = '{chr}\t{start}\t{end}\t{score}\n'.format(chr = chrom, start = chrStart,  end = chrEnd, score = score)
        print(line)
        outFh.write(line)
    outFh.close()

bedGraphHeader = {'track type':'bedGraph', 
                  'name':'chrom_bin75', 
                  'description':'addseq',
                  'visibility':'', 
                  'color':'r', 
                  'altColor':'r', 
                  'priority':'', 
                  'autoScale':'off', 
                  'alwaysZero':'off', 
                  'gridDefault':'off', 
                  'maxHeightPixels':'default', 
                  'graphType':'bar',
                  'viewLimits':'upper',
                  'yLineMark':'',
                  'yLineOnOff':'on',
                  'windowingFunction':'mean',
                  'smoothingWindow':'on'
                 }

In [129]:
writeBedGraph(bedGraphHeader = bedGraphHeader, binScores = chrom_binScores, binCounts = chrom_binCounts, normalize=True,
              binSize =75, chrom = chrom, outfile = '../data/chrom/modPredict/231106_CLN2_binLevelPrediction.bedgraph')

chrXVI	66000	66075	0.448

chrXVI	66075	66150	0.481

chrXVI	66150	66225	0.536

chrXVI	66225	66300	0.460

chrXVI	66300	66375	0.469

chrXVI	66375	66450	0.423

chrXVI	66450	66525	0.504

chrXVI	66525	66600	0.453

chrXVI	66600	66675	0.533

chrXVI	66675	66750	0.492

chrXVI	66750	66825	0.455

chrXVI	66825	66900	0.518

chrXVI	66900	66975	0.468

chrXVI	66975	67050	0.463

chrXVI	67050	67125	0.456

chrXVI	67125	67200	0.498

chrXVI	67200	67275	0.459

chrXVI	67275	67350	0.506

chrXVI	67350	67425	0.444

chrXVI	67425	67500	0.531

chrXVI	67500	67575	0.519

