In [1]:
import multiprocessing
import sys
import time

import matplotlib.patches as mplpatches
import matplotlib.pyplot as plt
import numpy as np
import torch

In [111]:
sys.path.insert(0, '/private/groups/brookslab/gabai/tools/seqUtils/src/')
from seqUtil import *
from nanoUtil import *
from bamUtil import *
from nntUtil import *

In [145]:
# Prediction settings shared by the prototype cells below.
# (A bare `chrom, qStart, qEnd` expression used to open this cell; it was not the
# last statement of the cell, so it displayed nothing, and has been removed.)
step = 40            # spacing between consecutive bin start positions
signalWindow = 400   # forwarded to runNNT as `signalWindow`
kmerWindow = 75      # forwarded to runNNT as `kmerWindow`

# NOTE: qStart / qEnd are assigned by the getAlignedReads() call in the input-files
# cell, so that cell has to run before this line can succeed.
bins = np.arange(qStart, qEnd, step)

# Registry of available model constructors, keyed by name.
models = {
            'resnet1D':resnet1D
            }
mymodel = models['resnet1D']
model = mymodel

# Weights file handed to runNNT together with `model`.
myweight =  '/private/groups/brookslab/gabai/tools/seqUtils/src/nanopore_classification/best_models/addseq_resnet1d.pt'
weight = myweight

In [None]:
# input files
bam = '/private/groups/brookslab/gabai/projects/Add-seq/data/chrom/mapping/chrom.sorted.bam'
genome = '/private/groups/brookslab/gabai/projects/Add-seq/data/ref/sacCer3.fa'
chrII_evt = '/data/scratch/gabai/addseq_data/eventalign/chrII.eventalign.txt'

# output files
# Path fixed: the root was misspelled '/prfivate/', every other path here lives under '/private/'.
chrII_sig = '/private/groups/brookslab/gabai/projects/Add-seq/data/chrom/eventalign/rand50_chrII.sigAlign.tsv'

# Reads aligned to chrII plus the chromosome name and the start/end of the queried region.
chrII_alignment, chrom, qStart, qEnd = getAlignedReads(bam, region = 'chrII', genome = genome)

# Convert the eventalign file into the sigAlign file. `eventAlign` must be the
# eventalign path (chrII_evt); the alignment dict was previously passed here by mistake.
# NOTE(review): findNemo.__init__ calls parseEventAlign with `reads=` instead of
# `alignment=` -- confirm which keyword the current seqUtils version expects.
parseEventAlign(eventAlign = chrII_evt, alignment = chrII_alignment, outfile = chrII_sig)

In [172]:
# Bin start positions from qStart up to (but not including) qEnd, one every `step`.
# Same expression as in the settings cell; qStart / qEnd are assigned by the
# getAlignedReads() call in the input-files cell, so this must run after it.
bins = np.arange(qStart, qEnd, step)

In [168]:
class findNemo:

    '''
    class findNemo: predict small molecule modifications from nanopore long-read sequencing data.

    Typical use: construct with a region and input files (this writes the read
    index table and resolves the sigAlign file), then call modPredict() to score
    every read and write the prediction files.
    '''

    def __init__(self, region, bam, genome, outpath, prefix, eventalign = '', sigalign = ''):

        '''
        Fetch the reads aligned to `region`, write a read-name -> index table and
        decide which sigAlign file modPredict() will read.

        Attributes set:
            self.alignment: dict returned by getAlignedReads, stores reference and reads mapped to region.
            self.chrom: str, chromosome name.
            self.qStart: int, query start position.
            self.qEnd: int, query end position.
            self.region, self.outpath, self.prefix: the corresponding arguments, kept for output file naming.
            self.reads: dict, read name -> integer index (first key of self.alignment is skipped).
            self.sigalign: str, path of the sigAlign file, or '' when none could be resolved.
        Input:
            region: genomic coordinates to perform modification predictions. Format: 'chrI:2000-5000' or 'chrI'.
            bam: sorted, indexed, and binarized alignment file.
            genome: reference genome fasta file.
            outpath: path to store the output files. It is concatenated directly with
                     prefix, so it should end with a path separator.
            prefix: prefix attached to the output file names.
            eventalign: nanopolish eventalign file.
            sigalign: sigalign file if sigAlign file already exist. If not, must provide eventalign to generate sigAlign file.
        Output:
            <outpath><prefix><region>_readID.tsv: one 'read name<TAB>index' line per read.
            <outpath><prefix><region>_sig.tsv: written by parseEventAlign, only when
                                               eventalign is given and sigalign is not.
        '''

        # Fetch reads aligned to the region
        self.alignment, self.chrom, self.qStart, self.qEnd = getAlignedReads(bam, region = region, genome = genome)
        # Kept on the instance because modPredict() builds its output names from them.
        self.region = region
        self.outpath = outpath
        self.prefix = prefix

        # Index reads to avoid storing the long readnames.
        # The first key is skipped -- presumably the reference entry; confirm against getAlignedReads.
        self.reads = {read: index for index, read in enumerate(list(self.alignment)[1:])}
        # Store the id index match into a file.
        with open(outpath + prefix + region + '_readID.tsv', 'w') as readFh:
            for read, index in self.reads.items():
                readFh.write('{read}\t{index}\n'.format(read = read, index = index))
        print(len(self.reads), " reads mapped to ", region)

        if sigalign:
            self.sigalign = sigalign
        elif eventalign:
            self.sigalign = outpath + prefix + str(region) + '_sig.tsv'
            # NOTE(review): the prototype cell calls parseEventAlign with `alignment=`
            # rather than `reads=` -- confirm which keyword seqUtils expects.
            parseEventAlign(eventAlign = eventalign, reads = self.reads, outfile = self.sigalign)
        else:
            print('Error: None of sigalign or eventalign files are provided!')
            # Always define the attribute so modPredict() can fail with a clear message.
            self.sigalign = ''

    @staticmethod
    def doWork(work):

        '''
        Score a single read; this is the function mapped over the worker pool.

        It is a staticmethod so that it can be called both as findNemo.doWork(work)
        and self.doWork(work), and so that handing it to the pool does not drag the
        whole instance along.

        Input:
            work: 12-tuple (readID, bins, step, aStart, aEnd, sigList, sigLenList,
                  kmerWindow, signalWindow, device, model, weight).
        Output:
            scores: whatever runNNT returns for this read (modPredict treats it as a
                    mapping of bin start -> score).
        '''

        (readID, bins, step, aStart, aEnd, sigList, sigLenList, kmerWindow, signalWindow, device, model, weight) = work

        scores = runNNT(readID, bins, step, aStart, aEnd, sigList, sigLenList, kmerWindow, signalWindow, device, model, weight)

        return scores

    def modPredict(self, model, weight, threads, step=40, kmerWindow=75, signalWindow=400, load = 1000):

        '''
        Score every read of the sigAlign file and write the predictions in batches.

        Input:
            model: model passed through to runNNT.
            weight: model weights passed through to runNNT.
            threads: requested number of worker processes (capped at the CPU count).
            step: spacing between consecutive bin start positions.
            kmerWindow: passed through to runNNT.
            signalWindow: passed through to runNNT.
            load: number of reads processed, and written, per batch.
        Output:
            One file per batch named <outpath><prefix><region>_<x>_prediction.tsv, where
            x is the index of the first read of the batch. Each line is
            'readID<TAB>first bin start<TAB>comma separated scores'.
        Raises:
            ValueError: no sigAlign file is available, or load is smaller than 1.
        '''

        if not self.sigalign:
            raise ValueError('No sigalign file available: provide sigalign or eventalign when constructing findNemo.')
        if load < 1:
            raise ValueError('load must be a positive integer.')

        # Bins are spaced by `step`, the same spacing that is handed to runNNT.
        bins = np.arange(self.qStart, self.qEnd, step)

        device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
        print('Device type: ', device)

        # Use the specified threads number or maximum available CPU cores
        num_processes = min(threads, multiprocessing.cpu_count())

        # Total work to be done are stored in a list
        works = [(readID, bins, step, aStart, aEnd, sigList, sigLenList, kmerWindow, signalWindow, device, model, weight)
                  for readID, aStart, aEnd, _strand, sigList, sigLenList in parseSigAlign(self.sigalign, self.alignment)]

        pool = multiprocessing.Pool(processes=num_processes)
        try:
            for x in range(0, len(works), load):
                # split total work by load
                works_per_load = works[x:x+load]

                predOut = self.outpath + self.prefix + str(self.region) + '_' + str(x) + '_prediction.tsv'

                # Use the pool.map() function to process reads in parallel
                outs = pool.map(self.doWork, works_per_load)

                # Write the results from current work load
                with open(predOut, 'w') as predOutFh:
                    for work, out in zip(works_per_load, outs):
                        # A read without any scored bin has no first bin to report.
                        if not out:
                            continue
                        readID = work[0]
                        bin_start = next(iter(out))
                        predOutFh.write('{readID}\t{bin_start}\t{scores}\n'.format(readID = readID, bin_start = bin_start, scores = ','.join(map(str, out.values()))))
        finally:
            # Close the pool to release resources. This happens once, after the last
            # batch: a closed pool cannot accept another map() call.
            pool.close()
            pool.join()

In [162]:
def writeBedGraph(bedGraphHeader, binScores, chrom, outfile, binCounts = '', step = 40, normalize = False):
    '''
    Write per-bin scores to a bedGraph file.

    Input:
        bedGraphHeader: dict of track-line settings. Entries with an empty (falsy)
                        value are skipped, the others are written as 'key=value '
                        on the first line.
        binScores: dict, bin start position -> score.
        chrom: str, chromosome name written in the first column.
        outfile: path of the bedGraph file to create (overwritten if it exists).
        binCounts: dict, bin start position -> divisor for that bin. Only used, and
                   then required, when normalize is True.
        step: bin width; each bin spans [start, start + step).
        normalize: if True, write binScores[start] / binCounts[start] instead of the
                   raw score.
    Output:
        outfile: header line followed by one 'chrom<TAB>start<TAB>end<TAB>score' line
                 per bin, score formatted with three decimals.
    Raises:
        ValueError: normalize is True but binCounts was not provided.
    '''
    # Validate before opening the file so a bad call does not truncate an existing output.
    if normalize and not binCounts:
        raise ValueError('normalize=True requires binCounts (bin start -> count).')

    # format() instead of '+' so that non-string values (e.g. priority=1) are accepted.
    header = ''.join('{}={} '.format(key, value) for key, value in bedGraphHeader.items() if value)

    with open(outfile, 'w') as outFh:
        outFh.write(header + '\n')

        for chrStart, binScore in binScores.items():
            value = binScore / binCounts[chrStart] if normalize else binScore
            outFh.write('{chr}\t{start}\t{end}\t{score}\n'.format(
                chr = chrom, start = chrStart, end = chrStart + step, score = "%.3f" % value))

# Track-line settings for the bedGraph header. writeBedGraph writes each entry as
# 'key=value ' on the first line of the file and skips entries whose value is empty,
# so 'visibility', 'priority' and 'yLineMark' are omitted from the output.
# NOTE(review): several values ('r' for color/altColor, 'default' for maxHeightPixels,
# 'upper' for viewLimits, 'on' for smoothingWindow) look like placeholders rather than
# values the UCSC track-line format accepts -- confirm before loading in a browser.
bedGraphHeader = {'track type':'bedGraph', 
                  'name':'test', 
                  'description':'test1read',
                  'visibility':'', 
                  'color':'r', 
                  'altColor':'r', 
                  'priority':'', 
                  'autoScale':'off', 
                  'alwaysZero':'off', 
                  'gridDefault':'off', 
                  'maxHeightPixels':'default', 
                  'graphType':'bar',
                  'viewLimits':'upper',
                  'yLineMark':'',
                  'yLineOnOff':'on',
                  'windowingFunction':'mean',
                  'smoothingWindow':'on'
                 }

# Write the scores of a single read (out[0]) as a test track.
# `out` is assigned in the single-read prediction cell further down, so that cell
# has to be executed before this call; `chrom` comes from the getAlignedReads() call.
writeBedGraph(bedGraphHeader, binScores=out[0], chrom = chrom, outfile = '../data/chrom/modPredict/231110_test.bedgraph')

In [120]:
# One runNNT argument tuple per read yielded by parseSigAlign; the strand field is
# read from each record but not forwarded.
worker_inputs = [
    (rid, bins, step, start, end, sigs, sigLens, kmerWindow, signalWindow, device, model, weight)
    for rid, start, end, _strand, sigs, sigLens in parseSigAlign(chrII_sig, chrII_alignment)
]

# Expose the first read's arguments as notebook-level names, then score that read only.
(readID, bins, step, aStart, aEnd, sigList, sigLenList,
 kmerWindow, signalWindow, device, model, weight) = worker_inputs[0]
binScores = runNNT(*worker_inputs[0])
out = [binScores]