In [1]:
# Convert the safety-window intervals into safety-window strings sliced from the reference.
def safetyWindows(data):
    """Materialise every safety-window interval as a substring of the reference.

    Reads ``data['sequences per cluster']``, ``data['reference']`` and
    ``data['safety windows intervals']``. For each sequence index from 1 up to
    (but excluding) the sequence count, every interval ``(begin, end)`` is
    replaced by the reference slice ``[begin:end - 1]``; both bounds are passed
    through ``int`` first.

    The resulting ``{sequence index: [window, ...]}`` mapping is stored under
    ``data['safety windows']`` and the same ``data`` object is returned.
    """
    sequenceCount = data['sequences per cluster']
    referenceChars = list(data['reference'])
    intervalsBySequence = data['safety windows intervals']

    def cutWindow(interval):
        # The end bound is shortened by one before slicing.
        begin = int(interval[0])
        end = int(interval[1]) - 1
        return ''.join(referenceChars[begin:end])

    windowsBySequence = {}
    for seqIndex in range(1, sequenceCount):
        sequenceIntervals = intervalsBySequence[seqIndex]
        windowsBySequence[seqIndex] = [
            cutWindow(sequenceIntervals[position])
            for position in range(len(sequenceIntervals))
        ]

    data['safety windows'] = windowsBySequence
    return data

In [2]:
def overlappingWindowsSize(data):
    """Flag sequences whose safety-window intervals overlap and measure the overlap.

    Reads ``data['safety windows intervals']`` and ``data['number of sequences']``.
    For every sequence index ``i`` from 1 to ``number of sequences - 1``:

    * ``overlap[i]`` is set to 1 when the interval bounds, flattened to
      ``[begin0, end0, begin1, end1, ...]``, are not in non-decreasing order,
      otherwise it stays 0.
    * ``overlapping length[i]`` is, for flagged sequences, the number of
      redundant position coverages: a position covered by ``c > 1`` intervals
      contributes ``c - 1``. Each interval covers ``begin .. end - 1``.

    Index 0 of both lists is never written and stays 0. The lists are stored
    under ``data['overlap']`` and ``data['overlapping length']`` and the same
    ``data`` object is returned.
    """
    safetyIntervals = data['safety windows intervals']
    nS = data['number of sequences']
    overlaps = [0] * nS
    overlappingLength = [0] * nS

    for i in range(1, nS):
        currentInterval = safetyIntervals[i]
        pairs = [
            (int(currentInterval[k][0]), int(currentInterval[k][1]))
            for k in range(len(currentInterval))
        ]
        boundaries = [bound for pair in pairs for bound in pair]

        # Any decrease in the flattened bounds means a window starts before
        # the previous one has ended (or the windows are out of order).
        if any(left > right for left, right in zip(boundaries, boundaries[1:])):
            overlaps[i] = 1

        if overlaps[i] != 0:
            coverage = [0] * max(boundaries)
            for begin, end in pairs:
                for position in range(begin, end):
                    coverage[position] += 1

            # Only coverage beyond the first window counts as overlap;
            # positions covered by no window must contribute nothing.
            overlappingLength[i] = sum(count - 1 for count in coverage if count > 1)

    data['overlap'] = overlaps
    data['overlapping length'] = overlappingLength

    return data

In [3]:
# calculate the length of the safety windows in that subcluster
# calculate the coverage of the safety windows in that subcluster
def coverageAverage(data):
    """Average safety-window length and reference coverage for every tuple.

    Reads ``data['tuples']``, ``data['safety windows']``, ``data['reference']``
    and ``data['overlapping length']``. For each sequence ID in a tuple, the
    window length is the summed length of its safety windows minus its
    overlapping length; its coverage is that length divided by the reference
    length. Both quantities are averaged over the members of the tuple.

    Results are stored as lists (one entry per tuple) under
    ``data['average length']`` and ``data['coverage average']``; an empty tuple
    keeps the value 0. The same ``data`` object is returned.
    """
    tuples = data['tuples']
    safetyWindows = data['safety windows']
    referenceLength = len(data['reference'])
    overlappingLength = data['overlapping length']

    lengthPerSubCluster = [0] * len(tuples)
    coveragePerSubCluster = [0] * len(tuples)

    for i, currentTuple in enumerate(tuples):
        memberCount = len(currentTuple)
        if memberCount == 0:
            continue

        lengthPerTuple = 0
        coveragePerTuple = 0
        for seqID in currentTuple:
            windowsLength = sum(len(window) for window in safetyWindows[seqID])
            windowsLength -= overlappingLength[seqID]

            coveragePerTuple += windowsLength / referenceLength
            lengthPerTuple += windowsLength

        # Divide by the actual tuple size so the result is a true average
        # whatever tuple size was used to build the tuples.
        coveragePerSubCluster[i] = coveragePerTuple / memberCount
        lengthPerSubCluster[i] = lengthPerTuple / memberCount

    # save new values into the data structure
    data['average length'] = lengthPerSubCluster
    data['coverage average'] = coveragePerSubCluster

    return data

In [4]:
# process tuples calculation
import networkx as nx
import itertools
KT = 3
from ipynb.fs.full.commonFunctions import *
def TupleAverage(data):
    """Build a window-similarity graph per tuple and summarise its large cliques.

    Reads ``data['tuples']``, ``data['safety windows']`` and ``data['reference']``.
    For every tuple a graph is built whose nodes are the non-empty safety
    windows of the tuple's sequences, labelled ``"<window> <sequence id>"``.
    Two windows from different tuple members are connected when their longest
    matching substring is at least half as long as the longer window.

    Maximal cliques with at least ``KT`` members are then summarised per tuple.
    The following keys are written to ``data`` (one entry per tuple):

    * ``'cliques per tuples'``: dict mapping tuple index to all maximal cliques.
    * ``'length of cliques per tuples'``: summed length of the common string of
      each large clique.
    * ``'number of cliques per tuples'``: number of large cliques.
    * ``'noise of cliques per tuples'``: per-clique average of the summed
      ``|len(window) - len(common string)| / reference length``.
    * ``'coverage of cliques per tuples'``: average common-string length
      divided by the reference length.

    All four summary values are 0 for a tuple without a large clique.
    ``longestMatchingSubstring`` and ``maxMatchingString`` are not defined in
    this notebook; presumably they come from the ``commonFunctions`` star
    import (verify).
    """
    tuples = data['tuples']
    safetyWindows = data['safety windows']
    referenceLength = len(data['reference'])

    graphLength = [0] * len(tuples)
    graphNoise = [0] * len(tuples)
    numberCliques = [0] * len(tuples)
    graphCoverage = [0] * len(tuples)
    cliquesPerTuples = {}

    for i in range(len(tuples)):
        currentTuple = tuples[i]
        currentGraph = nx.Graph()

        # Nodes: every non-empty window, tagged with the sequence it belongs to.
        for seqID in currentTuple:
            for window in safetyWindows[seqID]:
                if window != "":
                    currentGraph.add_node(window + " " + str(seqID))

        # Edges: computed once per pair of tuple members. Empty windows are
        # skipped so that the graph never gains a node the filter above rejected.
        for c1, c2 in itertools.combinations(currentTuple, 2):
            for windowC1 in safetyWindows[c1]:
                if windowC1 == "":
                    continue
                for windowC2 in safetyWindows[c2]:
                    if windowC2 == "":
                        continue
                    matchStr = longestMatchingSubstring(windowC1, windowC2)
                    maxLength = max(len(windowC1), len(windowC2))
                    if len(matchStr) >= 0.5 * maxLength:
                        currentGraph.add_edge(windowC1 + " " + str(c1),
                                              windowC2 + " " + str(c2))

        CliquesInCluster = list(nx.find_cliques(currentGraph))
        cliquesPerTuples[i] = CliquesInCluster

        currentLength = 0
        cliqueCounter = 0
        noise = 0
        for clique in CliquesInCluster:
            if len(clique) < KT:
                continue
            cliqueCounter += 1
            # Strip the " <sequence id>" suffix to recover the window text.
            readInClique = [label.split(" ")[0] for label in clique]
            maxReadInClique = maxMatchingString(readInClique)

            for read in readInClique:
                noise += abs(len(read) - len(maxReadInClique)) / referenceLength

            currentLength += len(maxReadInClique)

        if cliqueCounter != 0:
            graphCoverage[i] = (currentLength / cliqueCounter) / referenceLength
            graphLength[i] = currentLength
            numberCliques[i] = cliqueCounter
            graphNoise[i] = noise / cliqueCounter

    data['cliques per tuples'] = cliquesPerTuples
    data['length of cliques per tuples'] = graphLength  # summed common-string length of the large cliques
    data['number of cliques per tuples'] = numberCliques  # number of cliques with at least KT members
    data['noise of cliques per tuples'] = graphNoise
    data['coverage of cliques per tuples'] = graphCoverage

    return data

In [5]:
# Build the safety-window graph of the cluster and compute per-clique ratios.
def safetyGraph(data):
    """Build the cluster-wide safety-window graph and compute per-clique ratios.

    Reads ``data['sequences per cluster']`` and ``data['safety windows']``.
    Every safety window of sequences ``1 .. sequences per cluster - 1`` becomes
    a node (keyed by the window string itself). Windows of two different
    sequences are connected, with ``weight=1``, when their longest matching
    substring is longer than half of the longer window.

    For every maximal clique that contains no empty window, with ``common``
    being the result of ``maxMatchingString`` on the clique:

    * ``'ratio length'``: ``len(common)``, as a float.
    * ``'ratio coverage'``: mean of ``len(common) / len(window)`` over the clique.
    * ``'ratio noise'``: ``1 - ratio coverage``.

    Cliques containing an empty window keep 0 in all three lists. The cliques
    themselves are stored under ``'interval cliques'``. The same ``data``
    object is returned.
    ``longestMatchingSubstring`` and ``maxMatchingString`` are not defined in
    this notebook; presumably they come from the ``commonFunctions`` star
    import (verify).
    """
    nS = data['sequences per cluster']
    safetyWindows = data['safety windows']

    Graph = nx.Graph()

    # Nodes: one per distinct safety-window string.
    for s in range(1, nS):
        for window in safetyWindows[s]:
            Graph.add_node(window)

    # Edges: iterate over the windows that are actually compared, so the
    # bounds can never disagree with the lists being indexed.
    for c1, c2 in itertools.combinations(range(1, nS), 2):
        for windowC1 in safetyWindows[c1]:
            for windowC2 in safetyWindows[c2]:
                matchStr = longestMatchingSubstring(windowC1, windowC2)
                maxLength = max(len(windowC1), len(windowC2))
                if len(matchStr) > 0.5 * maxLength:
                    Graph.add_edge(windowC1, windowC2, weight=1)

    cliques = list(nx.find_cliques(Graph))

    ratioLengthPerClique = [0] * len(cliques)
    ratioCoveragePerClique = [0] * len(cliques)
    ratioNoisePerClique = [0] * len(cliques)
    for k, clique in enumerate(cliques):
        # An empty window would make the coverage ratio divide by zero.
        if "" in clique:
            continue
        maxString = maxMatchingString(clique)
        ratioCoveragePerClique[k] = sum(len(maxString) / len(element) for element in clique) / len(clique)
        ratioLengthPerClique[k] = float(len(maxString))
        ratioNoisePerClique[k] = 1 - ratioCoveragePerClique[k]

    data['interval cliques'] = cliques
    data['ratio length'] = ratioLengthPerClique
    data['ratio coverage'] = ratioCoveragePerClique
    data['ratio noise'] = ratioNoisePerClique

    return data

In [6]:
def subCliques(data):
    """Summarise the cliques that have fewer than ``KT`` members.

    Reads ``data['cliques per tuples']`` and ``data['reference']``. For every
    tuple, cliques with fewer than ``KT`` members are counted and the window
    text of their members (the part of the node label before the first space)
    is collected.

    Written to ``data``:

    * ``'cliqueNonMaxPerTuple'``: number of small cliques per tuple.
    * ``'cliqueNonMaxTotal'``: total number of small cliques.
    * ``'nonMaxAverageLengthPerTuple'``, ``'nonMaxAverageCoveragePerTuple'``,
      ``'nonMaxAverageNoisePerTuple'``: per-tuple statistics divided by the
      tuple's small-clique count (0 when the tuple has none).
    * ``'nonMaxAverageLength'``, ``'nonMaxAverageCoverage'``,
      ``'nonMaxAverageNoise'``: overall sums divided by the total small-clique
      count (0 when there is no small clique at all).

    The same ``data`` object is returned.
    """
    tupleCliques = data['cliques per tuples']
    referenceLength = len(data['reference'])
    tupleCount = len(tupleCliques)

    cliqueNonMaxPerTuple = [0] * tupleCount
    nonMaxAverageLengthPerTuple = [0] * tupleCount
    nonMaxAverageCoveragePerTuple = [0] * tupleCount
    nonMaxAverageNoisePerTuple = [0] * tupleCount

    nonMaxAverageLength = 0
    nonMaxAverageCoverage = 0
    nonMaxAverageNoise = 0

    for i in range(tupleCount):
        smallCliques = [clique for clique in tupleCliques[i] if len(clique) < KT]
        cliqueNonMaxPerTuple[i] = len(smallCliques)
        windowsInClique = [
            member.split(' ')[0] for clique in smallCliques for member in clique
        ]

        # Reset per tuple so that a tuple skipped by the size check below
        # reports zeros instead of values from an earlier tuple.
        lengthInSafetyWindows = 0
        coverageInSafetyWindows = 0
        noiseInSafetyWindows = 0

        # NOTE(review): only tuples with fewer than KT collected windows
        # contribute to the statistics -- confirm this threshold is intended.
        if len(windowsInClique) < KT:
            for windows in windowsInClique:
                lengthInSafetyWindows += len(windows) / (len(windowsInClique))
                nonMaxAverageLength += len(windows)
                coverageInSafetyWindows += len(windows) / referenceLength / (len(windowsInClique))
                nonMaxAverageCoverage += len(windows) / referenceLength
                # NOTE(review): noise is derived from the running coverage
                # total, not from the current window alone -- confirm.
                noiseInSafetyWindows += (1 - coverageInSafetyWindows) / (len(windowsInClique))
                nonMaxAverageNoise += (1 - coverageInSafetyWindows)

        if cliqueNonMaxPerTuple[i] != 0:
            nonMaxAverageLengthPerTuple[i] = (lengthInSafetyWindows / cliqueNonMaxPerTuple[i])
            nonMaxAverageCoveragePerTuple[i] = (coverageInSafetyWindows / cliqueNonMaxPerTuple[i])
            nonMaxAverageNoisePerTuple[i] = (noiseInSafetyWindows / cliqueNonMaxPerTuple[i])

    # Overall averages; with no small clique the sums are 0 and stay 0.
    cliqueNonMaxTotal = sum(cliqueNonMaxPerTuple)
    if cliqueNonMaxTotal != 0:
        nonMaxAverageLength = nonMaxAverageLength / cliqueNonMaxTotal
        nonMaxAverageCoverage = nonMaxAverageCoverage / cliqueNonMaxTotal
        nonMaxAverageNoise = nonMaxAverageNoise / cliqueNonMaxTotal

    data['cliqueNonMaxPerTuple'] = cliqueNonMaxPerTuple
    data['cliqueNonMaxTotal'] = cliqueNonMaxTotal
    data['nonMaxAverageLengthPerTuple'] = nonMaxAverageLengthPerTuple
    data['nonMaxAverageCoveragePerTuple'] = nonMaxAverageCoveragePerTuple
    data['nonMaxAverageNoisePerTuple'] = nonMaxAverageNoisePerTuple
    data['nonMaxAverageLength'] = nonMaxAverageLength
    data['nonMaxAverageCoverage'] = nonMaxAverageCoverage
    data['nonMaxAverageNoise'] = nonMaxAverageNoise

    return data

In [7]:
from Bio import SeqIO
def createTuples(data,path,subcluster,k):
    """Order the cluster's sequences by e-value and cut sliding tuples of size ``k``.

    The file ``path + '/fasta/' + subcluster + '.fasta'`` is read twice:

    1. Header lines (first token contains ``>``) are scanned up to the first
       empty line; the second ``|``-separated field of each header is recorded
       in order of appearance.
    2. The records are parsed with ``SeqIO`` to collect the set of sequences,
       the value of the ``OX=`` field found in the third ``|``-separated part
       of each description, and the second ``|``-separated field of each
       record id.

    Header positions ``1 .. n-1`` are then mapped to an e-value: the header
    identifier is looked up among the values of ``data['sequences ID']`` and
    the matching key indexes ``data['e-value full']``. Position 0 is skipped
    (presumably the reference sequence -- confirm). Positions are sorted by
    ascending e-value and every window of ``k`` consecutive positions becomes
    one tuple.

    Written to ``data``: ``'tuples'``, ``'protein sequence'``,
    ``'taxonomy ID'`` and ``'indicator ID'``. The same ``data`` object is
    returned.

    Raises ``ValueError`` when a header identifier is not among the values of
    ``data['sequences ID']``.
    """
    file = path+'/fasta/'+subcluster+'.fasta'

    # Header identifiers, keyed by their order of appearance in the file.
    sequenceInFast = {}
    with open(file) as fp:
        for line in fp:
            if line == '\n':
                break
            linevector = line.split()
            # A whitespace-only line has no tokens and cannot be a header.
            if linevector and ">" in linevector[0]:
                sequenceInFast[len(sequenceInFast)] = linevector[0].split('|')[1]

    proteinSequence = set()
    taxonomyID = list()
    indicatorID = list()

    # The handle is closed as soon as all records have been consumed.
    with open(file) as handle:
        for fasta in SeqIO.parse(handle, 'fasta'):
            name = fasta.id
            sequence = str(fasta.seq)
            trials = [piece for piece in fasta.description.split("|")[2].split(" ") if "OX=" in piece]
            taxonomyID.append(trials[0].split("=")[1])
            indicatorID.append(name.split("|")[1])
            proteinSequence.add(sequence)

    sequences = data['sequences ID']

    # choose criteria to experiment with
    escore = data['e-value full']

    # Built once: the reverse lookup below needs positional key/value lists.
    sequenceKeys = list(sequences.keys())
    sequenceValues = list(sequences.values())

    escoreByFasta = {}
    for i in range(1, len(sequenceInFast)):
        index = sequenceKeys[sequenceValues.index(sequenceInFast[i])]
        escoreByFasta[i] = escore[index]

    # order the sequences in the cluster based on the criteria chosen
    arr = sorted(escoreByFasta, key=escoreByFasta.get)
    results = [arr[i:i+k] for i in range(len(arr)-k+1)]

    data['tuples'] = results
    data['protein sequence'] = proteinSequence
    data['taxonomy ID'] = taxonomyID
    data['indicator ID'] = indicatorID

    return data