In [379]:
# from utility.MoleculeParser import MoleculeData
# from utility.QMUQUBO import QMUQUBO
# from utility.AnnealerOptimizer import Annealer
# from utility.ResultProcess import ResultParser
# import time

# timestamp = time.strftime("%Y%m%d-%H")

# import packages, including those to connect with AWS BraKet

import numpy as np
import pandas as pd
import math
import os
import glob

from braket.aws import AwsDevice
from braket.ocean_plugin import BraketSampler, BraketDWaveSampler

# Step 1: Prepare Data

In [380]:
# function to read in .ct file and give a list of known structure stems:

def actual_stems(seq_ss, seq_ps):
    
    with open(subdirectory+"/"+seq_ss) as file:
        lines = file.readlines()
    
    with open(subdirectory+"/"+seq_ps) as file:
        fasta_lines = file.readlines()
    
    rna = fasta_lines[1]
    
    stems_actual = []

    sip = False                       # stem in progress?
    sl = 0                            # stem length
    last_line = [0, 0, 0, 0, 0, 0]    # initiate last line

    for i in range(0, len(lines)):     
        line = lines[i].strip().split()

        if (int(line[4]) != 0 and sip == False):
            sip = True
            temp = [int(line[0]), int(line[4])]
            if (rna[i] == ('G' or 'g') and rna[int(line[4])-1] == ('C' or 'c')) or (rna[i] == ('C' or 'c') and rna[int(line[4])-1] == ('G' or 'g')):
                sl += 3
            if (rna[i] == ('G' or 'g') and rna[int(line[4])-1] == ('U' or 'u')) or (rna[i] == ('U' or 'u') and rna[int(line[4])-1] == ('G' or 'g')) or (rna[i] == ('A' or 'a') and rna[int(line[4])-1] == ('U' or 'u')) or (rna[i] == ('U' or 'u') and rna[int(line[4])-1] == ('A' or 'a')):
                sl += 2
        if (int(line[4]) != 0 and sip == True and (int(last_line[4])-int(line[4]) == 1)):
            if (rna[i] == ('G' or 'g') and rna[int(line[4])-1] == ('C' or 'c')) or (rna[i] == ('C' or 'c') and rna[int(line[4])-1] == ('G' or 'g')):
                sl += 3
            if (rna[i] == ('G' or 'g') and rna[int(line[4])-1] == ('U' or 'u')) or (rna[i] == ('U' or 'u') and rna[int(line[4])-1] == ('G' or 'g')) or (rna[i] == ('A' or 'a') and rna[int(line[4])-1] == ('U' or 'u')) or (rna[i] == ('U' or 'u') and rna[int(line[4])-1] == ('A' or 'a')):
                sl += 2
        if (int(line[4]) == 0 and sip == True):
            sip = False
            temp.append(sl)
            if temp[1] > temp[0]:
                stems_actual.append(temp)
            sl = 0
        if ((int(last_line[4])-int(line[4]) != 1) and int(last_line[4]) != 0  and sip == True):
            temp.append(sl)
            if temp[1] > temp[0]:
                stems_actual.append(temp)
            temp = [int(line[0]), int(line[4])]
            sl = 0
            if (rna[i] == ('G' or 'g') and rna[int(line[4])-1] == ('C' or 'c')) or (rna[i] == ('C' or 'c') and rna[int(line[4])-1] == ('G' or 'g')):
                sl = 3
            if (rna[i] == ('G' or 'g') and rna[int(line[4])-1] == ('U' or 'u')) or (rna[i] == ('U' or 'u') and rna[int(line[4])-1] == ('G' or 'g')) or (rna[i] == ('A' or 'a') and rna[int(line[4])-1] == ('U' or 'u')) or (rna[i] == ('U' or 'u') and rna[int(line[4])-1] == ('A' or 'a')):
                sl = 2
        
        last_line = line
        
    return stems_actual

In [381]:
# function to generate list of potential stem pairs that form pseudoknots:

def potential_pseudoknots(stems_potential, pkp):

    pseudoknots_potential = []
    pseudoknot_penalty = pkp

    for i in range(len(stems_potential)):
        for j in range(i + 1, len(stems_potential)):
            
            stem1 = stems_potential[i]
            stem2 = stems_potential[j]
    
            i_a = stem1[0]
            j_a = stem1[1]
            i_b = stem2[0]
            j_b = stem2[1]
    
            pseudoknot = [i,j,1]
    
            if (i_a < i_b and i_b < j_a and j_a < j_b) or (i_b < i_a and i_a < j_b and j_b < j_a):
        
                pseudoknot[2] = pseudoknot_penalty
    
            pseudoknots_potential.append(pseudoknot)
            
    return pseudoknots_potential

In [382]:
# function to generate list of stem pairs that overlap:

def potential_overlaps(stems_potential):
    
    overlaps_potential = []
    overlap_penalty = 1e6

    for i in range(len(stems_potential)):
        for j in range(i+1, len(stems_potential)):
    
            stem1 = stems_potential[i]
            stem2 = stems_potential[j]
    
            overlap = [i, j, 0]
    
            stem1_cspan1 = set(range(stem1[1]-int(stem1[2])+1, stem1[1]+1))
            stem2_cspan1 = set(range(stem2[1]-int(stem2[2])+1, stem2[1]+1))
            
            stem1_cspan2 = set(range(stem1[0], stem1[0]+int(stem1[2])))
            stem2_cspan2 = set(range(stem2[0], stem2[0]+int(stem2[2])))
    
            if (len(stem1_cspan1 & stem2_cspan1) != 0) or (len(stem1_cspan2 & stem2_cspan2) != 0)  or (len(stem1_cspan1 & stem2_cspan2) != 0) or (len(stem1_cspan2 & stem2_cspan1) != 0):
        
                overlap[2] = overlap_penalty
        
            overlaps_potential.append(overlap)
            
    return overlaps_potential

In [383]:
# function to read in .fasta file and generate list of potential stems at least 3 base-pairs long:

def potential_stems(seq_ps):
    
    with open(subdirectory+"/"+seq_ps) as file:
        lines = file.readlines()
    
    rna = lines[1]
    
    matrix = np.zeros((len(rna),len(rna)))

    for diag in range(0, len(matrix)):
        for row in range(0, len(matrix)-diag):
            col = row + diag
            base1 = rna[row]
            base2 = rna[col]
            if row != col:
                if ((base1 == ("A" or "a")) and (base2 == ("U" or "u"))) or ((base1 == ("U" or "u")) and (base2 == ("A" or "a"))) or ((base1 == ("G" or "g")) and (base2 == ("U" or "u"))) or ((base1 == ("U" or "u")) and (base2 == ("G" or "g"))):
                    matrix[row][col] = 2
                if ((base1 == ("G" or "g")) and (base2 == ("C" or "c"))) or ((base1 == ("C" or "c")) and (base2 == ("G" or "g"))):
                    matrix[row][col] = 3
    stems_potential = []
    mu = 0

    for row in range(0, len(matrix)):
        for col in range (row, len(matrix)):
            if row != col:
                if matrix[row][col] != 0:
                    temp_row = row
                    temp_col = col
                    stem = [row+1,col+1,0]
                    length_N = 0
                    length_H = 0
                    while (matrix[temp_row][temp_col] != 0) and (temp_row != temp_col):
                        length_N+=1
                        length_H+=matrix[temp_row][temp_col]
                        temp_row+=1
                        temp_col-=1
                        if length_N >= 3:
                            stem[2] = int(length_H)
                            stems_potential.append(stem.copy())
                    if length_H > mu:
                        mu = length_H
    
    return [stems_potential, mu, rna, len(rna)]

# Step 2: Build Model

In [384]:
# function to evaluate the energy of the known structure under the model Hamiltonian:

def energy(stems_actual, pkp):
    
    cl = 1
    cb = 1
    k = 0
    
    pseudoknots_actual = potential_pseudoknots(stems_actual, pkp)
    cost = 0
    mu = max(list(map(list, zip(*stems_actual)))[2])
    
    for i in range(0, len(stems_actual)):
        cost += cl*((stems_actual[i][2]**2)-2*mu*stems_actual[i][2]+mu**2)-cb*(stems_actual[i][2]**2)
        for j in range(i+1, len(stems_actual)):
            cost -= 2*cb*stems_actual[i][2]*stems_actual[j][2]*pseudoknots_actual[k][2]
            k += 1
    
    return cost

In [385]:
# function to generate the Hamiltonian of a given RNA structure from potential stems, overlaps, and pseudoknots:

def model(stems_potential, pseudoknots_potential, overlaps_potential, mu):
    
    L = {}
    Q = {}
    cl = 1
    cb = 1
    k = 0

    for i in range(0, len(stems_potential)):
        L[str(i)] = cl*((stems_potential[i][2]**2)-2*mu*stems_potential[i][2]+mu**2)-cb*(stems_potential[i][2]**2)
        for j in range(i+1, len(stems_potential)):
            Q[(str(i), str(j))] = -2*cb*stems_potential[i][2]*stems_potential[j][2]*pseudoknots_potential[k][2]+overlaps_potential[k][2]
            k += 1
    
    return L, Q

# Step 3: Optimize Configuration

In [386]:
pk = ["woutPKs"]
s  = ["s", "m", "l"]
p  = [-1.0, -0.5, 0.0, 0.5, 1.0]
pl = ["n1", "np5", "0", "pp5", "p1"]

data = []

In [387]:
def fileSorter(file):
    num = False
    numStart = 0
    numEnd = 0
    for i in range(len(file)):
        if file[i].isnumeric():
            if not num:
                numStart = i
                num = True
        elif num:
            numEnd = i
            num=False
    return (file[numStart:numEnd+1])

for a in range(0, len(pk)): 
    for b in range(0, len(s)):
        subdirectory = './known_structures/'+pk[a]+'/'+s[b]

        files=os.listdir(subdirectory)

        files.sort(key=fileSorter)

        ct = [f for f in files if f.endswith('.ct.txt')]
        fasta = [f for f in files if f.endswith('.fasta.txt')]

        for c in range(0, len(pl)):
            bprna_id = []
            size = []
            pks = []
            pk_penalty = []

            penalty = pl[c]

            # for i in range(0, len(ct)):
            #     bprna_id.append(ct[i].split('.')[0])
            #     size.append(subdirectory.split("/")[3])
            #     if subdirectory.split("/")[2] == "wPKs":
            #         pks.append("T")
            #     else:
            #         pks.append("F")
            #     pk_penalty.append(penalty)
            # print(pks)

            stems_a    = []
            energies_a = []
            stems_p    = []
            pks_p      = []
            ols_p      = []
            models     = []

            for index in range(0, len(ct)):
                stems_a.append(actual_stems(ct[index], fasta[index]))
                energies_a.append(energy(stems_a[index], p[c]))
                stems_p.append(potential_stems(fasta[index]))
                pks_p.append(potential_pseudoknots(stems_p[index][0], p[c]))
                ols_p.append(potential_overlaps(stems_p[index][0]))
                models.append(model(stems_p[index][0], pks_p[index], ols_p[index], stems_p[index][1]))
                                
            print(models)
            problem = []
            print(pk_penalty)

yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
yes
[({'0': -32.0, '1': 0.0, '2': -96.0, '3': 0.0, '4': -64.0, '5': -160.0, '6': -32.0, '7': -32.0, '8': 0.0, '9': -96.0, '10': 0.0, '11': -96.0, '12': 0.0, '13': 0.0, '14': -32.0, '15': 0.0, '16': 0.0, '17': -32.0, '18': -32.0, '19': -32.0, '20': -32.0, '21': 0.0, '22': -96.0, '23': 0.0, '24': -96.0, '25': -192.0, '26': -256.0, '27': 0.0, '28': 0.0, '29': -96.0, '30': -160.0, '31': -32.0, '32': 0.0, '33': 0.0, '34': -64.0, '35': 0.0, '36': 0.0, '37': 0.0, '38': 0.0, '39': -32.0, '40': -32.0, '41': 0.0, '42': 0.0}, {('0', '1'): 999856.0, ('0', '2'): 999802.0, ('0', '3'): 999856.0, ('0', '4'): 999820.0, ('0', '5'): 999766.0, ('0', '6'): 1000162.0, ('0', '7'): 1000162.0, ('0', '8'): 1000144.0, ('0', '9'): 1000198.0, ('0', '10'): 1000144.0, ('0', '11'): 1000198.0, ('0', '12'): 1000144.0, ('0', '13'): 1000144.0, ('0', '14'): 999838.0, ('0', '15'): 1000144.0, (

KeyboardInterrupt: 

# Step 4: PostProcess Result