In [1]:
from utility.RNAParser import RNAData
from utility.RNAQUBO import RNAQUBO
from utility.AnnealerOptimizer import Annealer
# from utility.ResultProcess import ResultParser
import time

timestamp = time.strftime("%Y%m%d-%H")

import numpy as np
import pandas as pd
import math
import os
import glob
import re

2022-09-20 09:33:58,863 dwave.cloud INFO MainThread Log level for 'dwave.cloud' namespace set to 0


# Step 1: Prepare Data

In this part, we load the folder with the raw rna data for experimentation. To evaluate the quantum solution and the actual solution include both fasta and ct files. To only generate a quantum solution, upload a fasta file.


In [2]:
# initial parameters for experiment data
s3_bucket = f"xxxx" # change to the name of bucket created in your deployment
prefix = "xxxx" # the name of the folder in the bucket

# enter folder path, for instance './rna-data'
raw_path_folder = './rna-data-test'


In [3]:
rna_data = RNAData(raw_path_folder)

data_path = rna_data.save("latest")

# print(f"You have loaded the raw rna data and saved as {data_path}. \n\
# This rna is {num_rna_length} bases long")
# #TODO inform them if it will compare or just calculate
# print()

INFO:root:finish save rna-folding_data_latest.pickle


In [4]:
rna_data.rna_files

{}

After running this block, the processed data 
will be saved as **rna-folding_bpRNA_CRW_32863_data_latest.pickle**
and **data_path** will be updated. We can see that this 
molecule has 23 bases.

# Step 2: Build Model

In this part, we build the Quadratic Unconstrained 
Binary Optimization (QUBO) model for rna folding.

First, we set the following parameters and 
initialize the RNAQUBO object. 

<center>

| Parameter | Description | Value |
|--- |--- |--- |
| PKP | pseudoknot penalty | -1.0, -0.5, 0.0, 0.5, 1.0 |
| S | penalty for short stems | 1 |
| O | penalty for overlaps | 1e6 |
| method | the method of building model | 'qc' |
| data_path | pickle file directory of rna data |  |

 </center>

In [7]:
# initial the RNAQUBO object
init_param = {}
method = ['qc']

for mt in method:
    if mt == 'qc':
        init_param[mt] = {}
        init_param[mt]['params'] = ["PKP", "S", "O"]
    
rna_qubo = RNAQUBO( data_path, method, **init_param)

INFO:root:initial qc for constructing rna QUBO
INFO:root:initial qc for constructing rna QUBO


In [8]:
# set the parameters for model
model_param = {}

method = 'qc'
model_param[method] = {}

# parameters
model_param[method]['PKP'] = [-1.0, -0.5, 0.0, 0.5, 1.0]
model_param[method]['S'] = [1]
model_param[method]['O'] = [1e6]

rna_qubo.build_models(**model_param)

0

In [9]:
# describe the model parameters
model_info = rna_qubo.describe_models()

INFO:root:method: qc
INFO:root:The model_name should be bpRNA_CRW_32863_{PKP}_{O}_{S}
INFO:root:param: PKP, value {-0.5, 0.0, 1.0, 0.5, -1.0}
INFO:root:param: S, value {1}
INFO:root:param: O, value {1000000.0}
INFO:root:method: qc
INFO:root:The model_name should be bpRNA_CRW_8500_{PKP}_{O}_{S}
INFO:root:param: PKP, value {-0.5, 0.0, 1.0, 0.5, -1.0}
INFO:root:param: S, value {1}
INFO:root:param: O, value {1000000.0}






In [11]:
# save the model
model_path = rna_qubo.save("latest")

print(f"You have built the QUBO models and saved them as rna_folding_latest.pickle")

INFO:root:finish save rna_folding_latest.pickle


You have built the QUBO models and saved them as rna_folding_latest.pickle


In [12]:
!mv rna_folding_latest.pickle rna-data/

# Step 3: Optimize Configuration

In this part, we use SA and QA to find the optimized configuration of rna folding.
At first, we load the model file using **RNAQUBO** object

In [13]:
rna_qubo_optimize = RNAQUBO.load(model_path)

FileNotFoundError: [Errno 2] No such file or directory: './rna_folding_latest.pickle'

In [9]:
model_info = rna_qubo_optimize.describe_models()

INFO:root:method: qc
INFO:root:The model_name should be bpRNA_CRW_32863_{PKP}_{O}_{S}
INFO:root:param: PKP, value {-0.5, 0.0, 1.0, 0.5, -1.0}
INFO:root:param: S, value {1}
INFO:root:param: O, value {1000000.0}
INFO:root:method: qc
INFO:root:The model_name should be bpRNA_CRW_8500_{PKP}_{O}_{S}
INFO:root:param: PKP, value {-0.5, 0.0, 1.0, 0.5, -1.0}
INFO:root:param: S, value {1}
INFO:root:param: O, value {1000000.0}






In [10]:
# get the model you want to optimize
rna_name = 'bpRNA_CRW_32863'
PKP = 0.5
O = 1e6
S = 1
method = 'qc'

model_name = "{}_{}_{}_{}".format(rna_name, PKP, O, S)

qubo_model = rna_qubo_optimize.get_model(rna_name, method, model_name)

After we get the qubo model, we need to set the parameters for optimization: 
| Parameter | Description | Value |
|--- |--- |--- |
|method | annealing method for QUBO problem |'dwave-sa': use the simulated annealer in ocean toolkit<br> 'dwave-qa': use the quantum annealer|
|shots| number of reads, refer to [dwave-sa](https://docs.ocean.dwavesys.com/projects/neal/en/latest/reference/generated/neal.sampler.SimulatedAnnealingSampler.sample.html#neal.sampler.SimulatedAnnealingSampler.sample) and [dwave-qa](https://amazon-braket-ocean-plugin-python.readthedocs.io/en/latest/_apidoc/braket.ocean_plugin.braket_sampler.html) for details |1 to 10,000|
|bucket | the s3 bucket to store your results | - |
|prefix | the name of the folder in your s3 bucket | - |
|device | the arn name to run your quantum annealing| 'arn:aws:braket:::device/qpu/d-wave/Advantage_system4' <br> 'arn:aws:braket:::device/qpu/d-wave/DW_2000Q_6'|

Then, we can run the SA for this problem:

In [11]:
method = 'neal-sa'

optimizer_param = {}
optimizer_param['shots'] = 1000

sa_optimizer = Annealer(qubo_model, method, **optimizer_param)

INFO:root:use neal simulated annealer (c++) from dimod


In [12]:
sa_optimize_result = sa_optimizer.fit()

INFO:root:fit() ...


ValueError: diag requires an array of at least two dimensions

In [2]:
import matplotlib.pyplot as plt
import forgi.visual.mplotlib as fvm
import forgi.graph.bulge_graph as fgb
import forgi 

In [11]:
def parse_pseudoknot(ctList):
    """
    ctList              -- paired-bases: [(3, 8), (4, 7)]
    
    Parse pseusoknots from clList
    Return:
        [ [(3, 8), (4, 7)], [(3, 8), (4, 7)], ... ]
    """
    ctList.sort(key=lambda x:x[0])
    ctList = [ it for it in ctList if it[0]<it[1] ]
    paired_bases = set()
    for lb,rb in ctList:
        paired_bases.add(lb)
        paired_bases.add(rb)
    
    # Collect duplex
    duplex = []
    cur_duplex = [ ctList[0] ]
    for i in range(1, len(ctList)):
        bulge_paired = False
        for li in range(ctList[i-1][0]+1, ctList[i][0]):
            if li in paired_bases:
                bulge_paired = True
                break
        if ctList[i][1]+1>ctList[i-1][1]:
            bulge_paired = True
        else:
            for ri in range(ctList[i][1]+1, ctList[i-1][1]):
                if ri in paired_bases:
                    bulge_paired = True
                    break
        if bulge_paired:
            duplex.append(cur_duplex)
            cur_duplex = [ ctList[i] ]
        else:
            cur_duplex.append(ctList[i])
    if cur_duplex:
        duplex.append(cur_duplex)
    
    # Discriminate duplex are pseudoknot
    Len = len(duplex)
    incompatible_duplex = []
    for i in range(Len):
        for j in range(i+1, Len):
            bp1 = duplex[i][0]
            bp2 = duplex[j][0]
            if bp1[0]<bp2[0]<bp1[1]<bp2[1] or bp2[0]<bp1[0]<bp2[1]<bp1[1]:
                incompatible_duplex.append((i, j))
    
    pseudo_found = []
    while incompatible_duplex:
        # count pseudo
        count = {}
        for l,r in incompatible_duplex:
            count[l] = count.get(l,0)+1
            count[r] = count.get(r,0)+1
        
        # find most possible pseudo
        count = list(count.items())
        count.sort( key=lambda x: (x[1],-len(duplex[x[0]])) )
        possible_pseudo = count[-1][0]
        pseudo_found.append(possible_pseudo)
        i = 0
        while i<len(incompatible_duplex):
            l,r = incompatible_duplex[i]
            if possible_pseudo in (l,r):
                del incompatible_duplex[i]
            else:
                i += 1
    
    pseudo_duplex = []
    for i in pseudo_found:
        pseudo_duplex.append(duplex[i])
    
    return pseudo_duplex

In [10]:
def ct2dot(ctList, length):
    """
    ctList              -- paired-bases: [(3, 8), (4, 7)]
    length              -- Length of structure
    
    Convert ctlist structure to dot-bracket
    [(3, 8), (4, 7)]  => ..((..))..
    """
    dot = ['.']*length
    if len(ctList) == 0:
        return "".join(dot)
    ctList = sorted(ctList, key=lambda x:x[0])
    ctList = [ it for it in ctList if it[0]<it[1] ]
    pseudo_duplex = parse_pseudoknot(ctList)
    for l,r in ctList:
        dot[l-1] = '('
        dot[r-1] = ')'
    dottypes = [ '<>', r'{}', '[]' ]
    if len(pseudo_duplex)>len(dottypes):
        print("Warning: too many psudoknot type: %s>%s" % (len(pseudo_duplex),len(dottypes)))
    for i,duplex in enumerate(pseudo_duplex):
        for l,r in duplex:
            dot[l-1] = dottypes[i%3][0]
            dot[r-1] = dottypes[i%3][1]
    return "".join(dot)

In [9]:
ct2dot([(3,8),(4,7)],10)

'..((..))..'

In [1]:
with open('./rna-data/bpRNA_CRW_32863.ct.txt') as file:
    lines = file.readlines()
    
with open('./rna-data/bpRNA_CRW_32863.fasta.txt') as file:
    fasta_lines = file.readlines()
    
    
rna = fasta_lines[1]
    
stems_actual = []

sip = False                       # stem in progress?
sl = 0                            # stem length
last_line = [0, 0, 0, 0, 0, 0]    # initiate last line

for i in range(0, len(lines)):
    line = lines[i].strip().split()
    print(line)
    
    if (int(line[4]) != 0 and sip == False):
        sip = True
        temp = [int(line[0]), int(line[4])]
        print(f"temp is {temp}")
        if (rna[i] == ('G' or 'g') and rna[int(line[4])-1] == ('C' or 'c')) or (rna[i] == ('C' or 'c') and rna[int(line[4])-1] == ('G' or 'g')):
            sl += 3
        if (rna[i] == ('G' or 'g') and rna[int(line[4])-1] == ('U' or 'u')) or (rna[i] == ('U' or 'u') and rna[int(line[4])-1] == ('G' or 'g')) or (rna[i] == ('A' or 'a') and rna[int(line[4])-1] == ('U' or 'u')) or (rna[i] == ('U' or 'u') and rna[int(line[4])-1] == ('A' or 'a')):
            sl += 2
    if (int(line[4]) != 0 and sip == True and (int(last_line[4])-int(line[4]) == 1)):
        if (rna[i] == ('G' or 'g') and rna[int(line[4])-1] == ('C' or 'c')) or (rna[i] == ('C' or 'c') and rna[int(line[4])-1] == ('G' or 'g')):
            sl += 3
        if (rna[i] == ('G' or 'g') and rna[int(line[4])-1] == ('U' or 'u')) or (rna[i] == ('U' or 'u') and rna[int(line[4])-1] == ('G' or 'g')) or (rna[i] == ('A' or 'a') and rna[int(line[4])-1] == ('U' or 'u')) or (rna[i] == ('U' or 'u') and rna[int(line[4])-1] == ('A' or 'a')):
            sl += 2
    if (int(line[4]) == 0 and sip == True):
        sip = False
        temp.append(sl)
        if temp[1] > temp[0]:
            stems_actual.append(temp)
        sl = 0
    if ((int(last_line[4])-int(line[4]) != 1) and int(last_line[4]) != 0  and sip == True):
        temp.append(sl)
        if temp[1] > temp[0]:
            stems_actual.append(temp)
        temp = [int(line[0]), int(line[4])]
        sl = 0
        if (rna[i] == ('G' or 'g') and rna[int(line[4])-1] == ('C' or 'c')) or (rna[i] == ('C' or 'c') and rna[int(line[4])-1] == ('G' or 'g')):
            sl = 3
        if (rna[i] == ('G' or 'g') and rna[int(line[4])-1] == ('U' or 'u')) or (rna[i] == ('U' or 'u') and rna[int(line[4])-1] == ('G' or 'g')) or (rna[i] == ('A' or 'a') and rna[int(line[4])-1] == ('U' or 'u')) or (rna[i] == ('U' or 'u') and rna[int(line[4])-1] == ('A' or 'a')):
            sl = 2

    last_line = line

['1', 'G', '0', '2', '0', '1']
['2', 'C', '1', '3', '0', '2']
['3', 'G', '2', '4', '0', '3']
['4', 'G', '3', '5', '0', '4']
['5', 'G', '4', '6', '0', '5']
['6', 'U', '5', '7', '0', '6']
['7', 'A', '6', '8', '0', '7']
['8', 'U', '7', '9', '0', '8']
['9', 'A', '8', '10', '0', '9']
['10', 'G', '9', '11', '0', '10']
['11', 'U', '10', '12', '23', '11']
temp is [11, 23]
['12', 'U', '11', '13', '22', '12']
['13', 'U', '12', '14', '21', '13']
['14', 'A', '13', '15', '0', '14']
['15', 'G', '14', '16', '0', '15']
['16', 'U', '15', '17', '0', '16']
['17', 'G', '16', '18', '0', '17']
['18', 'G', '17', '19', '0', '18']
['19', 'U', '18', '20', '0', '19']
['20', 'A', '19', '21', '0', '20']
['21', 'A', '20', '22', '13', '21']
temp is [21, 13]
['22', 'A', '21', '23', '12', '22']
['23', 'A', '22', '24', '11', '23']


In [2]:
stems_actual

[[11, 23, 6]]

In [13]:
rna

'GCGGGUAUAGUUUAGUGGUAAAA'

In [4]:
rna_len = len(lines)

In [12]:
pair_list = []
for stem in stems_actual:
    pair_list.append(tuple(stem[0:2]))

ct2dot(pair_list, rna_len)

'..........(...........)'

In [17]:
import matplotlib.pyplot as plt
import forgi.visual.mplotlib as fvm
import forgi.graph.bulge_graph as fgb
import forgi

In [18]:
bg = fgb.BulgeGraph.from_fasta_text(""">blah
... AAAACCGGGCCUUUUACCCCAAAUUGGAA
... ((((..(((..)))..))))...((..))
... """)

In [19]:
# cg = forgi.load_rna("examples/input/1y26.fx", allow_many=False)

fvm.plot_rna(bg, text_kwargs={"fontweight":"black"}, lighten=0.7,
             backbone_kwargs={"linewidth":3})
plt.show()

ModuleNotFoundError: No module named 'RNA'