# Setting up an RNA Science Environment

The computational biology field has a lot of helpful software packages for interacting with RNA sequences and experimental data. First, let's install `arnie`, a helpful utility library that simplifies interacting with various secondary structure prediction packages.

### Best paper to help orient with the current landscape of RNA modeling

https://www.nature.com/articles/s41467-021-21194-4#:~:text=Accurate%20predictions%20of%20RNA%20secondary,for%20such%20highly%20parameterized%20models.

In [1]:
import torch
from torch.utils.data import random_split

import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
import re
from tqdm import tqdm

In [2]:
DATA_DIR = Path("/kaggle/input/stanford-ribonanza-rna-folding/")
TRAIN_CSV = DATA_DIR / "train_data.csv"
TRAIN_PARQUET_FILE = "train_data.parquet"
TEST_CSV = DATA_DIR / "test_sequences.csv"
TEST_PARQUET_FILE = "test_sequences.parquet"
PRED_CSV = "submission.csv"

In [3]:
def to_parquet(csv_file, parquet_file):
    dummy_df = pl.scan_csv(csv_file)

    new_schema = {}
    for key, value in dummy_df.schema.items():
        if key.startswith("reactivity"):
            new_schema[key] = pl.Float32
        else:
            new_schema[key] = value

    df = pl.scan_csv(csv_file, schema=new_schema)
    
    df.sink_parquet(
            parquet_file,
            compression='uncompressed',
            row_group_size=10,
    )
    
#to_parquet(TRAIN_CSV, TRAIN_PARQUET_FILE)
#to_parquet(TEST_CSV, TEST_PARQUET_FILE)

In [4]:
inputs_length = 457
targets_length = 177

train_df = pd.read_csv(TRAIN_CSV,nrows = 200000)

inputs = np.empty((train_df.shape[0],inputs_length))
targets = np.zeros((train_df.shape[0],inputs_length))

react_list = []
react_err_list = []

for i,k in enumerate(train_df.keys()):
    if 'reactivity' in k and 'error' not in k:
        react_list.append(i)
    elif 'reactivity_error' in k:
        react_err_list.append(i)
        
for i,row in train_df.iterrows():
    try:
        r = np.array(row[react_list].values).astype('float')

        # get sequence
        seq = row.sequence.replace('A', '1').replace('G', '2').replace('U', '3').replace('C', '4')
        seq = np.array([*seq]).astype('int')

        # store input values
        input = np.zeros(inputs_length).astype('int')
        input[:len(seq)] = seq

        # store target values
        target = np.zeros(inputs_length).astype('int')
        target[:len(r)] = np.nan_to_num(r).astype('int')
        
        inputs[i] = input
        targets[i] = target
    except ValueError as e:
        print(i,e)

In [5]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

ridge = Ridge()

In [6]:
train_x,val_x,train_y,val_y = train_test_split(inputs,targets)

In [7]:
%%time

ridge.fit(train_x,train_y)

CPU times: user 8.05 s, sys: 1.43 s, total: 9.48 s
Wall time: 3.36 s


In [8]:
test_df = pd.read_csv(TEST_CSV)

### Create submissions file. 

In [9]:
test_df.head()

Unnamed: 0,id_min,id_max,sequence_id,sequence,future
0,0,176,eee73c1836bc,GGGAACGACUCGAGUAGAGUCGAAAAUUUCCUUCCAAAUCCUGAGG...,0
1,177,353,d2a929af7a97,GGGAACGACUCGAGUAGAGUCGAAAAUGUAAUCAGAUUGCUUCUCC...,0
2,354,530,d39a4425ff45,GGGAACGACUCGAGUAGAGUCGAAAAAACACAUGAAUUUGAGGGUU...,0
3,531,707,1fc41e92d553,GGGAACGACUCGAGUAGAGUCGAAAAUCAGAGCUGGCAAAUGGAUG...,0
4,708,884,1d0826fb892f,GGGAACGACUCGAGUAGAGUCGAAAAUUUGGUAUUUGAUGCAUUAA...,0


In [10]:
N = 269796671
df_submit = pd.DataFrame(index = range(N))
df_submit.index.name = 'id'
df_submit['reactivity_DMS_MaP'] = np.zeros( N, dtype = np.float16 )
df_submit['reactivity_2A3_MaP'] = np.zeros( N, dtype = np.float16 )
print(df_submit.values.nbytes/1e6 )

1079.186684


In [11]:
test_df.shape

(1343823, 5)

In [12]:
inputs_length = 457
chunk_size = 100000

test_inputs = np.empty((test_df.shape[0],inputs_length))
        
for j in tqdm(range(0,len(test_df),100000)):
    chunk_df = test_df.iloc[j:j+chunk_size]
    for i,row in chunk_df.iterrows():
        try:
            # get sequence
            seq = row.sequence.replace('A', '1').replace('G', '2').replace('U', '3').replace('C', '4')
            seq = np.array([*seq]).astype('int')
            
            # store input values
            input = np.zeros(inputs_length).astype('int')
            input[:len(seq)] = seq

            preds = ridge.predict(input.reshape(1,-1))[0,:(row['id_max']-row['id_min']+1)]
            df_submit.loc[row['id_min']:(row['id_max']),'reactivity_DMS_MaP'] = preds
            df_submit.loc[row['id_min']:(row['id_max']),'reactivity_2A3_MaP'] = preds
        except ValueError as e:
            print(i,e)

100%|██████████| 14/14 [33:54<00:00, 145.29s/it]


In [13]:
df_submit.to_csv('submissions.csv')