<a href="https://colab.research.google.com/github/atfrank/CS-Annotate/blob/master/Structural_Annotation_of_RNA_Using_NMR_Chemical_Shifts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Structural Annotation of RNA Using NMR Chemical Shifts

# Initialization

In [6]:
%%capture
## Import Module
import pandas as pd
import numpy as np
import io
import pandas as pd
import io
import requests
import deepchem as dc
import tensorflow as tf

In [7]:
# Presumably the number of distinct chemical-shift types per residue -- TODO confirm against CSRNA.
NUMBER_CHEMICAL_SHIFT_TYPE = 19

# Column names to keep, including the residue name.
RETAIN = [
    'id', 'resid', 'resname',
    'sasa-All-atoms', 'sasa-Total-Side', 'sasa-Main-Chain', 'sasa-Non-polar', 'sasa-All', 'sasa',
    'syn_anti', 'astack', 'nastack', 'pair', 'pucker', 'class',
]

# Same columns, in the same order, with the residue name dropped.
RETAIN_NONAME = [column for column in RETAIN if column != 'resname']
from CSRNA import *

# Train Model

## Load data and preprocess

In [10]:
# Value embedded in the data file names ("..._<neighbors>.csv").
neighbors = 3


def _read_split(split, kind, neighbors):
    """Read one whitespace-delimited table named '<split>_<kind>_<neighbors>.csv'.

    Parameters
    ----------
    split : str
        File-name prefix, "train" or "test".
    kind : str
        File-name middle part, "features" or "target".
    neighbors : int
        Suffix used in the file name.

    Returns
    -------
    pandas.DataFrame
        The table, with the first line used as the header.
    """
    # NOTE(review): DIR_PATH is not defined in the visible cells; presumably
    # it is provided by `from CSRNA import *` -- confirm.
    path = DIR_PATH + split + "_" + kind + "_" + str(neighbors) + ".csv"
    # sep=r"\s+" is the non-deprecated equivalent of delim_whitespace=True.
    return pd.read_csv(path, sep=r"\s+", header=0)


# Load train and test data.
X_train = _read_split("train", "features", neighbors)
y_train = _read_split("train", "target", neighbors)
X_test = _read_split("test", "features", neighbors)
y_test = _read_split("test", "target", neighbors)
targets = y_train.columns

# Convert to DeepChem datasets. Every (sample, task) entry starts with unit
# weight; the weight matrix takes its shape from the target table, so the
# number of tasks is not hard-coded.
train_w = np.ones(y_train.shape)
train_dataset = dc.data.NumpyDataset(X_train, y_train, train_w)

test_w = np.ones(y_test.shape)
test_dataset = dc.data.NumpyDataset(X_test, y_test, test_w)

# Scale features only (transform_y=False). The scaler is built from the
# training set and then applied to both splits.
transform_scaler = dc.trans.transformers.NormalizationTransformer(transform_X = True, transform_y = False, dataset=train_dataset)
train_dataset_norm = transform_scaler.transform(train_dataset)
test_dataset_norm = transform_scaler.transform(test_dataset)

# Balance the training set by transforming its weights (transform_w=True);
# the test set is left unbalanced.
transform_balancer = dc.trans.transformers.BalancingTransformer(transform_w = True, dataset=train_dataset_norm)
train_dataset_balanced = transform_balancer.transform(train_dataset_norm)

# Model dimensions, read from the prepared training set.
n_features = train_dataset_balanced.X.shape[1]
n_tasks = train_dataset_balanced.y.shape[1]

## Fit and Save Model

In [None]:
# Keyword arguments for the classifier, gathered in one place so the
# hyperparameters can be read (and tuned) at a glance.
classifier_options = dict(
    n_tasks=n_tasks,
    n_features=n_features,
    layer_sizes=[100],
    alpha_init_stddevs=0.04,
    learning_rate=0.001,
    model_dir=DIR_PATH+'model/',
    tensorboard=True,
    use_queue=False,
)
model = dc.models.ProgressiveMultitaskClassifier(**classifier_options)

# Train on the balanced training set for 50 epochs.
model.fit(train_dataset_balanced, nb_epoch=50)

# Last expression of the cell, so its return value is displayed
# (presumably the checkpoints saved under model_dir -- confirm).
model.get_checkpoints()

# Test Model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
import matplotlib.cm as cm

# One colour per target, spread evenly over the rainbow colormap.
colors = cm.rainbow(np.linspace(0, 1, len(targets)))
# Predictions are indexed below as [sample, task, class]; index 1 on the last
# axis is presumably the positive-class probability -- confirm.
testpred = model.predict(test_dataset_norm)
lw = 3

fig, ax = plt.subplots()

# One ROC curve per target, labelled with its area under the curve.
for i, target in enumerate(targets):
    positive_scores = testpred[:, i, 1].flatten()
    fpr, tpr, thresholds = roc_curve(test_dataset.y[:, i], positive_scores)
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, color=colors[i], lw=lw, label='%s (area = %0.2f)' % (target, roc_auc))

# Dashed diagonal for reference.
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
# Legend is placed to the right of the axes so it does not cover the curves.
ax.legend(loc=(1.1, 0))
plt.show()

# Per-target class-1 scores, one "p<target>" column per task.
pred_probs = pd.DataFrame(testpred[:, :, 1], columns=["p" + name for name in targets])
# NOTE(review): `info` and `actuals` are not defined in any visible cell;
# confirm they are created before this cell runs on a fresh kernel.
predictions = pd.concat([info, actuals, pred_probs], axis=1)
predictions.to_csv('predictions.txt', sep=' ')
predictions.tail()