# Splice-junction Gene Sequences Data Set 


In [41]:

#Imports
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

### Load the dataset

In [42]:
# The raw file has no header row, so the three column names are supplied
# directly while parsing.
df = pd.read_csv(
    "splice.data",
    header=None,
    names=['classlabel', 'name', 'sequence'],
)
df.tail()


Unnamed: 0,classlabel,name,sequence
3185,N,ORAHBPSBD-NEG-2881,TCTCTTCCCTTCCCCTCTCTCTTTCTTTCTTTT...
3186,N,ORAINVOL-NEG-2161,GAGCTCCCAGAGCAGCAAGAGGGCCAGCTGAA...
3187,N,ORARGIT-NEG-241,TCTCGGGGGCGGCCGGCGCGGCGGGGAGCG...
3188,N,TARHBB-NEG-541,ATTCTACTTAGTAAACATAATTTCTTGTG...
3189,N,TARHBD-NEG-1981,AGGCTGCCTATCAGAAGGTGGTGGCTGGTG...


In [43]:
# Turn the string class labels into integer codes.
# The encoder is fitted first and then applied to the same label column.
class_le = LabelEncoder()
class_le.fit(df['classlabel'].values)
y = class_le.transform(df['classlabel'].values)
y

array([0, 0, 0, ..., 2, 2, 2])

In [44]:
# Encoding sequence
# Here we one-hot encode each character of the DNA sequence (8 possible symbols).
# Each 60-character sequence becomes a 60x8 one-hot matrix, which is then flattened into a 480-element vector.
def Seq2Vec(seq):
    """One-hot encode a sequence string into a flat float32 vector.

    Surrounding whitespace is stripped from ``str(seq)``. Every remaining
    character must be one of ``A, G, C, T, D, N, S, R``; each is expanded to
    8 values with a single 1, and the per-character codes are concatenated.

    Returns a 1-D ``np.float32`` array of length ``8 * len(stripped sequence)``.
    Raises ``KeyError`` for any character outside the alphabet.
    """
    # The position of a symbol in this string is the index of its "hot" slot
    # (R -> 0, S -> 1, ..., G -> 6, A -> 7).
    alphabet = "RSNDTCGA"
    hot_slot = {symbol: slot for slot, symbol in enumerate(alphabet)}

    bases = str(seq).strip()
    slots = np.asarray([hot_slot[base] for base in bases], dtype=np.intp)

    # Start from all zeros and switch on exactly one slot per character.
    one_hot = np.zeros((len(bases), len(alphabet)), dtype=np.float32)
    one_hot[np.arange(len(bases)), slots] = 1.0
    return one_hot.reshape(-1)
   
# Encode every sequence, keep the per-row vectors on the frame, then stack
# them row-wise into one 2-D feature matrix.
df['seqvec'] = df['sequence'].map(Seq2Vec)
X = np.vstack(df['seqvec'].tolist())
print(df['sequence'][0])
print(X.shape)

               CCAGCTGCATCACAGGAGGCCAGCGAGCAGGTCTGTTCCAAGGGCCTTCGAGCCAGTCTG
(3190, 480)


In [45]:
# Split the data set into training/test set

# Only one stratified 80/20 split is used downstream, so request exactly one.
# The previous version generated three splits in a loop and silently kept
# just the last one, which did extra work and hid which split was in use.
# NOTE: with the same random_state this selects a different (equally valid)
# stratified split than before; the resulting shapes are unchanged.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_index, test_index = next(sss.split(X, y))

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)



(2552, 480) (638, 480) (2552,) (638,)


In [46]:
#DNN approach
#feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(X_train)
#dnn_clf = tf.contrib.learn.DNNClassifier(hidden_units=[300, 100], n_classes=3,
#                                         feature_columns=feature_columns)
#dnn_clf.fit(x=X_train, y=y_train, batch_size=50, steps=5000)

#from sklearn.metrics import accuracy_score

#y_pred = list(dnn_clf.predict(X_test))
#accuracy = accuracy_score(y_test, y_pred)
#accuracy

In [51]:
#RNN approach
from tensorflow.contrib.layers import fully_connected

# Start from a clean default graph before (re)building the model.
tf.reset_default_graph()

n_steps =  60      # time steps per example (see the reshape below)
n_inputs = 8       # features per time step
n_neurons = 150    # units in the recurrent cell
n_outputs = 3      # number of logits produced per example

# NOTE: from here on `X` and `y` refer to graph placeholders, not to the NumPy
# arrays built earlier in the notebook. Cells that need those arrays (encoding,
# splitting) must be re-run from the top after this cell has executed.
X = tf.placeholder(tf.float32, shape=(None, n_steps * n_inputs), name="X")
# Named `rnn_inputs` rather than `input` so the Python builtin is not shadowed
# for the rest of the kernel session.
rnn_inputs = tf.reshape(X, [-1, n_steps, n_inputs])
# `(None)` is just `None` (unknown rank); `(None,)` declares a 1-D vector of
# labels so mis-shaped feeds are rejected instead of accepted silently.
y = tf.placeholder(tf.int32, shape=(None,), name="y")

basic_cell = tf.contrib.rnn.BasicRNNCell(num_units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(basic_cell, rnn_inputs, dtype=tf.float32)

# Linear layer (no activation) from the RNN's returned state to the logits.
logits = fully_connected(states, n_outputs, activation_fn=None)
entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)

# Mean cross-entropy over the batch is the training objective.
loss = tf.reduce_mean(entropy)

optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss)

# A prediction counts as correct when the label is the top-1 logit.
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))


def next_batch(num, data, labels):
    '''
    Draw a random mini-batch of matching samples and labels.

    Samples are drawn without replacement using NumPy's global random state,
    so the batch holds at most `num` items (fewer when fewer are available).

    num    -- requested batch size.
    data   -- array-like of samples, indexed along its first axis.
    labels -- NumPy array of labels; `labels.shape[0]` sets how many
              samples are available.

    Returns a `(data_batch, labels_batch)` tuple of NumPy arrays whose rows
    correspond to each other.
    '''
    # One shared set of indices keeps every sample paired with its label.
    idx = np.random.permutation(labels.shape[0])[:num]

    # Vectorised selection instead of per-element Python loops; this also keeps
    # the trailing dimensions and dtype when the batch is empty.
    return np.asarray(data)[idx], np.asarray(labels)[idx]


# Op that initialises every variable in the graph.
init = tf.global_variables_initializer()

n_epochs = 2000
batch_size = 100
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        # Each iteration is one optimisation step on one random mini-batch.
        X_batch, y_batch = next_batch(batch_size, X_train, y_train)
        sess.run(optimizer, feed_dict={X: X_batch, y: y_batch})

        if epoch % 10:
            continue

        # Every 10th iteration: report accuracy on the full train and test sets.
        acc_train = sess.run(accuracy, feed_dict={X: X_train, y: y_train})
        acc_test = sess.run(accuracy, feed_dict={X: X_test, y: y_test})
        print(epoch, "Train accuracy:", acc_train,  "Test_accuracy:", acc_test)




0 Train accuracy: 0.518417 Test_accuracy: 0.517241
10 Train accuracy: 0.50431 Test_accuracy: 0.478056
20 Train accuracy: 0.559953 Test_accuracy: 0.545455
30 Train accuracy: 0.617555 Test_accuracy: 0.630094
40 Train accuracy: 0.63558 Test_accuracy: 0.617555
50 Train accuracy: 0.674373 Test_accuracy: 0.689655
60 Train accuracy: 0.716693 Test_accuracy: 0.683386
70 Train accuracy: 0.715909 Test_accuracy: 0.683386
80 Train accuracy: 0.685737 Test_accuracy: 0.675549
90 Train accuracy: 0.717085 Test_accuracy: 0.683386
100 Train accuracy: 0.771552 Test_accuracy: 0.725705
110 Train accuracy: 0.793103 Test_accuracy: 0.747649
120 Train accuracy: 0.788401 Test_accuracy: 0.76489
130 Train accuracy: 0.806035 Test_accuracy: 0.753918
140 Train accuracy: 0.812304 Test_accuracy: 0.791536
150 Train accuracy: 0.809953 Test_accuracy: 0.76489
160 Train accuracy: 0.827978 Test_accuracy: 0.816614
170 Train accuracy: 0.85815 Test_accuracy: 0.829154
180 Train accuracy: 0.868339 Test_accuracy: 0.84953
190 Train 

KeyboardInterrupt: 