# Splice-junction Gene Sequences Data Set 


In [25]:

#Imports
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

### Load the dataset

In [26]:
# splice.data has no header row, so column names are assigned explicitly.
# The raw fields are padded with spaces after each comma (the first sequence
# prints with leading whitespace otherwise); skipinitialspace drops that
# padding at load time so every column holds clean values.
df = pd.read_csv("splice.data", header=None, skipinitialspace=True)
df.columns = ['classlabel', 'name', 'sequence']
df.tail()


Unnamed: 0,classlabel,name,sequence
3185,N,ORAHBPSBD-NEG-2881,TCTCTTCCCTTCCCCTCTCTCTTTCTTTCTTTT...
3186,N,ORAINVOL-NEG-2161,GAGCTCCCAGAGCAGCAAGAGGGCCAGCTGAA...
3187,N,ORARGIT-NEG-241,TCTCGGGGGCGGCCGGCGCGGCGGGGAGCG...
3188,N,TARHBB-NEG-541,ATTCTACTTAGTAAACATAATTTCTTGTG...
3189,N,TARHBD-NEG-1981,AGGCTGCCTATCAGAAGGTGGTGGCTGGTG...


In [27]:
# Encode the string class labels as integer ids.
# The fitted encoder is kept in `class_le` so predictions can be mapped
# back to the original label strings with `class_le.inverse_transform`.
class_le = LabelEncoder().fit(df['classlabel'].values)
y = class_le.transform(df['classlabel'].values)
y

array([0, 0, 0, ..., 2, 2, 2])

In [28]:
# Encoding sequence
# Each character of the DNA sequence is one-hot encoded over an 8-symbol
# alphabet (A, G, C, T, D, N, S, R), so a 60-character sequence is a 60x8
# matrix, which Seq2Vec flattens into a single 480-element vector.
# One-hot code for each of the 8 symbols that may appear in a sequence.
# Defined once at module level instead of being rebuilt on every call.
_ONE_HOT_CODES = {
    "A": (0, 0, 0, 0, 0, 0, 0, 1),
    "G": (0, 0, 0, 0, 0, 0, 1, 0),
    "C": (0, 0, 0, 0, 0, 1, 0, 0),
    "T": (0, 0, 0, 0, 1, 0, 0, 0),
    "D": (0, 0, 0, 1, 0, 0, 0, 0),
    "N": (0, 0, 1, 0, 0, 0, 0, 0),
    "S": (0, 1, 0, 0, 0, 0, 0, 0),
    "R": (1, 0, 0, 0, 0, 0, 0, 0),
}


def Seq2Vec(seq):
    """One-hot encode a sequence string into a flat float32 vector.

    Surrounding whitespace is stripped first. Every remaining character is
    replaced by its 8-element one-hot code and the codes are concatenated in
    order, so a sequence of n characters yields a vector of length 8 * n
    (an empty sequence yields an empty vector).

    Parameters
    ----------
    seq : object
        The sequence; it is converted with ``str()`` before encoding.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype float32.

    Raises
    ------
    KeyError
        If the sequence contains a character outside A, G, C, T, D, N, S, R.
        The message names the offending character and its position.
    """
    s = str(seq).strip()
    codes = []
    for pos, char in enumerate(s):
        code = _ONE_HOT_CODES.get(char)
        if code is None:
            # Still a KeyError (as a plain dict lookup would raise), but with
            # enough context to locate the bad symbol in the data file.
            raise KeyError(
                "unsupported character {!r} at position {} of sequence {!r}; "
                "expected one of {}".format(
                    char, pos, s, "".join(sorted(_ONE_HOT_CODES))))
        codes.append(code)
    return np.asarray(codes, dtype=np.float32).flatten()
   
# Encode every sequence, keep the per-row vectors on the frame, then stack
# them row-wise into the 2-D feature matrix X (one row per sequence).
df['seqvec'] = df['sequence'].map(Seq2Vec)
X = np.vstack(df['seqvec'].tolist())
# Sanity check: show the first raw sequence and the resulting matrix shape.
print(df['sequence'][0])
print(X.shape)

               CCAGCTGCATCACAGGAGGCCAGCGAGCAGGTCTGTTCCAAGGGCCTTCGAGCCAGTCTG
(3190, 480)


In [30]:
# Split the data set into training/test set

# Only one stratified 80/20 split is used downstream, so request exactly one
# instead of generating several and silently keeping whichever came last.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
train_index, test_index = next(sss.split(X, y))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)



(2552, 480) (638, 480) (2552,) (638,)
2552


In [33]:
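# DNN approach (disabled baseline, kept for reference only; it relies on
# tf.contrib.learn, which is deprecated in TensorFlow 1.x and removed in 2.x)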
#feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(X_train)
#dnn_clf = tf.contrib.learn.DNNClassifier(hidden_units=[300, 100], n_classes=3,
#                                         feature_columns=feature_columns)
#dnn_clf.fit(x=X_train, y=y_train, batch_size=50, steps=5000)

#from sklearn.metrics import accuracy_score

#y_pred = list(dnn_clf.predict(X_test))
#accuracy = accuracy_score(y_test, y_pred)
#accuracy

In [None]:
#CNN approach
# Hyper-parameters of the network.
conv1_depth = 32
conv2_depth = 64
conv1_kernel_size = [3, 3]
conv2_kernel_size = [3, 3]
n_classes = 3
learning_rate = 1e-4

# NOTE(review): `X` and `y` below rebind the names that previously held the
# full feature matrix and label vector. The training code feeds these
# placeholders by these names, so they are kept; re-running the earlier
# split cell after this one requires re-creating the data arrays first.
X = tf.placeholder(tf.float32, shape=(None, 480), name="X")
# Restore the 60x8 one-hot layout and add a single channel dimension.
# Shape: [batch_size, 60, 8, 1]
input_layer = tf.reshape(X, [-1, 60, 8, 1])
# Integer class ids, one per example (rank-1, unknown batch size).
y = tf.placeholder(tf.int32, shape=(None,), name="y")

# Convolutional Layer #1
# "same" padding preserves width and height.
# Input Tensor Shape: [batch_size, 60, 8, 1]
# Output Tensor Shape: [batch_size, 60, 8, 32]
conv1 = tf.layers.conv2d(
      inputs=input_layer,
      filters=conv1_depth,
      kernel_size=conv1_kernel_size,
      padding="same",
      activation=tf.nn.relu)

# 2x2 max pooling with stride 2 halves both spatial dimensions.
# Output Tensor Shape: [batch_size, 30, 4, 32]
pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

# Convolutional Layer #2
# Computes 64 features using a 3x3 filter.
# Padding is added to preserve width and height.
# Input Tensor Shape: [batch_size, 30, 4, 32]
# Output Tensor Shape: [batch_size, 30, 4, 64]
conv2 = tf.layers.conv2d(
      inputs=pool1,
      filters=conv2_depth,
      kernel_size=conv2_kernel_size,
      padding="same",
      activation=tf.nn.relu)

# Output Tensor Shape: [batch_size, 15, 2, 64]
pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

# Flatten everything except the batch dimension. The size is read from the
# static shape of pool2 so it stays correct if the layer settings change.
pool2_shape = pool2.get_shape().as_list()
pool2_flat = tf.reshape(
    pool2, [-1, pool2_shape[1] * pool2_shape[2] * pool2_shape[3]])

dense1 = tf.layers.dense(inputs=pool2_flat, units=64, activation=tf.nn.relu)

dense2 = tf.layers.dense(inputs=dense1, units=32, activation=tf.nn.relu)

# Unnormalised class scores; softmax is applied inside the loss.
logits = tf.layers.dense(inputs=dense2, units=n_classes)

entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)

loss = tf.reduce_mean(entropy)

# Despite its name, `optimizer` is the training op returned by minimize().
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# An example counts as correct when its true class has the highest logit.
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))


init = tf.global_variables_initializer()


def next_batch(num, data, labels):
    '''
    Return up to `num` randomly chosen samples together with their labels.

    Row indices are drawn without replacement from a random permutation of
    range(labels.shape[0]); the same indices select from both `data` and
    `labels`, so each returned sample stays aligned with its label. If `num`
    exceeds the number of labels, every row is returned once in random order.

    num:    maximum number of samples to return.
    data:   array-like of samples, indexed along its first axis.
    labels: array whose first dimension gives the number of samples.

    Returns a tuple (data_batch, labels_batch) of numpy arrays. Selection is
    done with array indexing rather than per-element Python loops, which also
    keeps the dtype and trailing shape intact for an empty batch.
    '''
    idx = np.random.permutation(labels.shape[0])[:num]
    return np.asarray(data)[idx], np.asarray(labels)[idx]


# Training schedule: each "epoch" runs enough random mini-batches to match
# the size of the training set, then reports accuracy on both splits.
n_epochs = 100
batch_size = 100
steps_per_epoch = y_train.shape[0] // batch_size

# Feed dictionaries for the per-epoch evaluation never change, so build once.
train_feed = {X: X_train, y: y_train}
test_feed = {X: X_test, y: y_test}

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for _ in range(steps_per_epoch):
            X_batch, y_batch = next_batch(batch_size, X_train, y_train)
            sess.run(optimizer, feed_dict={X: X_batch, y: y_batch})

        acc_train = sess.run(accuracy, feed_dict=train_feed)
        acc_test = sess.run(accuracy, feed_dict=test_feed)
        print(epoch, "Train accuracy:", acc_train,  "Test_accuracy:", acc_test)



0 Train accuracy: 0.518809 Test_accuracy: 0.518809
1 Train accuracy: 0.518809 Test_accuracy: 0.518809
2 Train accuracy: 0.518809 Test_accuracy: 0.518809
3 Train accuracy: 0.537618 Test_accuracy: 0.537618
4 Train accuracy: 0.65674 Test_accuracy: 0.645768
5 Train accuracy: 0.692006 Test_accuracy: 0.680251
6 Train accuracy: 0.690047 Test_accuracy: 0.670846
7 Train accuracy: 0.713166 Test_accuracy: 0.689655
8 Train accuracy: 0.762931 Test_accuracy: 0.760188
9 Train accuracy: 0.784091 Test_accuracy: 0.778997
10 Train accuracy: 0.823668 Test_accuracy: 0.816614
11 Train accuracy: 0.833072 Test_accuracy: 0.824451
12 Train accuracy: 0.851097 Test_accuracy: 0.835423
13 Train accuracy: 0.864028 Test_accuracy: 0.840125
14 Train accuracy: 0.887539 Test_accuracy: 0.863636
15 Train accuracy: 0.90047 Test_accuracy: 0.877743
16 Train accuracy: 0.903213 Test_accuracy: 0.880878
17 Train accuracy: 0.92594 Test_accuracy: 0.893417
18 Train accuracy: 0.926332 Test_accuracy: 0.899687
19 Train accuracy: 0.9404