# Splice-junction Gene Sequences Data Set 


In [115]:

#Imports
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

### Load the dataset

In [116]:
df = pd.read_csv("splice.data", header=None)
df.columns = ['classlabel', 'name', 'sequence']
df.tail()


Unnamed: 0,classlabel,name,sequence
3185,N,ORAHBPSBD-NEG-2881,TCTCTTCCCTTCCCCTCTCTCTTTCTTTCTTTT...
3186,N,ORAINVOL-NEG-2161,GAGCTCCCAGAGCAGCAAGAGGGCCAGCTGAA...
3187,N,ORARGIT-NEG-241,TCTCGGGGGCGGCCGGCGCGGCGGGGAGCG...
3188,N,TARHBB-NEG-541,ATTCTACTTAGTAAACATAATTTCTTGTG...
3189,N,TARHBD-NEG-1981,AGGCTGCCTATCAGAAGGTGGTGGCTGGTG...


In [117]:
# Encoding class labels
class_le = LabelEncoder()
y = class_le.fit_transform(df['classlabel'].values)
y

array([0, 0, 0, ..., 2, 2, 2])

In [118]:
# Encoding sequence
# Here we use one hot encoding to encode the character in DNA sequence. 
# So each dna sequence is converted to a 60x8 2D array 
def Seq2Vec(seq):
    s = str(seq).strip()
    CharDict = { "A":[0,0,0,0,0,0,0,1],
                 "G":[0,0,0,0,0,0,1,0],
                 "C":[0,0,0,0,0,1,0,0],
                 "T":[0,0,0,0,1,0,0,0],
                 "D":[0,0,0,1,0,0,0,0],
                 "N":[0,0,1,0,0,0,0,0],
                 "S":[0,1,0,0,0,0,0,0],
                 "R":[1,0,0,0,0,0,0,0]}
    return np.asarray([CharDict[c] for c in s]).flatten()
   
df['seqvec'] = df['sequence'].apply(Seq2Vec)
X = df['seqvec'].values
print(df['sequence'][0])
print(X.shape)

               CCAGCTGCATCACAGGAGGCCAGCGAGCAGGTCTGTTCCAAGGGCCTTCGAGCCAGTCTG
(3190,)


In [119]:
# Split the data set into training/test set
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=0)
for train_index, test_index in sss.split(X, y):
    X_train, X_test = np.vstack(X[train_index]), np.vstack(X[test_index])
    y_train, y_test = np.vstack(y[train_index]), np.vstack(y[test_index])
    
print(X_train.shape)


(2552, 480)


In [120]:
import tensorflow as tf


feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(X_train)
dnn_clf = tf.contrib.learn.DNNClassifier(hidden_units=[300, 100], n_classes=3,
                                         feature_columns=feature_columns)
dnn_clf.fit(x=X_train, y=y_train, batch_size=50, steps=5000)

from sklearn.metrics import accuracy_score

y_pred = list(dnn_clf.predict(X_test))
accuracy = accuracy_score(y_test, y_pred)
accuracy

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_summary_steps': 100, '_keep_checkpoint_max': 5, '_task_type': None, '_is_chief': True, '_evaluation_master': '', '_save_checkpoints_steps': None, '_master': '', '_keep_checkpoint_every_n_hours': 10000, '_num_ps_replicas': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11c3c0710>, '_tf_random_seed': None, '_task_id': 0, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_save_checkpoints_secs': 600, '_environment': 'local'}
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and b

  equality = a == b


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/ch/l4bf90c14yjgtbj680lfrd080000gn/T/tmp6kcmb9xo/model.ckpt.
INFO:tensorflow:loss = 1.03525, step = 1
INFO:tensorflow:global_step/sec: 233.867
INFO:tensorflow:loss = 0.11482, step = 101
INFO:tensorflow:global_step/sec: 298.996
INFO:tensorflow:loss = 0.038842, step = 201
INFO:tensorflow:global_step/sec: 288.208
INFO:tensorflow:loss = 0.0166596, step = 301
INFO:tensorflow:global_step/sec: 302.995
INFO:tensorflow:loss = 0.00606049, step = 401
INFO:tensorflow:global_step/sec: 298.476
INFO:tensorflow:loss = 0.00620421, step = 501
INFO:tensorflow:global_step/sec: 295.099
INFO:tensorflow:loss = 0.00597448, step = 601
INFO:tensorflow:global_step/sec: 300.488
INFO:tensorflow:loss = 0.0058389, step = 701
INFO:tensorflow:global_step/sec: 225.242
INFO:tensorflow:loss = 0.00170447, step = 801
INFO:tensorflow:global_step/sec: 274.021
INFO:tensorflow:loss = 0.00333672, step = 901
INFO:tensorflow:glob

0.95141065830721006