In [443]:
#https://archive.ics.uci.edu/ml/datasets/Molecular+Biology+(Promoter+Gene+Sequences)

import numpy as np
import pandas as pd

In [444]:
#Data processing

# Download the UCI promoter file and read it as tab-separated text into three
# named columns.
url = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'molecular-biology/promoter-gene-sequences/promoters.data'
)
names = ['Class', 'id', 'Sequence']
data = pd.read_csv(
    url,
    sep='\t',
    names=names,
)

In [445]:
# Pull out the two columns used by the following cells.
sequences, classes = data['Sequence'], data['Class']

In [446]:
# y_prev holds the raw '+'/'-' symbols, y the numeric 0/1 labels.
classes = data['Class']
y_prev, y = [], []

In [447]:
# The label symbol is the first character of each Class entry ('+' -> 1,
# anything else -> 0).
# Both lists are rebuilt from scratch instead of appended to, so re-running
# this cell cannot duplicate labels or fail on a previously converted `y`.
y_prev = [entry[0] for entry in classes]
y = [1 if symbol == '+' else 0 for symbol in y_prev]

In [448]:
# Convert the Python list of 0/1 labels into a NumPy array.
y=np.array(y)

In [449]:
# The original cell copied data['id'] into the sequence column for 25
# hard-coded row numbers. Do the same thing data-driven: wherever 'Sequence'
# is missing, take the value from 'id'.
# NOTE(review): assumes the rows with a missing 'Sequence' are exactly the
# rows that were listed by hand — confirm with data['Sequence'].isna().
# fillna returns a new Series, so the `data` frame itself is left untouched.
sequences = data['Sequence'].fillna(data['id'])

lines = np.array(sequences)

In [450]:
# function to convert a DNA sequence string to a numpy array
# converts to lower case, changes any non 'acgt' characters to 'z'
#https://www.kaggle.com/thomasnelson/working-with-dna-sequence-data-for-ml



import numpy as np
import re
def string_to_array(my_string):
    """Return the sequence as a NumPy array of single lowercase characters.

    The input is lowercased and every character other than a, c, g or t is
    replaced by the placeholder 'z' before the string is split into characters.
    """
    cleaned = re.sub('[^acgt]', 'z', my_string.lower())
    return np.array(list(cleaned))

# label encoder fitted on the 'acgtz' alphabet ('z' stands for any other character)
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder().fit(np.array(['a','c','g','t','z']))

def ordinal_encoder(my_array):
    """Map an array of base characters to fixed float codes.

    Characters are first converted to integer labels by the module-level
    ``label_encoder`` and then mapped as: label 0 (A) -> 0.25, 1 (C) -> 0.50,
    2 (G) -> 0.75, 3 (T) -> 1.00, 4 (anything else, z) -> 0.00.
    Returns a float array with one entry per input character.
    """
    codes = label_encoder.transform(my_array)
    # Start from zeros so label 4 ('z') keeps 0.00 without an explicit write.
    encoded = np.zeros(codes.shape, dtype=float)
    for code, value in enumerate((0.25, 0.50, 0.75, 1.00)):
        encoded[codes == code] = value
    return encoded

# Encode every sequence and collect the per-sequence vectors in one array.
X = np.array([ordinal_encoder(string_to_array(seq)) for seq in lines])

In [451]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the samples for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=6,
    test_size=0.40,
)

In [452]:
#SVM

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [453]:
from sklearn.svm import SVC
# Standardise the features, then fit an SVC with gamma='auto' on the training split.
clf = make_pipeline(
    StandardScaler(),
    SVC(gamma='auto'),
)
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='auto', kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [454]:
# Fraction of held-out samples whose predicted label matches the true label.
test_predictions = clf.predict(X_test)
test_accuracy = (test_predictions == y_test).mean()

In [455]:
# SVM test accuracy as a percentage
100 * test_accuracy

69.76744186046511

In [456]:
#Convolutional Neural network

import numpy as np
# Insert a length-1 middle axis: (samples, features) -> (samples, 1, features).
# Reading the feature count from the LAST axis makes this cell safe to re-run
# on arrays that have already been reshaped.
# NOTE(review): Keras Conv1D interprets its input as (batch, steps, channels),
# so this layout is one step with all positions as channels — confirm that is
# intended rather than (samples, features, 1).
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[-1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[-1]))


import keras

NUM_CLASSES = 2
# convert class labels to one-hot encoded, shape (?, NUM_CLASSES).
# num_classes is passed explicitly so the width does not depend on which
# labels happen to be present in a given split.
y_train2 = keras.utils.to_categorical(y_train, num_classes=NUM_CLASSES)
y_test2 = keras.utils.to_categorical(y_test, num_classes=NUM_CLASSES)
# import necessary building blocks
from keras.models import Sequential
from keras.layers import Conv1D,  Flatten, Dense, Activation, Dropout,BatchNormalization,LSTM
from keras.layers.advanced_activations import LeakyReLU

In [457]:
def make_model(input_shape=(1, 57), num_classes=2):
    """
    Build the (uncompiled) 1-D convolutional classifier.

    Architecture: two stages, each made of two
    Conv1D(kernel_size=3, padding="same") -> LeakyReLU(0.1) -> BatchNormalization
    blocks (16 and 32 filters, then 32 and 64 filters) followed by
    Dropout(0.25); then Flatten, Dense(256) + LeakyReLU(0.1), Dropout(0.5) and
    a `num_classes`-unit Dense output with LeakyReLU(0.1) and softmax.

    Parameters
    ----------
    input_shape : tuple, default (1, 57)
        Per-sample input shape given to the first Conv1D layer.
    num_classes : int, default 2
        Number of units in the output layer.

    Returns `Sequential` model.
    """
    model = Sequential()

    def add_conv_block(filters, **conv_kwargs):
        # One repeated unit: convolution -> leaky ReLU -> batch normalisation.
        model.add(Conv1D(padding="same", kernel_size=3, filters=filters, **conv_kwargs))
        model.add(LeakyReLU(0.1))
        model.add(BatchNormalization())

    # Stage 1 (the first layer carries the input shape).
    add_conv_block(16, input_shape=input_shape)
    add_conv_block(32)
    model.add(Dropout(0.25))

    # Stage 2.
    add_conv_block(32)
    add_conv_block(64)
    model.add(Dropout(0.25))

    # Classifier head.
    model.add(Flatten())
    model.add(Dense(256))
    model.add(LeakyReLU(0.1))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    # NOTE(review): a LeakyReLU on the output units right before softmax is
    # unusual; kept as in the original so results are unchanged — confirm intent.
    model.add(LeakyReLU(0.1))
    model.add(Activation("softmax"))

    return model

In [458]:
# Build the network once and print its layer-by-layer summary.
model = make_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_81 (Conv1D)           (None, 1, 16)             2752      
_________________________________________________________________
leaky_re_lu_116 (LeakyReLU)  (None, 1, 16)             0         
_________________________________________________________________
batch_normalization_78 (Batc (None, 1, 16)             64        
_________________________________________________________________
conv1d_82 (Conv1D)           (None, 1, 32)             1568      
_________________________________________________________________
leaky_re_lu_117 (LeakyReLU)  (None, 1, 32)             0         
_________________________________________________________________
batch_normalization_79 (Batc (None, 1, 32)             128       
_________________________________________________________________
dropout_58 (Dropout)         (None, 1, 32)             0         
__________

In [473]:
from keras import backend as K
INIT_LR = 5e-3  # initial learning rate (decayed per epoch by lr_scheduler)
BATCH_SIZE = 32  # passed to model.fit as batch_size
EPOCHS = 100  # passed to model.fit as epochs


# don't call K.set_learning_phase() !!! (otherwise will enable dropout in train/test simultaneously)
model = make_model()  # fresh, untrained model

# prepare model for fitting (loss, optimizer, etc)
model.compile(
    loss='categorical_crossentropy',  # targets are one-hot encoded class labels
    # Adamax optimizer, referenced by its public class name rather than the
    # lowercase `adamax` alias, which not every Keras release provides.
    optimizer=keras.optimizers.Adamax(lr=INIT_LR),
    metrics=['accuracy']  # report accuracy during training
)

# scheduler of learning rate (decay with epochs)
def lr_scheduler(epoch):
    """Return the learning rate for `epoch`: INIT_LR multiplied by 0.9 per epoch."""
    decay = 0.9 ** epoch
    return INIT_LR * decay

# callback for printing of actual learning rate used by optimizer
class LrHistory(keras.callbacks.Callback):
    """Callback that prints the optimizer's current learning rate at the start of every epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        # `logs=None` avoids a mutable default argument shared between calls.
        # `self.model` is the model Keras attaches to the callback during fit,
        # so the callback no longer depends on a global named `model`.
        print("Learning rate:", K.get_value(self.model.optimizer.lr))

# fit model
model.fit(
    x=X_train,
    y=y_train2,
    validation_data=(X_test, y_test2),  # evaluated after each epoch
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    shuffle=True,
    verbose=0,  # silent; only the LrHistory callback prints
    callbacks=[
        keras.callbacks.LearningRateScheduler(lr_scheduler),
        LrHistory(),
    ],
)

Learning rate: 0.005
Learning rate: 0.0045
Learning rate: 0.00405
Learning rate: 0.003645
Learning rate: 0.0032805
Learning rate: 0.00295245
Learning rate: 0.002657205
Learning rate: 0.0023914846
Learning rate: 0.002152336
Learning rate: 0.0019371024
Learning rate: 0.0017433922
Learning rate: 0.0015690529
Learning rate: 0.0014121477
Learning rate: 0.001270933
Learning rate: 0.0011438397
Learning rate: 0.0010294557
Learning rate: 0.0009265101
Learning rate: 0.0008338591
Learning rate: 0.0007504732
Learning rate: 0.00067542586
Learning rate: 0.00060788327
Learning rate: 0.00054709497
Learning rate: 0.0004923855
Learning rate: 0.0004431469
Learning rate: 0.00039883223
Learning rate: 0.000358949
Learning rate: 0.0003230541
Learning rate: 0.0002907487
Learning rate: 0.00026167382
Learning rate: 0.00023550644
Learning rate: 0.00021195579
Learning rate: 0.00019076021
Learning rate: 0.0001716842
Learning rate: 0.00015451577
Learning rate: 0.0001390642
Learning rate: 0.00012515778
Learning rate

<keras.callbacks.History at 0x1a47f5ce50>

In [474]:
# Predicted class = index of the largest softmax output.
# `predict` returns the same probabilities as the deprecated `predict_proba`,
# which newer Keras releases no longer provide.
test_predictions = model.predict(X_test).argmax(axis=-1)

In [475]:
# Recover integer class labels from the one-hot test targets.
test_answers = np.argmax(y_test2, axis=-1)

In [476]:
# Fraction of test samples where the predicted class equals the true class.
test_accuracy = (test_predictions == test_answers).mean()

In [477]:
# CNN test accuracy as a percentage
print(100 * test_accuracy, '%')

76.74418604651163 %
