In [1]:
import sys
import sklearn
import numpy as np
import pandas as pd

# import different classifiers from scikit-learn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn import model_selection

### Import the Molecular Biology (Promoter Gene Sequences) Data Set

In [3]:
# Location of the UCI promoter gene sequences file (fetched over the network).
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/molecular-biology/promoter-gene-sequences/promoters.data'

# The column names are supplied here, so the first line of the file is read as data.
columns = ['Class', 'id', 'Sequence']
data = pd.read_csv(url, header=None, names=columns)

# Quick look at the first rows of the raw table.
print(data.head())

  Class         id                                           Sequence
0     +        S10  \t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1     +       AMPC  \t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2     +       AROH  \t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3     +      DEOP2  \taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
4     +  LEU1_TRNA  \ttcgataattaactattgacgaaaagctgaaaaccactagaatgc...


### Preprocess the data set

In [4]:
# Pull the label column and the raw DNA sequence column out of the loaded table.
classes, sequences = data['Class'], data['Sequence']

# Preview the first four entries of each column (labels first, then sequences).
for preview in (classes, sequences):
    print(preview.head(4))

0    +
1    +
2    +
3    +
Name: Class, dtype: object
0    \t\ttactagcaatacgcttgcgttcggtggttaagtatgtataat...
1    \t\ttgctatcctgacagttgtcacgctgattggtgtcgttacaat...
2    \t\tgtactagagaactagtgcattagcttatttttttgttatcat...
3    \taattgtgatgtgtatcgaagtgtgttgcggagtagatgttagaa...
Name: Sequence, dtype: object


In [5]:
# Turn every sequence into a list of single characters with the tab padding
# removed, and put the class label of that sequence at the end of the list.
sequences = list(sequences)
dataset = {
    i: [base for base in seq if base != '\t'] + [classes[i]]
    for i, seq in enumerate(sequences)
}

# Each dict entry becomes a column of the frame, so transpose it to get
# one row per sequence.
df = pd.DataFrame(dataset).transpose()
print(df.head())

  0  1  2  3  4  5  6  7  8  9   ... 48 49 50 51 52 53 54 55 56 57
0  t  a  c  t  a  g  c  a  a  t  ...  g  c  t  t  g  t  c  g  t  +
1  t  g  c  t  a  t  c  c  t  g  ...  c  a  t  c  g  c  c  a  a  +
2  g  t  a  c  t  a  g  a  g  a  ...  c  a  c  c  c  g  g  c  g  +
3  a  a  t  t  g  t  g  a  t  g  ...  a  a  c  a  a  a  c  t  c  +
4  t  c  g  a  t  a  a  t  t  a  ...  c  c  g  t  g  g  t  a  g  +

[5 rows x 58 columns]


In [7]:
# Column 57 holds the class label that was appended after the bases; give it a readable name.
df = df.rename(columns={57: 'Class'})

In [8]:
# Count how often each symbol occurs in every column (one value_counts Series
# per column). These are raw counts, not ratios.
seq = [df[column].value_counts() for column in df.columns]

# Stack the per-column counts and transpose, so each symbol is a row and each
# original column is a column; a symbol that never occurs in a column is NaN.
results = pd.DataFrame(seq).transpose()
print(results)


      0     1     2     3     4     5     6     7     8     9  ...    48  \
t  38.0  26.0  27.0  26.0  22.0  24.0  30.0  32.0  32.0  28.0  ...  21.0   
c  27.0  22.0  21.0  30.0  19.0  18.0  21.0  20.0  22.0  22.0  ...  36.0   
a  26.0  34.0  30.0  22.0  36.0  42.0  38.0  34.0  33.0  36.0  ...  23.0   
g  15.0  24.0  28.0  28.0  29.0  22.0  17.0  20.0  19.0  20.0  ...  26.0   
-   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   
+   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   NaN   

     49    50    51    52    53    54    55    56  Class  
t  22.0  23.0  33.0  35.0  30.0  23.0  29.0  34.0    NaN  
c  42.0  31.0  32.0  21.0  32.0  29.0  29.0  17.0    NaN  
a  24.0  28.0  27.0  25.0  22.0  26.0  24.0  27.0    NaN  
g  18.0  24.0  14.0  25.0  22.0  28.0  24.0  28.0    NaN  
-   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   53.0  
+   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   53.0  

[6 rows x 58 columns]


In [9]:
# One-hot encode every column: each position becomes one indicator column per
# base, and the label becomes 'Class_+' / 'Class_-'.
# The dtype is pinned to uint8 so the indicators are numeric 0/1 on every
# pandas version (pandas 2.x would otherwise produce bool columns).
# 'Class_-' is redundant with 'Class_+', so it is dropped and 'Class_+' is kept
# as the binary target named 'Class' (1 = '+', 0 = '-').
# NOTE: this cell rebinds `df` and expects a 'Class_-' column after encoding,
# so it must be run on the un-encoded frame produced by the cells above.
df = (
    pd.get_dummies(df, dtype=np.uint8)
    .drop(columns=['Class_-'])
    .rename(columns={'Class_+': 'Class'})
)
print(df.head())

   0_a  0_c  0_g  0_t  1_a  1_c  1_g  1_t  2_a  2_c  ...  54_t  55_a  55_c  \
0    0    0    0    1    1    0    0    0    0    1  ...     0     0     0   
1    0    0    0    1    0    0    1    0    0    1  ...     0     1     0   
2    0    0    1    0    0    0    0    1    1    0  ...     0     0     1   
3    1    0    0    0    1    0    0    0    0    0  ...     0     0     0   
4    0    0    0    1    0    1    0    0    0    0  ...     1     1     0   

   55_g  55_t  56_a  56_c  56_g  56_t  Class  
0     1     0     0     0     0     1      1  
1     0     0     1     0     0     0      1  
2     0     0     0     0     1     0      1  
3     0     1     0     1     0     0      1  
4     0     0     0     0     1     0      1  

[5 rows x 229 columns]


In [10]:
# Name of the label column; every other column of the frame is a feature.
target = "Class"
columns = [c for c in df.columns.tolist() if c != target]

# Feature matrix and label vector used for training and evaluation.
x, y = df[columns], df[target]

print(x.shape)
print(y.shape)

(106, 228)
(106,)


In [11]:
# Seed shared by the split below and by the cross-validation splitter later on.
seed = 1

# Hold out 25% of the rows as a test set; the seed fixes which rows are chosen.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=seed,
)

In [12]:
# Show the dimensions of each split: train features, train labels,
# test features, test labels (in that order).
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

(79, 228)
(79,)
(27, 228)
(27,)


In [13]:
# Candidate classifiers, written as (display name, estimator) pairs so that a
# label can never get out of step with the model it describes.
# The decision tree, random forest, neural net and AdaBoost estimators use
# random numbers while fitting, so they are given the notebook-wide `seed`
# to make their scores repeatable from run to run.
models = [
    ('SVC Linear', SVC(kernel='linear')),
    ('SVC RBF', SVC(kernel='rbf')),
    ('SVC Sigmoid', SVC(kernel='sigmoid')),
    ('K-Nearest Neighbours', KNeighborsClassifier(n_neighbors=3)),
    ('Decision Tree', DecisionTreeClassifier(max_depth=5, random_state=seed)),
    ('Gaussian Process', GaussianProcessClassifier(1.0 * RBF(1.0))),
    ('Random Forest', RandomForestClassifier(max_depth=5, n_estimators=10,
                                             max_features=1, random_state=seed)),
    ('Neural Net', MLPClassifier(alpha=1, max_iter=2000, random_state=seed)),
    ('AdaBoost', AdaBoostClassifier(random_state=seed)),
    ('Naive Bayes', GaussianNB()),
]

# Separate name / estimator lists, kept for code that expects them.
modelNames = [name for name, _ in models]
classifiers = [clf for _, clf in models]

In [14]:
# Score every candidate model with 10-fold cross-validation on the training split.

scoring = 'accuracy'
results = []
modelNames = []

# The splitter shuffles with a fixed integer seed, so building it once gives
# every model exactly the same folds.
k_fold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

for name, model in models:
    val_results = model_selection.cross_val_score(
        model,
        x_train,
        y_train,
        cv=k_fold,
        scoring=scoring,
    )
    results.append(val_results)
    modelNames.append(name)
    # Report the mean fold accuracy, with the standard deviation in parentheses.
    print('{}: {} ({})'.format(name, val_results.mean(), val_results.std()))

SVC Linear: 0.9125 (0.09762812094883318)
SVC RBF: 0.875 (0.11180339887498948)
SVC Simoid: 0.925 (0.1)
K-Nearest Neighbours: 0.8107142857142857 (0.099808490089158)
Decision Tree: 0.7071428571428571 (0.1524042020953705)
Gaussian Process: 0.8553571428571429 (0.1606051216556957)
Random Forest: 0.6160714285714286 (0.18250139781404862)
Neural Net: 0.9125 (0.09762812094883318)
AdaBoost: 0.8625 (0.14197270864500683)
Naive Bayes: 0.8375 (0.1125)


In [15]:
# Fit each model on the full training split and evaluate it on the held-out test split.

for name, model in models:
    # fit() returns the fitted estimator, so the prediction can be chained.
    predictions = model.fit(x_train, y_train).predict(x_test)
    print(name)
    print(accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))

SVC Linear
0.9629629629629629
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        17
           1       0.91      1.00      0.95        10

    accuracy                           0.96        27
   macro avg       0.95      0.97      0.96        27
weighted avg       0.97      0.96      0.96        27

SVC RBF
0.9259259259259259
              precision    recall  f1-score   support

           0       1.00      0.88      0.94        17
           1       0.83      1.00      0.91        10

    accuracy                           0.93        27
   macro avg       0.92      0.94      0.92        27
weighted avg       0.94      0.93      0.93        27

SVC Simoid
0.9259259259259259
              precision    recall  f1-score   support

           0       1.00      0.88      0.94        17
           1       0.83      1.00      0.91        10

    accuracy                           0.93        27
   macro avg       0.92      0.94      0.