<a href="https://colab.research.google.com/github/amovar18/machinelearningproject/blob/master/DNAclassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import sys
import pandas as pd
import sklearn
import numpy as np
#column labels applied to the file when it is read
names = ['Class', 'id', 'sequence']
df = pd.read_csv("/content/promoters.data", names=names)
#preview the first five rows
df.head()

In [0]:
#pull the 'Class' column out of the frame; a single DataFrame column is a pandas Series
classes = df['Class']

In [0]:
#collect the raw DNA sequence strings
sequences = df['sequence'].tolist()
#maps row number -> list of single characters followed by that row's class label
dataset = {}
for i, seq in enumerate(sequences):
  #split into individual nucleotides, discarding tab characters,
  #and put the class label in the last position
  nucleotides = [*(char for char in seq if char != '\t'), classes[i]]
  dataset[i] = nucleotides

In [0]:
#build a frame from the dict (each key becomes a column), then flip it so each key is a row
data = pd.DataFrame(dataset).T
print(data.head(4))

In [0]:
#give column 57 the name 'Class' (presumably the class label appended last to each row - confirm)
data = data.rename(columns={57: "Class"})

In [0]:
#count how often each value occurs in every column of data
series = [data[column].value_counts() for column in data.columns]
#stack the counts into a frame and flip it so each original column is a column again
info = pd.DataFrame(series).transpose()
print(info)

In [0]:
#switch to numerical data: encode the frame as indicator columns with pd.get_dummies
numerical_data = pd.get_dummies(data)
#preview the first five rows
numerical_data.head()

In [0]:
#drop the 'Class_-' indicator and keep 'Class_+' under the name 'Class'
df = (
    numerical_data
    .drop(columns=['Class_-'])
    .rename(columns={"Class_+": "Class"})
)
#preview the first five rows
df.head()

In [0]:
#import model-selection utilities, classifiers and metrics used for training and testing
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import classification_report,accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
#metric name handed to cross-validation as its scoring argument
scoring = "accuracy"

In [0]:
#create x and y datasets for training and testing
#x holds every column except 'Class'; the keyword form is used because pandas 2.0
#removed the positional axis argument of DataFrame.drop (drop(['Class'],1) raises TypeError)
x=np.array(df.drop(columns=['Class']))
#y is the 'Class' column
y=np.array(df['Class'])
#define a seed so the split below is reproducible
seed=1
#hold out 25% of the rows as the test set
x_train,x_test,y_train,y_test=model_selection.train_test_split(x,y,test_size=0.25,random_state=seed)

In [0]:
#define models to train
#names and classifier are parallel sequences: entry k of one labels entry k of the other
names=['K Nearest Neighbors','GaussianProcess','Decision tree','Random Forest','Neural Net','AdaBoost','Naive bayes','SVMRBF','SVMLinear','SVMSigmoid']
#this must be a list, not a set literal: a set has no defined iteration order,
#so zipping names with it would attach labels to arbitrary classifiers
classifier=[
    KNeighborsClassifier(n_neighbors=3),
    GaussianProcessClassifier(1.0*RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5,n_estimators=10,max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel='rbf'),
    SVC(kernel='linear'),
    SVC(kernel='sigmoid')
]
#zip silently truncates to the shorter input, so fail loudly on a mismatch
if len(names)!=len(classifier):
  raise ValueError("names and classifier must have the same length")
#materialise the (name, classifier) pairs as a list: a bare zip object is exhausted
#after one pass, while a list can be iterated here and again by later cells
models=list(zip(names,classifier))
#result of models: cross-validation scores on the training split, one array per model
results=[]
#the same 10-fold splitter is used for every model
kfold=model_selection.KFold(n_splits=10)
for name,model in models:
  cvr=model_selection.cross_val_score(model,x_train,y_train,cv=kfold,scoring=scoring)
  results.append(cvr)
  #report mean and standard deviation of the fold scores
  msg="{0}:{1},({2})".format(name,cvr.mean(),cvr.std())
  print(msg)

In [0]:

#fit every model on the training split and report its performance on the held-out test split
for label, estimator in models:
  #fit returns the fitted estimator, so prediction can be chained directly
  predictions = estimator.fit(x_train, y_train).predict(x_test)
  print(label)
  print(accuracy_score(y_test, predictions))
  print(classification_report(y_test, predictions))