# Model Performance Evaluation
Work for Masters Thesis

Stanley Fujimoto

In [6]:
import pickle
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import BaggingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

%matplotlib inline



In [8]:
available_features_str = "aliscore,length,num_seqs,num_gaps,num_amino_acids,range,amino_acid_charged,amino_acid_uncharged,amino_acid_special,amino_acid_hydrophobic"
available_features = available_features_str.split( ',' )

## Data Prep

In [9]:
# load data
data = pickle.load( open( "../data/featurized_data.pickle", "rb" ) )
features = data[ available_features ]
labels = data[ "class" ]

In [10]:
features.head()


Unnamed: 0,aliscore,length,num_seqs,num_gaps,num_amino_acids,range,amino_acid_charged,amino_acid_uncharged,amino_acid_special,amino_acid_hydrophobic
0,299,448,87,28032,10944,60,6.490694,5.067389,4.222178,11.843168
1,182,331,76,12020,13136,59,5.273086,5.197975,3.442357,8.736644
2,543,956,94,48462,41402,60,21.572286,17.661384,17.679899,36.44906
3,388,623,84,30734,21598,60,15.44273,11.003807,6.451316,23.008377
4,162,260,77,13453,6567,60,3.231816,3.525321,1.75732,6.703005


In [11]:
labels.head()

0    H
1    H
2    H
3    H
4    H
Name: class, dtype: object

### split the data

In [12]:
x_train, x_test, y_train, y_test = train_test_split( features, labels, test_size = 0.2 )

## Model Testing

### SVM

In [13]:
svc = SVC()
svc.fit( x_train, y_train )
svc_preds = svc.predict( x_test )
accuracy_score( svc_preds, y_test )

0.48501362397820164

In [14]:
lsvc = LinearSVC()
lsvc.fit( x_train, y_train )
lsvc_preds = lsvc.predict( x_test )
accuracy_score( lsvc_preds, y_test )

0.88773841961852862

### MLP

In [15]:
mlp = MLPClassifier()
mlp.fit( x_train, y_train )
mlp_preds = mlp.predict( x_test )
accuracy_score( mlp_preds, y_test )

0.82779291553133516

### Logistic Regression

In [16]:
lr = LogisticRegression()
lr.fit( x_train, y_train )
lr_preds = lr.predict( x_test )
accuracy_score( lr_preds, y_test )

0.96457765667574935

### Random Forest

In [17]:
rf = RandomForestClassifier()
rf.fit( x_train, y_train )
rf_preds = rf.predict( x_test )
accuracy_score( rf_preds, y_test )

0.92643051771117169

### Naive Bayes

In [18]:
nb = MultinomialNB()
nb.fit( x_train, y_train )
nb_preds = nb.predict( x_test )
accuracy_score( nb_preds, y_test )

0.69972752043596731

### Meta

In [None]:
# code for meta classifier