In [1]:
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import util

In [2]:
def train_and_test(model, X, Y, datasets):
    '''
    Fit `model` on the training split, then evaluate it on every requested split.

    Arguments
    - model: sklearn model
        Must provide fit, predict and predict_proba.
    - X: dict, str (dataset) -> numpy.ndarray (data). shape = (num_examples, num_features)
        Features. Example: X['train']. Must contain the key 'train'.
    - Y: dict, str (dataset) -> numpy.ndarray (data). shape = (num_examples,)
        Labels. Example: Y['train']. Must contain the key 'train'.
    - datasets: list of str
        Splits to evaluate. Examples: 'train', 'test', 'score'

    Prints, for each dataset in `datasets`
    - Confusion matrix
    - Accuracy
    - AUROC

    Returns
    - dict, str (dataset) -> dict with keys 'confusion_matrix', 'accuracy', 'auroc'
        The same values that were printed, so callers can compare models
        without parsing the output.
    '''
    # Fit model parameters. Training always uses the 'train' split, regardless
    # of which splits are listed in `datasets`.
    model.fit(X['train'], Y['train'])

    results = {}
    for dataset in datasets:
        pred = model.predict(X[dataset])
        pred_prob = model.predict_proba(X[dataset])

        c_matrix = confusion_matrix(Y[dataset], pred)
        # Column 1 of predict_proba is used as the score of the positive class,
        # i.e. this assumes a binary problem.
        auroc = sklearn.metrics.roc_auc_score(Y[dataset], pred_prob[:, 1])
        accuracy = sklearn.metrics.accuracy_score(Y[dataset], pred)

        print(dataset + ' confusion matrix')
        print(c_matrix)

        print(dataset + ' accuracy: %0.3f' % accuracy)
        print(dataset + ' auroc: %0.3f' % auroc)
        print('')

        results[dataset] = {
            'confusion_matrix': c_matrix,
            'accuracy': accuracy,
            'auroc': auroc,
        }

    return results

In [3]:
# Assay whose feature files are loaded below.
# Other assay names: 'nr-ar-lbd', 'nr-aromatase', 'nr-ar', 'nr-er-lbd', 'nr-er',
#                    'nr-ppar-gamma', 'sr-are', 'sr-atad5', 'sr-hse', 'sr-mmp', 'sr-p53'
assay_name = 'nr-ahr'
data_dir = 'data_features'
data_file_ext = 'features'

# Names of the splits; they key X, Y and the filename lookup.
datasets = ['train', 'test', 'score']
X = {}  # dataset name -> features
Y = {}  # dataset name -> labels

# One file per split, resolved by the project helper.
filenames = util.get_data_filenames(data_dir, data_file_ext, assay_name)
for dataset in datasets:
    X[dataset], Y[dataset] = util.read_features(
        filenames[dataset],
        header=0,
    )

In [4]:
# Comparing different models
# Fixed seed for the stochastic learners (decision tree, random forest, neural
# nets) so that a fresh Restart & Run All reproduces the same metrics.
RANDOM_STATE = 0

models = [
    GaussianNB(),  # Naive Bayes, Gaussian
    BernoulliNB(),  # Naive Bayes, Bernoulli
    DecisionTreeClassifier(random_state=RANDOM_STATE),  # Decision tree
    RandomForestClassifier(random_state=RANDOM_STATE),  # Random forest
    MLPClassifier(random_state=RANDOM_STATE),  # Neural network
    # Custom neural net. max_iter=4 is a very short training run, so sklearn
    # may emit a ConvergenceWarning for this model.
    MLPClassifier(
        hidden_layer_sizes=(512,),
        activation='relu',
        batch_size=100,
        alpha=0.1,
        max_iter=4,
        random_state=RANDOM_STATE,
    ),
]

# Fit each model on the training split and print its metrics on every split.
for model in models:
    print(type(model))
    train_and_test(model, X, Y, datasets)
    print('\n---\n')

<class 'sklearn.naive_bayes.GaussianNB'>
train confusion matrix
[[3314 2620]
 [  54  707]]
train accuracy: 0.601
train auroc: 0.770

test confusion matrix
[[ 74 167]
 [  2  28]]
test accuracy: 0.376
test auroc: 0.633

score confusion matrix
[[266 268]
 [  3  70]]
score accuracy: 0.554
score auroc: 0.735


---

<class 'sklearn.naive_bayes.BernoulliNB'>
train confusion matrix
[[4043 1891]
 [ 132  629]]
train accuracy: 0.698
train auroc: 0.805

test confusion matrix
[[ 99 142]
 [  4  26]]
test accuracy: 0.461
test auroc: 0.710

score confusion matrix
[[333 201]
 [ 11  62]]
score accuracy: 0.651
score auroc: 0.796


---

<class 'sklearn.tree.tree.DecisionTreeClassifier'>
train confusion matrix
[[5934    0]
 [   3  758]]
train accuracy: 1.000
train auroc: 1.000

test confusion matrix
[[200  41]
 [ 23   7]]
test accuracy: 0.764
test auroc: 0.532

score confusion matrix
[[482  52]
 [ 34  39]]
score accuracy: 0.858
score auroc: 0.724


---

<class 'sklearn.ensemble.forest.RandomForestClassifier'>



train confusion matrix
[[5791  143]
 [ 464  297]]
train accuracy: 0.909
train auroc: 0.888

test confusion matrix
[[214  27]
 [ 18  12]]
test accuracy: 0.834
test auroc: 0.791

score confusion matrix
[[500  34]
 [ 40  33]]
score accuracy: 0.878
score auroc: 0.862


---

