In [1]:
import matplotlib.pyplot as plt 
import numpy as np 
import pandas as pd
from sklearn import decomposition
from sklearn import metrics
from sklearn import preprocessing
import os
import random
import dpp
%matplotlib inline

In [2]:
# METADATA
# Selects which dataset the loading cells below prepare:
#   0 -> ./data/d1 (ALL vs. AML labels, separate train / independent CSV files)
#   1 -> ./data/d2 (BRCA / KIRC / PRAD / LUAD / COAD labels, random 80/20 split)
DATASET = 1


In [3]:
def whiten_data(data):
    """
    Standardize `data` column by column.

    Delegates to sklearn's `preprocessing.scale` with its default settings,
    so each column of the result has zero mean and unit variance. Despite
    the function name, no decorrelation between columns is performed here.
    """
    standardized = preprocessing.scale(data)
    return standardized

In [4]:
if DATASET == 0:
    # ALL/AML dataset: the train and independent (test) sets ship as separate CSVs.
    path = os.getcwd()
    trainfile = path + '/data/d1/data_set_ALL_AML_train.csv'
    testfile = path + '/data/d1/data_set_ALL_AML_independent.csv'
    labelfile = path + '/data/d1/actual.csv'

    train = pd.read_csv(trainfile)
    test = pd.read_csv(testfile)

    # Discard every column whose header contains "call".
    train_keepers = [name for name in train.columns if "call" not in name]
    test_keepers = [name for name in test.columns if "call" not in name]
    train = train[train_keepers]
    test = test[test_keepers]

    # Keep the textual gene annotations around before dropping them from the numeric data.
    gene_description = train['Gene Description']
    gene_accession_number = train['Gene Accession Number']
    text_columns = ['Gene Description', 'Gene Accession Number']

    # Transpose so that each row is one patient, then standardize each column.
    # NOTE(review): train and test are standardized independently of each other — confirm intended.
    train = whiten_data(train.drop(text_columns, axis=1).transpose())
    test = whiten_data(test.drop(text_columns, axis=1).transpose())

    # Encode the second column of actual.csv as 0 for 'ALL' and 1 for anything else.
    patient_cancer = np.array(pd.read_csv(labelfile))
    patient_labels = [0 if row[1] == 'ALL' else 1 for row in patient_cancer]

    # Labels are split purely by position: the first len(train) rows go to train.
    # NOTE(review): assumes actual.csv lists patients in the same order as the
    # train rows followed by the test rows — verify against the CSV column order.
    n_train = len(train)
    train_labels = patient_labels[:n_train]
    test_labels = patient_labels[n_train:]
    

    
    

In [5]:
if DATASET == 1:
    # Five-class dataset: one data CSV plus one label CSV, split 80/20 at random below.
    path = os.getcwd()
    all_data_path = path + '/data/d2/data.csv'
    all_labels_path = path + '/data/d2/labels.csv'

    # load csv
    all_data = pd.read_csv(all_data_path)
    all_labels = pd.read_csv(all_labels_path)

    # Drop the first column of all_data.
    # `.values` replaces DataFrame.as_matrix(), which newer pandas no longer provides.
    all_data = all_data.values[:, 1:]

    # Keep every 5th column, but only for complete groups of five.
    # `//` keeps the count an integer on Python 3 as well (range() rejects floats).
    n_sampled = all_data.shape[1] // 5
    all_data = all_data[:, [5 * x for x in range(n_sampled)]]

    # Drop the first column of the labels and map each class name to an integer id.
    labels_dict = {'BRCA':0, 'KIRC':1, 'PRAD':2, 'LUAD':3, 'COAD':4}
    all_labels = all_labels.values[:, 1]
    all_labels_num = np.array([labels_dict[label] for label in all_labels])

    # Standardize every column.
    # NOTE(review): this happens before the split, so test rows contribute to the
    # column statistics applied to the training rows — confirm this is acceptable.
    all_data = whiten_data(all_data)

    # Randomize indices and split into train (first 80%) and test (remainder).
    # NOTE(review): the shuffle is unseeded, so the split differs on every run.
    indices = list(range(len(all_labels_num)))
    random.shuffle(indices)
    NUM_TRAIN = int(0.8*len(indices))
    train = all_data[indices[:NUM_TRAIN],:]
    train_labels = all_labels_num[indices[:NUM_TRAIN]]
    test = all_data[indices[NUM_TRAIN:],:]
    test_labels = all_labels_num[indices[NUM_TRAIN:]]
    print(test.shape)
    
    

(161, 4106)




In [6]:
# Distribution of class labels: one count per encoded class id, printed in id order (0 to 4).
# Only meaningful after the DATASET == 1 cell has defined all_labels_num.
for class_id in range(5):
    print(len([a for a in all_labels_num if a == class_id]))


300
146
136
141
78


### Naive Bayes Classifier

In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
def _naive_bayes(ftrain, ftest):
    """
    Fit a Gaussian Naive Bayes model and return its test accuracy.

    ftrain -- training feature matrix, fitted against the notebook-level `train_labels`
    ftest  -- test feature matrix, scored against the notebook-level `test_labels`

    Returns the value of `accuracy_score` for the predictions on `ftest`.
    """
    predictions = GaussianNB().fit(ftrain, train_labels).predict(ftest)
    return accuracy_score(test_labels, predictions)

### SVM Classifier

In [8]:
from sklearn import svm
# Baseline: train an SVC on the entire feature set (no selection) and
# print the per-class precision / recall / f1 report for the test split.
clf = svm.SVC().fit(train, train_labels)
predicted = clf.predict(test)
baseline_report = metrics.classification_report(test_labels, predicted)
print(baseline_report)

def _svm(ftrain, ftest):
    """
    Fit a default-parameter SVC and return its test accuracy.

    ftrain -- training feature matrix, fitted against the notebook-level `train_labels`
    ftest  -- test feature matrix, scored against the notebook-level `test_labels`
    """
    model = svm.SVC().fit(ftrain, train_labels)
    return accuracy_score(test_labels, model.predict(ftest))

             precision    recall  f1-score   support

          0       0.95      1.00      0.97        54
          1       1.00      0.94      0.97        35
          2       1.00      0.97      0.99        36
          3       1.00      1.00      1.00        22
          4       1.00      1.00      1.00        14

avg / total       0.98      0.98      0.98       161



### KNN Classifier

In [9]:
from sklearn.neighbors import KNeighborsClassifier

def _knn(k,ftrain, ftest):
    """
    Fit a k-nearest-neighbours classifier and return its test accuracy.

    k      -- number of neighbours passed to KNeighborsClassifier
    ftrain -- training feature matrix, fitted against the notebook-level `train_labels`
    ftest  -- test feature matrix, scored against the notebook-level `test_labels`
    """
    knn_model = KNeighborsClassifier(n_neighbors=k).fit(ftrain, train_labels)
    return accuracy_score(test_labels, knn_model.predict(ftest))

### Nearest PSD Matrix

In [10]:
def nearPSD(A,epsilon=0):
    """
    Return a positive semi-definite approximation of the square matrix A.

    The eigenvalues of A are clipped from below at `epsilon`, the matrix is
    rebuilt from the clipped spectrum, and rows/columns are then rescaled so
    that every diagonal entry of the result equals one.

    A       -- square, symmetric matrix (array-like). Any asymmetry is averaged
               away first; for an exactly symmetric input this changes nothing.
    epsilon -- lower bound applied to the eigenvalues (default 0).

    Returns an (n, n) np.matrix, the same container type this function has
    always returned, so existing callers keep working.

    If the rebuilt matrix has a zero on its diagonal (for example an all-zero
    row/column in A with epsilon=0) the rescaling divides by zero and the
    result contains inf/nan.
    """
    sym = np.asarray(A, dtype=float)
    sym = (sym + sym.T) / 2.0

    # eigh is the solver for symmetric input: it always yields real eigenvalues
    # and orthonormal eigenvectors. The general-purpose eig can return complex
    # pairs from rounding noise, especially for rank-deficient matrices.
    eigval, eigvec = np.linalg.eigh(sym)
    clipped = np.maximum(eigval, epsilon)

    # Diagonal of eigvec * diag(clipped) * eigvec^T, computed without forming it.
    rebuilt_diag = (eigvec * eigvec).dot(clipped)
    row_scale = np.sqrt(1.0 / rebuilt_diag)

    # B = diag(row_scale) * eigvec * diag(sqrt(clipped)); B * B^T is PSD by construction.
    B = (row_scale[:, np.newaxis] * eigvec) * np.sqrt(clipped)
    return np.matrix(B.dot(B.T))

# PCA Analysis

In [None]:
from sklearn.decomposition import PCA
# Number of principal components used for the 2-D visualization below.
NUM_COMPONENTS = 2
def _pca(data, num_comp):
    """
    Project `data` onto its own first `num_comp` principal components.

    The PCA model is fitted on `data` itself and then applied to that same
    `data`; the fitted model is not returned.
    """
    projector = PCA(n_components=num_comp)
    return projector.fit(data).transform(data)

In [None]:
# Fit the projection on the training rows only and reuse it for the test rows,
# so both sets are expressed in the same principal-component basis.
pca_2d = PCA(n_components=NUM_COMPONENTS).fit(train)
train_pca = pca_2d.transform(train)
test_pca = pca_2d.transform(test)

In [None]:
# Scatter the first two principal components of the training set:
# class 0 in red, every other class in blue. Two vectorized calls replace
# one scatter() call per sample.
# NOTE(review): the 'ALL' / 'AML' labels match DATASET == 0 only; with
# DATASET == 1 the blue group contains four different classes.
is_class_zero = np.asarray(train_labels) == 0
plt.scatter(train_pca[is_class_zero, 0], train_pca[is_class_zero, 1], color='r', label='ALL')
plt.scatter(train_pca[~is_class_zero, 0], train_pca[~is_class_zero, 1], color = 'b', label='AML')

plt.show()

## Naive Bayes

In [None]:
# Naive Bayes accuracy as a function of the number of principal components.
max_components = 300
accuracies = []
for nc in range(50, max_components + 1, 50):
    print(" ")
    print(nc)

    # Fit PCA on the training rows only, then project both sets with that one
    # model. Fitting a second PCA on the test rows would place them in a
    # different basis from the one the classifier was trained in (and cannot
    # produce more components than there are test rows).
    pca_model = PCA(n_components=nc).fit(train)
    nc_pca_train = pca_model.transform(train)
    print(nc_pca_train.shape)

    nc_pca_test = pca_model.transform(test)
    print(nc_pca_test.shape)
    accuracies.append(_naive_bayes(nc_pca_train, nc_pca_test))
pca_nb_accuracies = accuracies
    

## SVM

In [None]:
# SVM accuracy as a function of the number of principal components.
max_components = 300
accuracies = []
for nc in range(50, max_components + 1, 50):
    # One PCA model per component count, fitted on train and applied to both
    # sets so that train and test share the same basis.
    pca_model = PCA(n_components=nc).fit(train)
    nc_pca_train = pca_model.transform(train)
    nc_pca_test = pca_model.transform(test)
    accuracies.append(_svm(nc_pca_train, nc_pca_test))

pca_svm_accuracies = accuracies

print(pca_svm_accuracies)

## KNN with k = 10

In [None]:
# KNN (k = 10) accuracy as a function of the number of principal components.
max_components = 300
accuracies = []
k=10
for nc in range(50, max_components + 1, 50):
    # One PCA model per component count, fitted on train and applied to both
    # sets so that train and test share the same basis.
    pca_model = PCA(n_components=nc).fit(train)
    nc_pca_train = pca_model.transform(train)
    nc_pca_test = pca_model.transform(test)
    accuracies.append(_knn(k,nc_pca_train, nc_pca_test))

pca_knn_accuracies = accuracies


# F Score Feature Selection

In [11]:
import fscore_select_k_2 as fsk

In [12]:
# Build the selector input: the training features with the class label
# appended as one extra, final column.
k = 350  # number of top-ranked genes requested from the selector
h, w = train.shape
fsk_mat = np.zeros((h, w + 1))
fsk_mat[:, -1] = train_labels
fsk_mat[:, :-1] = train

In [13]:
# Rank genes with the project's F-score selector, built on fsk_mat
# (training features plus the label in the last column) and asked for k entries.
# NOTE(review): later cells index train/test columns with slices of this result,
# so it is presumably a list of column indices ordered by F-score — confirm
# against fscore_select_k_2.
fsk_op = fsk.GeneSelection2(fsk_mat)
ordered_fscore_columns = fsk_op.select_k(k) # original note: sorted list of f scores with the column they correspond to

0


  f = msb / msw


50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100


## Naive Bayes

In [14]:
# Naive Bayes accuracy versus the number of top F-score features kept
# (50, 100, ..., num_components).
num_components = 300
fs_nb_accuracies = [
    _naive_bayes(train[:, ordered_fscore_columns[:n_feat]],
                 test[:, ordered_fscore_columns[:n_feat]])
    for n_feat in range(50, num_components + 1, 50)
]
#plt.plot([i+1 for i in range(num_components)], fs_nb_accuracies)
#plt.show()

## SVM

In [15]:
# SVM accuracy versus the number of top F-score features kept
# (50, 100, ..., num_components).
num_components = 300
fs_svm_accuracies = [
    _svm(train[:, ordered_fscore_columns[:n_feat]],
         test[:, ordered_fscore_columns[:n_feat]])
    for n_feat in range(50, num_components + 1, 50)
]
#plt.plot([i+1 for i in range(num_components)], fs_svm_accuracies)
#plt.show()

## KNN with k = 10

In [16]:
# KNN accuracy versus the number of top F-score features kept
# (50, 100, ..., num_components), with a fixed neighbourhood size.
num_components = 300
k = 10
fs_knn_accuracies = [
    _knn(k,
         train[:, ordered_fscore_columns[:n_feat]],
         test[:, ordered_fscore_columns[:n_feat]])
    for n_feat in range(50, num_components + 1, 50)
]

# DPP Feature Selection

In [17]:
import sys
import os

# Make the ./dpp directory (relative to the working directory) importable,
# then bind `dpp` to the `dpp` name exported by that package.
dpp_dir = os.getcwd()+'/dpp'
print(os.getcwd())
# Only append once, so re-running this cell does not keep growing sys.path.
if dpp_dir not in sys.path:
    sys.path.append(dpp_dir)
print(sys.path)
from dpp import dpp as dpp


/home/jacob/Downloads/genome_correlation-master/code
['', '/home/jacob/anaconda2/lib/python27.zip', '/home/jacob/anaconda2/lib/python2.7', '/home/jacob/anaconda2/lib/python2.7/plat-linux2', '/home/jacob/anaconda2/lib/python2.7/lib-tk', '/home/jacob/anaconda2/lib/python2.7/lib-old', '/home/jacob/anaconda2/lib/python2.7/lib-dynload', '/home/jacob/anaconda2/lib/python2.7/site-packages', '/home/jacob/anaconda2/lib/python2.7/site-packages/IPython/extensions', '/home/jacob/.ipython', '/home/jacob/Downloads/genome_correlation-master/code/dpp']


In [18]:
# Inputs for DPP sampling.
# Earlier candidate values for sigmas: [0.0001, 0.1, 1, 2, 10]
sigmas = [1]
k = 1

# Kernel: element-wise absolute feature covariance of the training set,
# passed through nearPSD with eigenvalues clipped at 0.
train_cov = nearPSD(np.abs(np.cov(train.T)), 0)

# Candidate items for the sampler: every column index of the training matrix.
gene_indices = list(range(train.shape[1]))

  


## Naive Bayes

In [19]:
# Naive Bayes accuracy on DPP-sampled feature subsets of size 50, 100, ..., num_components.
dpp_nb_accuracies = []
num_components = 300
for nc in range(50, num_components + 1, 50):
    # progress marker; print(...) with one argument behaves the same on Python 2 and 3
    print(nc)
    samples = []
    # average over 5 simulations, since every DPP draw selects a different subset
    for simulation in range(5):
        selected_features = dpp.sample_k(gene_indices, train_cov, nc, max_nb_iterations=500)
        dpp_train = train[:, selected_features]
        dpp_test = test[:, selected_features]
        samples.append(_naive_bayes(dpp_train, dpp_test))

    dpp_nb_accuracies.append(np.float64(sum(samples)) / np.float64(len(samples)))

#plt.plot([i+1 for i in range(num_components)], dpp_nb_accuracies)
#plt.show()


50
100
150
200
250
300


## SVM

In [20]:
# SVM accuracy on DPP-sampled feature subsets of size 50, 100, ..., num_components.
num_components = 300
dpp_svm_accuracies = []
for nc in range(50, num_components + 1, 50):
    # Five draws per subset size; each draw is sampled lazily and scored
    # before the next one is taken.
    draws = (
        dpp.sample_k(gene_indices, train_cov, nc, max_nb_iterations=500)
        for _ in range(5)
    )
    samples = [_svm(train[:, cols], test[:, cols]) for cols in draws]

    # record the mean accuracy over the draws
    total = np.float64(sum(samples))
    dpp_svm_accuracies.append(total / np.float64(len(samples)))
    
#plt.plot([i+1 for i in range(num_components)], dpp_svm_accuracies)
#plt.show()

## KNN with k = 10

In [21]:
# KNN accuracy on DPP-sampled feature subsets of size 50, 100, ..., num_components.
k = 10
num_components = 300
dpp_knn_accuracies = []
for nc in range(50, num_components + 1, 50):
    samples = []
    # five independent draws per subset size
    for _ in range(5):
        selected_features = dpp.sample_k(gene_indices, train_cov, nc, max_nb_iterations=500)
        dpp_train, dpp_test = train[:, selected_features], test[:, selected_features]
        samples.append(_knn(k, dpp_train, dpp_test))

    # record the mean accuracy over the draws
    mean_accuracy = np.float64(sum(samples)) / np.float64(len(samples))
    dpp_knn_accuracies.append(mean_accuracy)

# DPP + FS 50-50

In [22]:
# Hybrid selection: half of the k genes come from the F-score ranking, the rest from a DPP sample.
def fs_dpp_select_k(k, train, train_cov, fsk_mat):
    """
    Select genes by combining F-score ranking with DPP sampling.

    k         -- total number of genes wanted
    train     -- training matrix; only its column count is used here
    train_cov -- kernel matrix handed to dpp.sample_k
    fsk_mat   -- input matrix for fsk.GeneSelection2

    Starts from the int(k/2) genes returned by the F-score selector, then adds
    genes from a DPP sample of size k, in sample order, stopping once k distinct
    genes have been collected. Returns them as a list built from a set, so the
    order of the returned genes is not meaningful.
    """
    chosen = set(fsk.GeneSelection2(fsk_mat).select_k(int(k/2)))

    all_columns = list(range(train.shape[1]))
    dpp_candidates = dpp.sample_k(all_columns, train_cov, k, max_nb_iterations=500)

    for gene in dpp_candidates:
        # adding to a set is a no-op for genes the F-score half already contains
        chosen.add(gene)
        if len(chosen) == k:
            break

    return list(chosen)

## Naive Bayes

In [23]:
# Naive Bayes accuracy on the hybrid (F-score + DPP) feature sets
# of size 50, 100, ..., num_components.
num_components = 300
dpp_fs_nb_accuracies = []
for nc in range(50, num_components + 1, 50):
    samples = []
    # five independent selections per size; the DPP half differs on every call
    for _ in range(5):
        selected_features = fs_dpp_select_k(nc, train, train_cov, fsk_mat)
        dpp_fs_train, dpp_fs_test = train[:, selected_features], test[:, selected_features]
        samples.append(_naive_bayes(dpp_fs_train, dpp_fs_test))

    # record the mean accuracy over the selections
    mean_accuracy = np.float64(sum(samples)) / np.float64(len(samples))
    dpp_fs_nb_accuracies.append(mean_accuracy)

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350

3800
3850
3900
3950
4000
4050
4100
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000

## SVM

In [24]:
# Trains SVM on features selected from DPP + F score,
# for feature counts 50, 100, ..., num_components.
dpp_fs_svm_accuracies = []
num_components = 300
for nc in range(50, num_components + 1, 50):
    samples = []
    # average over 5 experiments, since the DPP half of the selection is random
    for simulation in range(5):
        selected_features = fs_dpp_select_k(nc, train, train_cov, fsk_mat)

        dpp_fs_train = train[:, selected_features]
        dpp_fs_test = test[:, selected_features]
        samples.append(_svm(dpp_fs_train, dpp_fs_test))
        # per-run accuracy; print(...) with one argument behaves the same on Python 2 and 3
        print(samples[-1])

    dpp_fs_svm_accuracies.append(np.float64(sum(samples))/np.float64(len(samples)))

print(dpp_fs_svm_accuracies)

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
0.968944099379
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
0.993788819876
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050

1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
0.993788819876
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
0.975155279503
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3

## KNN with k = 10

In [25]:
# Trains KNN on features selected from DPP + F score,
# for feature counts 50, 100, ..., num_components.
dpp_fs_knn_accuracies = []
num_components = 300
k=10
for nc in range(50, num_components + 1, 50):
    samples = []
    # average over 5 experiments, since the DPP half of the selection is random
    for simulation in range(5):
        selected_features = fs_dpp_select_k(nc, train, train_cov, fsk_mat)
        dpp_fs_train = train[:, selected_features]
        dpp_fs_test = test[:, selected_features]
        # Score the hybrid feature matrices built just above. The previous code
        # passed dpp_train / dpp_test, leftovers from the DPP-only cell, so the
        # selection made in this loop never reached the classifier.
        samples.append(_knn(k, dpp_fs_train, dpp_fs_test))

    dpp_fs_knn_accuracies.append(np.float64(sum(samples))/np.float64(len(samples)))

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350

3800
3850
3900
3950
4000
4050
4100
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000

In [26]:
# Collect every accuracy curve under a "<selection>_<classifier>" key and
# persist the whole dictionary to disk in a single file.
# The PCA entries are left disabled, as the PCA cells above were not executed.
my_dict = {
    # 'pca_nb': pca_nb_accuracies,
    'fs_nb': fs_nb_accuracies,
    'dpp_nb': dpp_nb_accuracies,
    'dpp_fs_nb': dpp_fs_nb_accuracies,

    # 'pca_svm': pca_svm_accuracies,
    'fs_svm': fs_svm_accuracies,
    'dpp_svm': dpp_svm_accuracies,
    'dpp_fs_svm': dpp_fs_svm_accuracies,

    # 'pca_knn': pca_knn_accuracies,
    'fs_knn': fs_knn_accuracies,
    'dpp_knn': dpp_knn_accuracies,
    'dpp_fs_knn': dpp_fs_knn_accuracies,
}

np.save('data_vals2.npy', my_dict)

## Visualizations

#### Visualize Results

In [None]:
def visualize(dpp_vals,f_score_vals,_combined_vals,pca_vals,title,
              tick_labels=('5', '10', '15', '20','25','30')):
    """
    Draw a grouped bar chart comparing four feature-selection methods.

    dpp_vals, f_score_vals, _combined_vals, pca_vals
                -- one accuracy per group for the DPP, F-score, combined and
                   PCA methods; each needs one entry per tick label
    title       -- chart title
    tick_labels -- x-axis label for each group; the number of groups is taken
                   from its length. The default reproduces the six labels this
                   function has always drawn. The sweeps in this notebook
                   evaluate 50, 100, ..., 300 features, so pass matching
                   labels, e.g. ('50', '100', '150', '200', '250', '300').

    Shows the figure and returns nothing.
    """
    n_groups = len(tick_labels)
    index = np.arange(n_groups)
    bar_width = 0.1
    opacity = 0.8

    # (values, colour, legend label), drawn left to right within each group
    series = (
        (dpp_vals, 'b', 'DPP'),
        (f_score_vals, 'g', 'F_score'),
        (_combined_vals, 'r', 'Combined'),
        (pca_vals, 'y', 'PCA'),
    )

    # create plot
    fig, ax = plt.subplots()
    for position, (values, colour, label) in enumerate(series):
        ax.bar(index + position * bar_width, values, bar_width,
               alpha=opacity,
               color=colour,
               label=label)

    ax.set_xlabel('K')
    ax.set_ylabel('Accuracy')
    ax.set_title(title)
    ax.set_xticks(index + bar_width)
    ax.set_xticklabels(tick_labels)
    ax.legend()

    plt.tight_layout()
    plt.show()

## Naive Bayes - Multiple Feature Selection Methods

In [None]:
#visualize(dpp_nb_accuracies,fs_nb_accuracies,dpp_fs_nb_accuracies,pca_nb_accuracies,'Naive Bayes')

## SVM - Multiple Feature Selection Methods

In [None]:
#visualize(dpp_svm_accuracies,fs_svm_accuracies,dpp_fs_svm_accuracies,pca_svm_accuracies,'SVM')

## KNN - Multiple Feature Selection Methods

In [None]:
#visualize(dpp_knn_accuracies,fs_knn_accuracies,dpp_fs_knn_accuracies,pca_knn_accuracies,'KNN')