In [1]:
import numpy as np
import matplotlib.pyplot as plt

standardize = True

files = ['data-hl.txt','data-hs.txt','data-ne.txt','data-sl.txt','data-ss.txt']
data = np.zeros([5*300,42])

print 'read data files...'
i = 0
for file in files:
    j = 0
    for line in open(file,'r'):
        if line.split('\t')[0] != 'sclass':
            values = np.array(line.split('\t'))
            values[values == 'NA'] = 'nan'
            values[values == 'NA\n'] = 'nan'
            # save the class: points in data-hl.txt have class 0, points in data-hs.txt are in class 1, etc.
            data[i*300+j,0] = i
            data[i*300+j,1:] = values[1:].astype(float)
            j = j + 1
        else:
            header = line.split('\t')[1:]
            header = np.array(header)
    i = i + 1 
print 'done'    

if standardize:
    print 'standardize data...'
    # This is necessary because different features have different dynamic ranges. Standardization brings all features 
    # to the same scale.

    mean = np.mean(data,axis=0)
    std = np.std(data,axis=0)

    data_standard = np.zeros(np.shape(data))
    data_mask = np.zeros(np.shape(data))
    # save the classes
    data_standard[:,0] = data[:,0]
    data_mask = np.isfinite(data)
    # standardize data
    for i in range(41):
        a = data[:,i+1]
        mask_a = data_mask[:,i+1]
        mean = np.mean(a[mask_a]) 
        std = np.std(a[mask_a])

        data_standard[mask_a,i+1] = (a[mask_a] - mean)/std

    print 'done'

    print 'calculate Fisher discriminant ratio (FDR)...'
    # FDR = sum sum (mu_i - mu_j)^2 / (sigma_i^2 + sigma_j^2), where i and j are different classes, mu is the mean, 
    # sigma is the variance. Sum goes over all i-j pairs excluding i=j. 
    # The idea is that features with large differences in the class-specific means and small variances in each class
    # are better at distinguishing classes.
    # For more details on FDR see e.g., Lin et al., J. Chem. Inf. Comput. Sci. 2004, 44, 76-87

    FDR = np.zeros(41)
    for i in range(41):
        FDR_sum = 0e0
        for j in range(5):
            mask_j = data_mask[:,i+1] & (data_standard[:,0] == j)
            for k in range(j):
                mask_k = data_mask[:,i+1] & (data_standard[:,0] == k)

                mu_j = np.mean(data_standard[mask_j,i+1])
                mu_k = np.mean(data_standard[mask_k,i+1])
                sigma_j = np.var(data_standard[mask_j,i+1])
                sigma_k = np.var(data_standard[mask_k,i+1])

                FDR_sum = FDR_sum + (mu_j-mu_k)**2e0 / (sigma_j**2e0 + sigma_k**2e0)
        # check the values     
        #print i,FDR_sum
        #for j in range(5):
        #    mask_j = data_mask[:,i+1] & (data_standard[:,0] == j)
        #    print '   ',np.mean(data_standard[mask_j,i+1])/np.var(data_standard[mask_j,i+1])
        FDR[i] = FDR_sum
else:
    print 'calculate Fisher discriminant ratio (FDR)...'
    # FDR = sum sum (mu_i - mu_j)^2 / (sigma_i^2 + sigma_j^2), where i and j are different classes, mu is the mean, 
    # sigma is the variance. Sum goes over all i-j pairs excluding i=j. 
    # The idea is that features with large differences in the class-specific means and small variances in each class
    # are better at distinguishing classes.
    # For more details on FDR see e.g., Lin et al., J. Chem. Inf. Comput. Sci. 2004, 44, 76-87
    
    data_mask = np.zeros(np.shape(data))
    data_mask = np.isfinite(data)
    
    FDR = np.zeros(41)
    for i in range(41):
        FDR_sum = 0e0
        for j in range(5):
            mask_j = data_mask[:,i+1] & (data[:,0] == j)
            for k in range(j):
                mask_k = data_mask[:,i+1] & (data[:,0] == k)

                mu_j = np.mean(data[mask_j,i+1])
                mu_k = np.mean(data[mask_k,i+1])
                sigma_j = np.var(data[mask_j,i+1])
                sigma_k = np.var(data[mask_k,i+1])

                FDR_sum = FDR_sum + (mu_j-mu_k)**2e0 / (sigma_j**2e0 + sigma_k**2e0)
        # check the values     
        #print i,FDR_sum
        #for j in range(5):
        #    mask_j = data_mask[:,i+1] & (data_standard[:,0] == j)
        #    print '   ',np.mean(data_standard[mask_j,i+1])/np.var(data_standard[mask_j,i+1])
        FDR[i] = FDR_sum
    
    
print 'done'

print 'rank the features...'
indx_sorted = np.argsort(FDR)[::-1]
print '   features sorted in order of how well they discriminate between different classes (first item is best):'
print '  ',header[indx_sorted]

# add 1 and insert 0 to the first place to keep the class in.
indx_sorted = np.insert(indx_sorted+1,0,0)

print 'done'


read data files...
done
standardize data...
done
calculate Fisher discriminant ratio (FDR)...
done
rank the features...
   features sorted in order of how well they discriminate between different classes (first item is best):
   ['ThetaPi_1' 'H2.H1_1' 'H1_1' 'Theta1Pi_1' 'H12_1' 'ThetaS_1' 'DAF_1'
 'Theta1S_1' 'TajD_1' 'FuLiF_1' 'Theta1L_1' 'FuLiD_1' 'DXPEHH_12'
 'FuLiF1_1' 'TajD1_1' 'DXPEHH_13\n' 'FuLiD1_1' 'FayWuH_1' 'FST_1'
 'XPEHH_12' 'SL1_1' 'H2_1' 'XPEHH_13' 'ThetaL_1' 'SL0_1' 'iHH0_1'
 'Theta1H_1' 'DDAF_1' 'Theta1Xi_1' 'nSL_1' 'iHS_1' 'FayWuH1_1' 'DnSL_1'
 'iHH1_1' 'ZengE1_1' 'MAF_1' 'ZengE_1' 'ZA_1' 'ThetaXi_1' 'DiHH_1'
 'ThetaH_1']
done


In [2]:
# general naive bayes classifier
# the true probability distribution of the features is estimated using a gaussian kernel density estimator
# https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/
# scipy's kde is fastest for a few 100 data points.

from scipy.stats import gaussian_kde

n_sim = 100

test_score = np.zeros([n_sim,40])

# test what bandwidth value gives best scores
bandwidth2 = 10e0**(np.linspace(-1,0,num=5,endpoint = True))


for bw in bandwidth2:
    print bw
    
    # run n_sim different simulations to get a feeling of random effects
    for ii in range(n_sim):
        if ii%10 == 0:
            print '   ',ii

        # remove one feature at a time (the least disciminative one)
        for jj in range(40):

            if standardize:
                data_to_use = data_standard[:,indx_sorted[:-(jj+1)]]
            else:
                data_to_use = data[:,indx_sorted[:-(jj+1)]] 

            nr_classes = len(np.unique(data_to_use[:,0]))
            nr_features = np.shape(data_to_use[:,1:])[1]

            # shuffle and divide up the data
            n = np.shape(data_to_use)[0]
            indx = np.arange(n)
            np.random.shuffle(indx)

            X_train = data_to_use[indx[:n*0.75],1:]
            Y_train = data_to_use[indx[:n*0.75],0]

            X_test = data_to_use[indx[n*0.75:],1:]
            Y_test = data_to_use[indx[n*0.75:],0]


            # collect kernels for each class and feature
            kernels = []
            for i in range(nr_classes):
                mask = Y_train == i
                points_in_class = X_train[mask,:]

                kernels_class_i = []
                
                for j in range(nr_features):
                    feature_j = points_in_class[:,j]
                    mask = np.isfinite(feature_j)

                    kernels_class_i.append(gaussian_kde(feature_j[mask],bw_method = bw))

                kernels.append(kernels_class_i)


            # loop through the test points and estimate the most likely class
            score = 0e0
            for i in range(len(Y_test)): 

                class_prob = np.zeros(nr_classes)
                for j in range(nr_classes):
                    kernels_class_j = kernels[j]
                    
                    #for k in range(nr_features):
                    #    kernel = kernels_class_j[k]
                    #    class_prob[j] = class_prob[j] + kernel.logpdf(X_test[i,k])
                    
                    class_prob[j] = np.sum([kernel.logpdf(point) for kernel, point in zip(kernels_class_j, X_test[i,:])])

                if np.argmax(class_prob) == Y_test[i]:
                    score = score + 1

            test_score[ii,jj] = score / len(Y_test)

    # make a plot            
    plt.close()
    plt.ylim([0,80])
    plt.errorbar(41 - (np.arange(40)+1),np.average(test_score,axis=0)*100e0,yerr=np.std(test_score,axis=0)*100,fmt='o')
    plt.xlabel('nr. of features used')
    plt.ylabel('test score [%]')
    if standardize:
        plt.savefig('general_naive_bayes_standardized_bw'+str(bw)+'_'+str(n_sim)+'.png')
    else:
        plt.savefig('general_naive_bayes_bw'+str(bw)+'_'+str(n_sim)+'.png')
    plt.close()
    print 'done'



  return np.log(self.evaluate(x))


0.316227766017
    0
    10
    20
    30
    40
    50
    60
    70
    80
    90
done
0.354813389234
    0
    10
    20
    30
    40
    50
    60
    70
    80
    90
done
0.398107170553
    0
    10
    20
    30
    40
    50
    60
    70
    80
    90
done
0.446683592151
    0
    10
    20
    30
    40
    50
    60
    70
    80
    90
done
0.501187233627
    0
    10
    20


KeyboardInterrupt: 