Plot the correlation coefficient matrix and the cloud of points for every pair of variables. This helps to familiarize myself with the data, it allows me to check how strongly correlated some variables are and what type of correlation is there.

In [108]:
import numpy as np
import matplotlib.pyplot as plt

files = ['data-hl.txt','data-hs.txt','data-ne.txt','data-sl.txt','data-ss.txt']
data = np.zeros([5*300,42])

print 'read data files...'
i = 0
for file in files:
    j = 0
    for line in open(file,'r'):
        if line.split('\t')[0] != 'sclass':
            values = np.array(line.split('\t'))
            values[values == 'NA'] = 'nan'
            values[values == 'NA\n'] = 'nan'
            # save the class: points in data-hl.txt have class 0, points in data-hs.txt are in class 1, etc.
            data[i*300+j,0] = i
            data[i*300+j,1:] = values[1:].astype(float)
            j = j + 1
        else:
            header = line.split('\t')[1:]
    i = i + 1 
print 'done'    

print 'prepare the plots...'

# correlation coefficient matrix, masking out nans (didn't find a way to do this without for loops)
corr_coef = np.zeros([41,41])
for i in range(41):
    for j in range(41):
        a = data[:,i+1]
        b = data[:,j+1]
        mask_a = np.isfinite(a)
        mask_b = np.isfinite(b)
        mask = mask_a & mask_b
        corr_coef[i,j] = np.corrcoef([a[mask],b[mask]])[0,1]

plt.title('white - no corr., red - positive corr., blue - negative corr.')
plt.imshow(corr_coef,cmap="bwr",interpolation="nearest")
plt.savefig('corr_coef.png')
plt.close()

for i in range(41):
    for j in range(i):
        # i+1 and j+1 to skip the first row of classes
        for k in range(5):
            mask = (data[:,0] == k)
            plt.plot(data[mask,i+1],data[mask,j+1],'+')
        plt.xlabel(header[i])
        plt.ylabel(header[j])
        plt.savefig('imgs/'+header[i]+'-'+header[j]+'.png')
        plt.close()
        
print 'done'




read data files...
done
prepare the plots...
done


Class separability or class discriminatory power of the features.
Calculate the Fisher's discriminant ratio for each feature and rank the features in descending order. The first feature in the list has the highest class separability of all features. 

In [116]:
import numpy as np
import matplotlib.pyplot as plt

files = ['data-hl.txt','data-hs.txt','data-ne.txt','data-sl.txt','data-ss.txt']
data = np.zeros([5*300,42])

print 'read data files...'
i = 0
for file in files:
    j = 0
    for line in open(file,'r'):
        if line.split('\t')[0] != 'sclass':
            values = np.array(line.split('\t'))
            values[values == 'NA'] = 'nan'
            values[values == 'NA\n'] = 'nan'
            # save the class: points in data-hl.txt have class 0, points in data-hs.txt are in class 1, etc.
            data[i*300+j,0] = i
            data[i*300+j,1:] = values[1:].astype(float)
            j = j + 1
        else:
            header = line.split('\t')[1:]
            header = np.array(header)
    i = i + 1 
print 'done'    

print 'standardize data...'
# This is necessary because different features have different dynamic ranges. Standardization brings all features 
# to the same scale.

mean = np.mean(data,axis=0)
std = np.std(data,axis=0)

data_standard = np.zeros(np.shape(data))
data_mask = np.zeros(np.shape(data))
# save the classes
data_standard[:,0] = data[:,0]
data_mask = np.isfinite(data)
# standardize data
for i in range(41):
    a = data[:,i+1]
    mask_a = data_mask[:,i+1]
    mean = np.mean(a[mask_a]) 
    std = np.std(a[mask_a])
    
    data_standard[mask_a,i+1] = (a[mask_a] - mean)/std

print 'done'

print 'calculate Fisher discriminant ratio (FDR)...'
# FDR = sum sum (mu_i - mu_j)^2 / (sigma_i^2 + sigma_j^2), where i and j are different classes, mu is the mean, 
# sigma is the variance. Sum goes over all i-j pairs excluding i=j. 
# The idea is that features with large differences in the class-specific means and small variances in each class
# are better at distinguishing classes.
# For more details on FDR see e.g., Lin et al., J. Chem. Inf. Comput. Sci. 2004, 44, 76-87

FDR = np.zeros(41)
for i in range(41):
    FDR_sum = 0e0
    for j in range(5):
        mask_j = data_mask[:,i+1] & (data_standard[:,0] == j)
        for k in range(j):
            mask_k = data_mask[:,i+1] & (data_standard[:,0] == k)
            
            mu_j = np.mean(data_standard[mask_j,i+1])
            mu_k = np.mean(data_standard[mask_k,i+1])
            sigma_j = np.var(data_standard[mask_j,i+1])
            sigma_k = np.var(data_standard[mask_k,i+1])
            
            FDR_sum = FDR_sum + (mu_j-mu_k)**2e0 / (sigma_j**2e0 + sigma_k**2e0)
    # check the values     
    #print i,FDR_sum
    #for j in range(5):
    #    mask_j = data_mask[:,i+1] & (data_standard[:,0] == j)
    #    print '   ',np.mean(data_standard[mask_j,i+1])/np.var(data_standard[mask_j,i+1])
    FDR[i] = FDR_sum
print 'done'

print 'rank the features...'
indx_sorted[::-1] = np.argsort(FDR)
print '   features sorted in order of how well they discriminate between different classes (first item is best):'
print '  ',header[indx_sorted]

print 'done'

read data files...
done
standardize data...
done
calculate Fisher discriminant ratio (FDR)...
done
rank the features...
   features sorted in order of how well they discriminate between different classes (first item is best):
   ['ThetaPi_1' 'H2.H1_1' 'H1_1' 'Theta1Pi_1' 'H12_1' 'ThetaS_1' 'DAF_1'
 'Theta1S_1' 'TajD_1' 'FuLiF_1' 'Theta1L_1' 'FuLiD_1' 'DXPEHH_12'
 'FuLiF1_1' 'TajD1_1' 'DXPEHH_13\n' 'FuLiD1_1' 'FayWuH_1' 'FST_1'
 'XPEHH_12' 'SL1_1' 'H2_1' 'XPEHH_13' 'ThetaL_1' 'SL0_1' 'iHH0_1'
 'Theta1H_1' 'DDAF_1' 'Theta1Xi_1' 'nSL_1' 'iHS_1' 'FayWuH1_1' 'DnSL_1'
 'iHH1_1' 'ZengE1_1' 'MAF_1' 'ZengE_1' 'ZA_1' 'ThetaXi_1' 'DiHH_1'
 'ThetaH_1']
done


use an SVM to do classification
separate the data into training, cross validation, and test data sets (60-20-20%).
make a loop through successively less features:
    - use all features to find the best values for C and gamma in the SVM
    - fix the best C and gamma values and successively remove a feature that is least discriminative
    - check what number of features give the best score in classification