In [6]:
import numpy as np
import pandas as pd
import random
import os.path
import math
from scipy.sparse.linalg import eigs

from collections import defaultdict
import matplotlib.pyplot as plt

In [11]:
def n_cross_val(A, n):
    """Yield ``(train, test)`` splits of ``A`` using consecutive chunks of size ``n``.

    For every chunk ``A[i:i + n]`` (``i = 0, n, 2n, ...``) the chunk is the
    test part and everything else, in original order, is the train part.

    Parameters
    ----------
    A : list or tuple
        Sequence to split. It must support slicing and concatenation with
        ``+`` (for a NumPy array ``+`` would add element-wise instead).
    n : int
        Chunk (test fold) size; must be positive.

    Yields
    ------
    tuple
        ``(train, test)`` where ``test`` is ``A[i:i + n]`` and ``train`` is
        ``A[:i] + A[i + n:]``.

    Raises
    ------
    ValueError
        If ``n`` is not positive. Raised when iteration starts, because this
        is a generator.

    Notes
    -----
    When ``len(A)`` is not a multiple of ``n`` the final fold is shorter than
    ``n``, so the number of folds is ``ceil(len(A) / n)``.
    """
    # A negative step would make range() empty and silently yield no folds;
    # a zero step would fail with an unrelated range() message.
    if n <= 0:
        raise ValueError("fold size n must be a positive integer, got %r" % (n,))
    for start in range(0, len(A), n):
        stop = start + n
        yield (A[:start] + A[stop:], A[start:stop])

In [24]:
class FLDA:
    """Fisher linear discriminant analysis on a labelled feature matrix.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array with one observation per row and one feature per column.
    y : numpy.ndarray
        Class label for every row of ``X``; either 1-D or of shape ``(n, 1)``.
    reg : float, optional
        Multiple of the identity added to the within-class scatter matrix
        before it is used in ``build``. The default ``1.0`` reproduces the
        previously hard-coded ``Sw + I``.

    Attributes set by ``build``: ``Xeach`` (rows of ``X`` per class),
    ``mueach`` (per-class mean, shape ``(1, nFeatures)``) and ``mu``
    (overall mean, shape ``(1, nFeatures)``).
    """

    def __init__(self, X, y, reg=1.0):
        self.X = X
        self.y = y
        self.nObs, self.nFeatures = X.shape
        assert self.nObs == y.shape[0], "X and y must have the same number of rows"
        self.allclasses = np.unique(y)
        self.nclass = len(self.allclasses)
        self.reg = reg

    def build(self):
        """Compute the discriminant directions.

        Splits the data by class, computes the class means and scatter
        matrices, and returns the eigenvectors that ``scipy.sparse.linalg.eigs``
        finds for ``Sw^-1 Sb`` (largest-magnitude eigenvalues, ``eigs``'
        default ``k``). The eigenvalues are printed in ascending order.

        Returns
        -------
        numpy.ndarray
            Eigenvectors as columns, exactly as returned by ``eigs``.
        """
        self.Xeach = self._split_by_class(self.X, self.y)
        self._compute_all_mean()
        Sb, Sw = self._calculate_variances()
        # Solve Sw * M = Sb instead of forming inv(Sw) explicitly: same
        # matrix M = Sw^-1 Sb, but cheaper and numerically better behaved.
        evals, evecs = eigs(np.linalg.solve(Sw, Sb), which='LM')
        # Order the (complex) eigenvalues explicitly by (real, imag) rather
        # than relying on implicit ordering of complex scalars.
        print("Eigen values are", sorted(evals, key=lambda v: (v.real, v.imag)))
        return evecs

    def _calculate_variances(self):
        """Return ``(Sb, Sw)``: between-class and within-class scatter matrices.

        ``Sb`` sums ``n_c * outer(mu_c - mu, mu_c - mu)`` over the classes.
        ``Sw`` sums ``n_c`` times the biased covariance of each class and then
        adds ``reg * I``. Requires ``Xeach``, ``mueach`` and ``mu`` to be set.
        """
        sb = np.zeros((self.nFeatures, self.nFeatures))
        sw = np.zeros((self.nFeatures, self.nFeatures))
        for c in self.allclasses:
            n_c = self.Xeach[c].shape[0]
            # The means are stored as (1, nFeatures); flatten for the outer product.
            dev = np.ravel(self.mueach[c] - self.mu)
            sb += n_c * np.outer(dev, dev)
            sw += n_c * np.cov(self.Xeach[c], rowvar=0, bias=1)
        # Ridge term on the diagonal of the within-class scatter.
        sw += self.reg * np.identity(self.nFeatures)
        return (sb, sw)

    def _split_by_class(self, X, y):
        """Return a dict mapping each class label to the rows of ``X`` with that label."""
        # Flatten the labels so 1-D and (n, 1) label arrays select rows the same way.
        labels = np.ravel(y)
        return {c: X[labels == c, :] for c in self.allclasses}

    def _compute_all_mean(self):
        """Store the per-class means in ``mueach`` and the overall mean in ``mu``."""
        self.mueach = {
            c: np.mean(self.Xeach[c], axis=0).reshape(-1, self.nFeatures)
            for c in self.allclasses
        }
        self.mu = np.mean(self.X, axis=0).reshape([-1, self.nFeatures])

    def test_size(self):
        """Debug helper: print every per-class mean vector (needs ``mueach`` to be set)."""
        for e in self.mueach.values():
            print(e)

In [25]:
def fishers_discriminants(filename, num_crossval, seed=None):
    """Run Fisher LDA on each cross-validation training fold of a CSV data set.

    The CSV is read without a header row; the last column is the target and
    all other columns are features. For every fold an ``FLDA`` model is built
    on the training rows, which prints that fold's eigenvalues.

    Parameters
    ----------
    filename : str
        Path to a readable, comma-separated file.
    num_crossval : int
        Requested number of folds (must be at least 1). The fold size is
        ``ceil(n_rows / num_crossval)``, so at most ``num_crossval`` folds
        are produced and the last one may be smaller.
    seed : int, optional
        When given, seeds private generators for both the feature jitter and
        the row shuffle so the run is reproducible. When ``None`` (default)
        the global ``numpy.random`` and ``random`` state is used.

    Raises
    ------
    AssertionError
        If ``filename`` is not an existing readable file.
    ValueError
        If ``num_crossval`` is smaller than 1.
    """
    assert os.path.isfile(filename) and os.access(filename, os.R_OK)
    if num_crossval < 1:
        raise ValueError("num_crossval must be at least 1, got %r" % (num_crossval,))

    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    data = pd.read_csv(filename, sep=',', header=None).to_numpy()
    X = data[:, :-1]
    y = data[:, -1]

    # Tiny Gaussian jitter on the features to prevent numerical problems.
    noise_rng = np.random if seed is None else np.random.RandomState(seed)
    X = X + noise_rng.normal(0, 0.0001, X.shape)

    if len(np.unique(y)) > 15:
        # More than 15 distinct target values: treat the target as continuous
        # and binarise it at the median (0 below the median, 1 otherwise).
        b = np.percentile(y, 50)
        y = np.where(y < b, 0, 1)
        y = np.reshape(y, [X.shape[0], 1])

    indices = list(range(len(y)))
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(indices)

    # Round the fold size up. Floor division produced an extra tiny fold
    # whenever the row count was not a multiple of num_crossval, and a zero
    # fold size when there were fewer rows than folds.
    fold_size = -(-len(indices) // num_crossval)

    for train, _test in n_cross_val(indices, fold_size):
        # TODO: the held-out fold is not evaluated yet; projecting the data
        # onto the returned directions and plotting a histogram of the
        # projections is still to be implemented.
        FLDA(X[train], y[train]).build()

In [23]:
# Run the Fisher LDA cross-validation on 'boston.csv' (path relative to the
# working directory), asking for 10 folds.
fishers_discriminants('boston.csv',10)

(6,)
Eigen values are [(-1.2557234341562288e-15+0j), (4.2104025807815999e-16-3.52817483406569e-16j), (4.2104025807815999e-16+3.52817483406569e-16j), (5.8704944986420395e-16+0j), (1.1266626138849171e-15+0j), (1.2958443590881523+0j)]
(6,)
Eigen values are [(-7.4370278345536906e-16-3.7395187940049144e-17j), (-7.4370278345536906e-16+3.7395187940049144e-17j), (6.6392722041027791e-16-6.1143243675156488e-16j), (6.6392722041027791e-16+6.1143243675156488e-16j), (1.3322676295501878e-14+0j), (1.2492059444748849+0j)]
(6,)
Eigen values are [(-1.5631940186722204e-13+0j), (-2.3390483247905029e-15+0j), (-8.4783583330801242e-16-2.1740180446290785e-15j), (-8.4783583330801242e-16+2.1740180446290785e-15j), (3.1797596374943351e-15+0j), (1.3196228953188864+0j)]
(6,)
Eigen values are [(-8.8817841970012523e-15+0j), (-2.1675124488268036e-15+0j), (-1.3344643366556613e-15+0j), (1.2814287676408682e-16-8.8752059008462975e-16j), (1.2814287676408682e-16+8.8752059008462975e-16j), (1.2584118680091372+0j)]
(6,)
Eigen v