In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import pylab as pl
%matplotlib inline
import scipy as sp
import scipy.sparse as sps
from sklearn.cluster import KMeans
from sklearn.cross_validation import KFold
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC

<h2 style="color:purple">Load Dataset in Categorical and numerical Manner</h2>


In [4]:
def loadHeart():
    heart = pd.read_csv('data/new/Heart_disease.data',header=None)
    numeric = [0,3,4,7,9,11]
    nominal = [1,2,5,6,8,10,12]
    heartNum = heart[numeric].values
    heartCat = heart[nominal].values-1
    heartY = heart[13].values
    return heartCat,heartNum,heartY-1

def f12(x):
    if x==3:
        return 0
    if x==6:
        return 1
    return 2
def loadStatlog():
    statlog = pd.read_csv('data/Categorical/statlog.data',header=None)
    numeric = [0,3,4,7,9,11]
    nominal = [1,2,5,6,8,10,12]
    statlogNum = statlog[numeric].values
    statlogCat = statlog[nominal]
    statlogCat.loc[:,2] = statlogCat[2]-1
    statlogCat.loc[:,10] = statlogCat[10]-1
    statlogCat.loc[:,12] = statlogCat[12].apply(f12)
    statlogCat = statlogCat.values
    statlogY = statlog[13].values
    return statlogCat,statlogNum,statlogY-1

valDict = {}
def loadGerman():
    german = pd.read_csv('data/Categorical/german.data',header=None)
    categoricG = [0,2,3,5,6,7,8,9,11,13,14,15,16,17,18,19]
    numericG = [1,4,12]
    for col in categoricG:
        values = np.unique(sorted(german[col]))
        valDict = {}
        for idx in xrange(len(values)):
            valDict[values[idx]] = idx
        german.loc[:,col] = german[col].apply(lambda x: valDict[x])
    germanNum = german[numericG].values
    germanCat = german[categoricG].values
    germanY = german[20].values-1
    return germanCat,germanNum,germanY

def loadSoybean():
    soybean = pd.read_csv('data/Categorical/soyBean.data',header=None)
    for col in soybean.columns:
        values = np.unique(sorted(soybean[col]))
        valDict = {}
        for idx in xrange(len(values)):
            valDict[values[idx]] = idx
        soybean.loc[:,col] = soybean[col].apply(lambda x: valDict[x])
    soybeanCat = soybean.drop(20,axis=1).values
    soybeanY = soybean[20].values-1
    return soybeanCat,soybeanY

def loadDermat():
    dermat = pd.read_csv('data/Categorical/dermatology.data',header=None)
    dermatCat = dermat.drop([33,34],axis=1).values
    dermatNum = dermat[[33]].values
    dermatY = dermat[34].values-1
    return dermatCat,dermatNum,dermatY

<h2 style="color:purple">Load simple Dataset</h2>


In [5]:
def loadHeart2():
    heart = pd.read_csv('data/new/Heart_disease.data',header=None)
    numeric = [0,3,4,7,9,11]
    nominal = [1,2,5,6,8,10,12]
    for col in nominal:
        values = np.unique(sorted(heart[col]))
        if len(values)<=2:
            continue
        for val in values:
            name = str(col)+'_'+str(val)
            heart[name] = heart[col].apply(lambda x: 1 if x==val else 0)
        heart = heart.drop([col],axis=1)
    heartX = heart.drop(13).values
    heartY = heart[13].values-1
    return heartX,heartY

def f12(x):
    if x==3:
        return 0
    if x==6:
        return 1
    return 2

def loadStatlog2():
    statlog = pd.read_csv('data/Categorical/statlog.data',header=None)
    statlog.loc[:,2] = statlog[2]-1
    statlog.loc[:,10] = statlog[10]-1
    statlog.loc[:,12] = statlog[12].apply(f12)
    statlogX = statlog.drop(13,axis=1).values
    statlogY = statlog[13].values
    return statlogX,statlogY-1

valDict = {}
def loadGerman2():
    german = pd.read_csv('data/Categorical/german.data',header=None)
    categoricG = [0,2,3,5,6,7,8,9,11,13,14,15,16,17,18,19]
    for col in categoricG:
        values = np.unique(sorted(german[col]))
        valDict = {}
        for idx in xrange(len(values)):
            valDict[values[idx]] = idx
        german.loc[:,col] = german[col].apply(lambda x: valDict[x])
    germanX = german.drop(20).values
    germanY = german[20].values-1
    return germanX,germanY


def loadDermat2():
    dermat = pd.read_csv('data/Categorical/dermatology.data',header=None)
    dermatX = dermat.drop([34],axis=1).values
    dermatY = dermat[34].values-1
    return dermatX,dermatY

<h2 style="color:purple">Ocil dimension reduction</h2>


In [4]:
def ocil_mixed_class_prob(x_categoric,x_numeric,freq_in_cluster,cluster_cnt,num_sum,feature_weight):
    d_c = x_categoric.shape[1]
    d_n = x_numeric.shape[1]
    k = freq_in_cluster.shape[2]

    cat_proba = np.zeros((d_c,k))
    for i in xrange(d_c):
        for c in xrange(k):
            cat_proba[i,c] = freq_in_cluster[i,x_categoric[0,i],c]/cluster_cnt[0,c]
    
    #object cluster similarity
    average_numeric = np.zeros((k, d_n))
    for i in xrange(k):
        average_numeric[i, :] = num_sum[i, :]/cluster_cnt[0,i]

    
    num_proba = np.zeros((1, k))
    for i in xrange(k):
        differen = x_numeric - average_numeric[i, :]
        num_proba[0,i] = np.exp(-0.5*np.dot(differen,differen.T))
    num_proba = num_proba/np.sum(num_proba)
    
    attribute_proba = np.zeros((d_c+1,k))
    attribute_proba[:d_c,:] = cat_proba
    attribute_proba[d_c,:] = num_proba

    class_proba = np.dot(attribute_proba.T,feature_weight.T)
    return class_proba
def ocil_mixed_class_assign(x_categoric,x_numeric,freq_in_cluster,cluster_cnt,num_sum,feature_weight):
    class_proba = ocil_mixed_class_prob(x_categoric,x_numeric,freq_in_cluster,cluster_cnt,num_sum,feature_weight)
    return class_proba.argmax()

In [5]:
def construct_graph(x_cat,x_num,y):
    numP = 2
    OCIL(x_cat,x_num,y,numP,4)
    

In [6]:

def construct_ocil_graph(x_cat, x_num, k, m):
    n = x_cat.shape[0]
    d_c = x_cat.shape[1]
    d_n = x_num.shape[1] 
    d = d_c + d_n

    # normalize numeric part
    x_num = (x_num - x_num.mean(axis=0))/x_num.std(axis=0, ddof=1)
#     print 'start iteration'
    # start iteration ---------------------------------------------
    sample_freq = np.zeros((m,d_c))

    for i in xrange(n):
        for j in xrange(d_c):
            sample_freq[x_cat[i,j],j] = sample_freq[x_cat[i,j],j]+1

    num_diff_val = np.zeros((1,d_c))
    for i in xrange(d_c):
        num_diff_val[0,i] = np.count_nonzero(sample_freq[:,i])

    # find information value
    find_inf_val = np.vectorize(lambda x: 0 if x==0 else - (x)*np.log2(x))
    val_info = find_inf_val(sample_freq/n) # for each entry of sample_freq :-)


    # each attribute entropy - Categorical
    attrib_entropy = np.sum(val_info,axis=0)/num_diff_val

    feature_weight_cat = (attrib_entropy/np.sum(attrib_entropy))*(d_c*1.0/(d_c+1));
    feature_weight_num = np.array([[1.0/(d_c + 1)]])
    feature_weight = np.zeros((1,d_c+1))
    feature_weight[0,:d_c] = feature_weight_cat
    feature_weight[0,d_c] = feature_weight_num


    # random initialize k seed points
    ran = np.arange(n)
    np.random.shuffle(ran)
    mu_cat = x_cat[ran[:k]]
    mu_num = x_num[ran[:k]]

    num_sum = mu_num.copy() #########
    freq_in_cluster = np.zeros((d_c,m,k)) # (num of cat, value of each cat, num of clusters)
    for c in xrange(k):
        for j in xrange(d_c):
            freq_in_cluster[j,mu_cat[c,j],c] = freq_in_cluster[j,mu_cat[c,j],c]+1

    cluster_label = np.zeros((1, n));
    cluster_cnt = np.ones((1, k));

    for i in xrange(n):
        class_label = ocil_mixed_class_assign(x_cat[[i],:], x_num[[i],:], freq_in_cluster, cluster_cnt, num_sum, feature_weight)
        cluster_label[0,i] = class_label
        cluster_cnt[0,class_label] = cluster_cnt[0,class_label]+1
        num_sum[class_label, :] = num_sum[class_label, :] + x_num[i, :] 

        for j in xrange(d_c):
            freq_in_cluster[j, x_cat[i,j], class_label] = freq_in_cluster[j, x_cat[i,j], class_label]+1;

#     print 'iterative step'

    # iterative steps
    moves = 1
    epoch = 0

    while moves != 0: # Some class label has been changed in the last step%%%
        moves = 0
        for i in xrange(n):

            new_label = ocil_mixed_class_assign(x_cat[[i],:], x_num[[i],:], freq_in_cluster, cluster_cnt, num_sum, feature_weight)


            if cluster_label[0,i] != new_label:  # need adjustment
                moves = moves + 1
                old_cluster = cluster_label[0,i]
                cluster_label[0,i] = new_label    # updata cluster label
                cluster_cnt[0,new_label] = cluster_cnt[0,new_label] + 1
                cluster_cnt[0,old_cluster] = cluster_cnt[0,old_cluster] - 1   # update cluster counter

                num_sum[new_label, :] = num_sum[new_label, :] + x_num[i, :]
                num_sum[old_cluster, :] = num_sum[old_cluster, :] - x_num[i, :]

                for j in xrange(d_c):        # update the information of the two clusters
                    freq_in_cluster[j, x_cat[i,j], new_label] = freq_in_cluster[j, x_cat[i,j], new_label] + 1       
                    freq_in_cluster[j, x_cat[i,j], old_cluster] = freq_in_cluster[j, x_cat[i,j], old_cluster] - 1 

        epoch = epoch + 1
#     print cluster_label
    # construct similarity between sample and clusters
    simMat = np.zeros((n,k))
    for i in xrange(n):
        tmp = ocil_mixed_class_prob(x_cat[[i],:], x_num[[i],:], freq_in_cluster, cluster_cnt, num_sum, feature_weight)
        simMat[i,:] = tmp[:,0].T

        if cluster_label[0,i] != new_label:  # need adjustment
            moves = moves + 1
            old_cluster = cluster_label[0,i]
            cluster_label[0,i] = new_label    # updata cluster label
            cluster_cnt[0,new_label] = cluster_cnt[0,new_label] + 1
            cluster_cnt[0,old_cluster] = cluster_cnt[0,old_cluster] - 1   # update cluster counter

            num_sum[new_label, :] = num_sum[new_label, :] + x_num[i, :]
            num_sum[old_cluster, :] = num_sum[old_cluster, :] - x_num[i, :]

            for j in xrange(d_c):        # update the information of the two clusters
                freq_in_cluster[j, x_cat[i,j], new_label] = freq_in_cluster[j, x_cat[i,j], new_label] + 1       
                freq_in_cluster[j, x_cat[i,j], old_cluster] = freq_in_cluster[j, x_cat[i,j], old_cluster] - 1 
    return simMat

In [7]:

def bipartiteUnsupervised(x_cat,x_num,numP,numM):
    numS = x_cat.shape[0]
    sampleCluster = construct_ocil_graph(x_cat,x_num,numP,numM)
    N = numS + numP
    W = sps.lil_matrix((N,N))
    for i in xrange(numS):
        for j in xrange(numP):
            W[i,numS+j] = sampleCluster[i,j]
            W[numS+j,i] = W[i,numS+j]
    W = sps.csr_matrix(W)
    d = np.array(W.sum(axis=0))
    D = sps.diags(d,[0])
    L = D - W
    return W,D,L

def bipartiteSupervised(x_cat,x_num,y,numP,numM,train_index,test_index):
    numS = y.shape[0]
    numL = len(np.unique(sorted(y)))
    sampleCluster = construct_ocil_graph(x_cat,x_num,numP,numM)
    N = numS+numP+numL
    W = sps.lil_matrix((N,N))
    for i in xrange(numS):
        for j in xrange(numP):
            W[i,numS+j] = sampleCluster[i,j]#similarity(X[i],centers[j],sigma)
            W[numS+j,i] = W[i,numS+j]
#     print "len train: ",len(train_index)
    P = []
    for i in xrange(numL):
        P.append(sum(y==i))
        
    for i in train_index:
        W[i,numS+numP+y[i]] = 1./(3*P[y[i]])
        W[numS+numP+y[i],i] = 1./(3*P[y[i]])


    W = sps.csr_matrix(W)
    d = np.array(W.sum(axis=0))
    D = sps.diags(d,[0])
    L = D - W
    return W,D,L



In [7]:
from sklearn import neighbors
def unsupervised10Fold(Xcat,Xnum,y,numP,numM):
#     numP = 40
    numS = Xcat.shape[0]
    acc = []
    kf = KFold(numS,n_folds=10,shuffle=True)
    i = 0
    for train_index,test_index in kf:
        W,D,L = bipartiteUnsupervised(Xcat,Xnum,numP,numM)
        vals, vecs = sps.linalg.eigs(L, M=D, k=7)
        vals = vals.real
        vecs = vecs.real[:numS]
        yTrain = y[train_index]
        yTest = y[test_index]
        newRepTrain = vecs[:,1:7][train_index]
        newRepTest = vecs[:,1:7][test_index]
        NN = neighbors.KNeighborsClassifier(n_neighbors=2)
        NN.fit(newRepTrain,yTrain)
        XPred = NN.predict(newRepTest)
        acc.append(np.sum(XPred==yTest)*1.0/yTest.shape[0])
#         print i,":",acc[i]
        i += 1
    return np.mean(acc),np.std(acc)

def supervised10Fold(x_cat,x_num,y,numP,numM):
#     numP = 40
    numS = y.shape[0]
    acc = []
    kf = KFold(numS,n_folds=10,shuffle=True)
    i = 0
    for train_index,test_index in kf:
        W,D,L = bipartiteSupervised(x_cat,x_num,y,numP,numM,train_index,test_index)
        vals, vecs = sps.linalg.eigs(L, M=D, k=7)
        vals = vals.real
        vecs = vecs.real[:numS]
        yTrain = y[train_index]
        yTest = y[test_index]
        newRepTrain = vecs[:,1:7][train_index]
        newRepTest = vecs[:,1:7][test_index]
        NN = neighbors.KNeighborsClassifier(n_neighbors=2)
        NN.fit(newRepTrain,yTrain)
        XPred = NN.predict(newRepTest)
        acc.append(np.sum(XPred==yTest)*1.0/yTest.shape[0])
#         print i,":",acc[i]
        i += 1
    return np.mean(acc), np.std(acc)

In [13]:
from sklearn.decomposition import FastICA
def ICA10Fold(X,y):
    acc = []
    kf = KFold(X.shape[0],n_folds=10,shuffle=True)
    i = 0
    for train_index,test_index in kf:
        yTest = y[test_index]
        yTrain = y[train_index]
        clf =  FastICA()
        clf.fit(X[train_index])
        newRepTrain = clf.transform(X[train_index])
        newRepTest = clf.transform(X[test_index])
        nclf = neighbors.KNeighborsClassifier(n_neighbors=2)
        nclf.fit(newRepTrain,yTrain)
        XPred = nclf.predict(newRepTest)
        acc.append(np.sum(XPred==yTest)*1.0/yTest.shape[0])
#         print i,":",acc[i]
        i += 1
    return np.mean(acc), np.std(acc)

from sklearn.decomposition import PCA, KernelPCA
def KPCA10Fold(X,y):
    acc = []
    kf = KFold(X.shape[0],n_folds=10,shuffle=True)
    i = 0
    for train_index,test_index in kf:
        yTest = y[test_index]
        yTrain = y[train_index]
        clf = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
        clf.fit(X[train_index])
        newRepTrain = clf.transform(X[train_index])
        newRepTest = clf.transform(X[test_index])
        nclf = neighbors.KNeighborsClassifier(n_neighbors=2)
        nclf.fit(newRepTrain,yTrain)
        XPred = nclf.predict(newRepTest)
        acc.append(np.sum(XPred==yTest)*1.0/yTest.shape[0])
#         print i,":",acc[i]
        i += 1
    return np.mean(acc),np.std(acc)

from sklearn.lda import LDA
def LDA10Fold(X,y):
    acc = []
    kf = KFold(X.shape[0],n_folds=10,shuffle=True)
    i = 0
    for train_index,test_index in kf:
        yTest = y[test_index]
        yTrain = y[train_index]
        clf = LDA()
        clf.fit(X[train_index], yTrain)
        newRepTrain = clf.transform(X[train_index])
        newRepTest = clf.transform(X[test_index])
        nclf = neighbors.KNeighborsClassifier(n_neighbors=2)
        nclf.fit(newRepTrain,yTrain)
        XPred = nclf.predict(newRepTest)
        acc.append(np.sum(XPred==yTest)*1.0/yTest.shape[0])
#         print i,":",acc[i]
        i += 1
    return np.mean(acc), np.std(acc)
from sklearn.decomposition import PCA
def PCA10Fold(X,y):
    acc = []
    kf = KFold(X.shape[0],n_folds=10,shuffle=True)
    i = 0
    for train_index,test_index in kf:
        yTest = y[test_index]
        yTrain = y[train_index]
        clf = PCA()
        clf.fit(X[train_index])
        newRepTrain = clf.transform(X[train_index])
        newRepTest = clf.transform(X[test_index])
        nclf = neighbors.KNeighborsClassifier(n_neighbors=2)
        nclf.fit(newRepTrain,yTrain)
        XPred = nclf.predict(newRepTest)
        acc.append(np.sum(XPred==yTest)*1.0/yTest.shape[0])
#         print i,":",acc[i]
        i += 1
    return np.mean(acc), np.std(acc)

from sklearn import manifold
def LLE10Fold(X,y):
    acc = []
    kf = KFold(X.shape[0],n_folds=10,shuffle=True)
    i = 0
    for train_index,test_index in kf:
        yTest = y[test_index]
        yTrain = y[train_index]
        n_neighbors = 30
        clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2,method='standard')
        clf.fit(X[train_index])
        newRepTrain = clf.transform(X[train_index])
        newRepTest = clf.transform(X[test_index])
        nclf = neighbors.KNeighborsClassifier(n_neighbors=2)
        nclf.fit(newRepTrain,yTrain)
        XPred = nclf.predict(newRepTest)
        acc.append(np.sum(XPred==yTest)*1.0/yTest.shape[0])
#         print i,":",acc[i]
        i += 1
    return np.mean(acc), np.std(acc)


def isomap10Fold(X,y):
    acc = []
    kf = KFold(X.shape[0],n_folds=10,shuffle=True)
    i = 0
    for train_index,test_index in kf:
        yTest = y[test_index]
        yTrain = y[train_index]
        n_neighbors = 30
        clf = manifold.Isomap(n_neighbors, n_components=2)
        clf.fit(X[train_index])
        newRepTrain = clf.transform(X[train_index])
        newRepTest = clf.transform(X[test_index])
        nclf = neighbors.KNeighborsClassifier(n_neighbors=2)
        nclf.fit(newRepTrain,yTrain)
        XPred = nclf.predict(newRepTest)
        acc.append(np.sum(XPred==yTest)*1.0/yTest.shape[0])
#         print i,":",acc[i]
        i += 1
    return np.mean(acc), np.std(acc)


<h2 style="color:purple">result: heart</h2>

In [456]:
k = 2 # number of clusters
m = 4 # number of max categorical values
x_cat , x_num , y = loadHeart()
for P in [10,15,20,25,30,35,40]:
    mn,st = unsupervised10Fold(x_cat,x_num,y,P,m)
    print P,mn,st

10 0.749784946237 0.0670334171508
15 0.749247311828 0.0510040250147
20 0.729032258065 0.102551561441
25 0.745268817204 0.0841770675585
30 0.715698924731 0.0741704810642
35 0.732580645161 0.0639564432948
40 0.735483870968 0.0722753435562


In [457]:
k = 2 # number of clusters
m = 4 # number of max categorical values
x_cat , x_num , y = loadHeart()
for P in [10,15,20,25,30,35,40]:
    mn,st = supervised10Fold(x_cat,x_num,y,P,m)
    print P,mn,st

10 0.732258064516 0.104395257334
15 0.761720430108 0.0783814178511
20 0.759139784946 0.0467166869332
25 0.728817204301 0.0629226144184
30 0.751505376344 0.0861590969502
35 0.755376344086 0.0834068354306
40 0.742795698925 0.0823262771498


In [14]:
X,y = loadHeart2()
methods = [ICA10Fold,PCA10Fold,LDA10Fold,KPCA10Fold,LLE10Fold,isomap10Fold]
for method in methods:
    print method.__name__[:-6]
    print method(X,y)

ICA
(0.52634408602150529, 0.086178955367585838)
PCA
(0.53645161290322574, 0.053106633175376082)
LDA
(0.50344086021505374, 0.084544852707424789)
KPCA
(0.53311827956989244, 0.084469603325683845)
LLE
(0.52021505376344079, 0.077389181694884596)
isomap
(0.5335483870967741, 0.049394062236328223)


<h2 style="color:purple">result: statlog</h2>

In [458]:
k = 2 # number of clusters
m = 4 # number of max categorical values
x_cat , x_num , y = loadStatlog()
for P in [10,15,20,25,30,35,40]:
    mn,st = unsupervised10Fold(x_cat,x_num,y,P,m)
    print P,mn,st

10 0.722222222222 0.095509607098
15 0.725925925926 0.0814814814815
20 0.733333333333 0.0568973759101
25 0.711111111111 0.0592592592593
30 0.737037037037 0.0749027719117
35 0.707407407407 0.0749027719117
40 0.72962962963 0.0759936464025


In [459]:
k = 2 # number of clusters
m = 4 # number of max categorical values
x_cat , x_num , y = loadStatlog()
for P in [10,15,20,25,30,35,40]:
    mn,st = supervised10Fold(x_cat,x_num,y,P,m)
    print P,mn,st

10 0.711111111111 0.0518518518519
15 0.755555555556 0.0814814814815
20 0.714814814815 0.0795218909392
25 0.737037037037 0.0749027719117
30 0.733333333333 0.0544331053952
35 0.711111111111 0.0948610998138
40 0.725925925926 0.0528994698411


In [15]:
X,y = loadStatlog2()
methods = [ICA10Fold,PCA10Fold,LDA10Fold,KPCA10Fold,LLE10Fold,isomap10Fold]
for method in methods:
    print method.__name__[:-6]
    print method(X,y)

ICA
(0.73333333333333328, 0.049135182079339278)
PCA
(0.62962962962962954, 0.054934803608116006)
LDA
(0.80370370370370359, 0.057496943319481568)
KPCA
(0.55555555555555558, 0.09658077637337259)
LLE
(0.61111111111111116, 0.083230389089793452)
isomap
(0.58518518518518525, 0.093403853429025879)


<h2 style="color:purple">result: German</h2>

In [460]:
k = 2 # number of clusters
m = 10 # number of max categorical values
x_cat , x_num , y = loadGerman()

for P in [10,15,20,25,30,35,40]:
    mn,st = unsupervised10Fold(x_cat,x_num,y,P,m)
    print P,mn,st

10 0.707 0.0426731765867
15 0.686 0.0560713830755
20 0.703 0.0357910603363
25 0.706 0.054258639865
30 0.697 0.044283179651
35 0.704 0.0504380808517
40 0.699 0.0432319326424


In [461]:
k = 2 # number of clusters
m = 10 # number of max categorical values
x_cat , x_num , y = loadGerman()

for P in [10,15,20,25,30,35,40]:
    mn,st = supervised10Fold(x_cat,x_num,y,P,m)
    print P,mn,st

10 0.705 0.0514295634825
15 0.715 0.030083217913
20 0.696 0.0387814388593
25 0.699 0.0372692903072
30 0.71 0.03577708764
35 0.697 0.0504083326445
40 0.683 0.0374299345444


In [16]:
X,y = loadHeart2()
methods = [ICA10Fold,PCA10Fold,LDA10Fold,KPCA10Fold,LLE10Fold,isomap10Fold]
for method in methods:
    print method.__name__[:-6]
    print method(X,y)

ICA
(0.51612903225806439, 0.066028860225686173)
PCA
(0.54322580645161289, 0.06293914974562187)
LDA
(0.54354838709677422, 0.10750671853963208)
KPCA
(0.48376344086021505, 0.085522004073305777)
LLE
(0.51989247311827946, 0.062715462806716984)
isomap
(0.56204301075268814, 0.10749682375580148)


<h2 style="color:purple">result: Dermatology</h2>

In [12]:
k = 2 # number of clusters
m = 4 # number of max categorical values
x_cat , x_num , y = loadDermat()
for P in [10,15,20,25,30,35,40]:
    mn,st = unsupervised10Fold(x_cat,x_num,y,P,m)
    print P,mn,st

10 0.876904761905 0.0590301753548
15 0.840714285714 0.0532988952495
20 0.832698412698 0.0505422847699
25 0.829841269841 0.0626252335649
30 0.86619047619 0.0640158946627
35 0.87746031746 0.0657181196642
40 0.86626984127 0.0488920453526




In [34]:
def bipartiteSupervised(x_cat,x_num,y,numP,numM,train_index,test_index):
    numS = y.shape[0]
    numL = len(np.unique(sorted(y)))
    sampleCluster = construct_ocil_graph(x_cat,x_num,numP,numM)
    N = numS+numP+numL
    W = sps.lil_matrix((N,N))
    for i in xrange(numS):
        for j in xrange(numP):
            W[i,numS+j] = sampleCluster[i,j]#similarity(X[i],centers[j],sigma)
            W[numS+j,i] = W[i,numS+j]
#     print "len train: ",len(train_index)
    P = []
    for i in xrange(numL):
        P.append(sum(y==i))
        
    for i in train_index:
        W[i,numS+numP+y[i]] = 1./(2.5*P[y[i]])
        W[numS+numP+y[i],i] = 1./(2.5*P[y[i]])


    W = sps.csr_matrix(W)
    d = np.array(W.sum(axis=0))
    D = sps.diags(d,[0])
    L = D - W
    return W,D,L

In [35]:
k = 2 # number of clusters
m = 4 # number of max categorical values
x_cat , x_num , y = loadDermat()
mxM = 0
mxE = 0
for P in range(10,60):
    mn,st = supervised10Fold(x_cat,x_num,y,P,m)
    print P,mn,st
    if mn>mxM:
        mxM = mn
        mxE = st
print "result:",mxM,mxE

10 0.743095238095 0.0874198693667
11 0.748412698413 0.0565121721259
12 0.790158730159 0.0867422195244
13 0.793333333333 0.078167310916
14 0.795634920635 0.0475611420416
15 0.810158730159 0.0802975381529
16 0.835476190476 0.0921503763526
17 0.829920634921 0.0465448279925
18 0.832222222222 0.045528447219
19 0.837698412698 0.0716594592871
20 0.851984126984 0.058318214216
21 0.857222222222 0.0902360011484
22 0.843412698413 0.0440544825198
23 0.854761904762 0.0624608721058
24 0.840793650794 0.0929799757374
25 0.869126984127 0.0743303437366
26 0.877142857143 0.0433493196862
27 0.857619047619 0.0453440696904
28 0.860238095238 0.0489695402573
29 0.860079365079 0.0620470992844
30 0.86873015873 0.0777204194073
31 0.838253968254 0.0770371278853
32 0.865952380952 0.0390205900774
33 0.846428571429 0.0355326066813
34 0.868492063492 0.0674939691977
35 0.865634920635 0.0521000766738
36 0.866111111111 0.0671946683658
37 0.865952380952 0.0445597286005
38 0.874365079365 0.0333428746737
39 0.846587301587 

In [17]:
X,y = loadDermat2()
methods = [ICA10Fold,PCA10Fold,LDA10Fold,KPCA10Fold,LLE10Fold,isomap10Fold]
for method in methods:
    print method.__name__[:-6]
    print method(X,y)

  X -= X_mean[:, np.newaxis]
  X -= self.mean_
  X -= self.mean_
  X -= X_mean[:, np.newaxis]
  X -= self.mean_
  X -= self.mean_
  X -= X_mean[:, np.newaxis]
  X -= self.mean_
  X -= self.mean_
  X -= X_mean[:, np.newaxis]
  X -= self.mean_
  X -= self.mean_
  X -= X_mean[:, np.newaxis]
  X -= self.mean_
  X -= self.mean_
  X -= X_mean[:, np.newaxis]
  X -= self.mean_
  X -= self.mean_
  X -= X_mean[:, np.newaxis]
  X -= self.mean_
  X -= self.mean_
  X -= X_mean[:, np.newaxis]
  X -= self.mean_
  X -= self.mean_
  X -= X_mean[:, np.newaxis]
  X -= self.mean_
  X -= self.mean_
  X -= X_mean[:, np.newaxis]
  X -= self.mean_


ICA
(0.821031746031746, 0.077993909291824273)
PCA
(0.8941269841269841, 0.049060459612654331)
LDA
(0.95817460317460301, 0.033384409092735975)
KPCA
(0.25142857142857145, 0.087938370274245475)
LLE
(0.51738095238095227, 0.12730834107089462)
isomap
(0.68412698412698414, 0.048150473808592359)


  X -= self.mean_


In [124]:
# wine  -------------------------------------------------------------------------------------------------
def loadWine():
    wineData = pd.read_csv('data/wine.data',header=None)
    wineY = wineData[0].apply(lambda x: x-1).values
    wineX = wineData.drop(0,axis=1)
    wineX = wineX.values
    return wineX,wineY

# wine  -------------------------------------------------------------------------------------------------
def loadWineNorm():
    wineData = pd.read_csv('data/wine.data',header=None)
    wineY = wineData[0].apply(lambda x: x-1).values
    wineX = wineData.drop(0,axis=1)
    wineX = wineX.values
    wineX = (wineX - wineX.mean(axis=0))/(wineX.max(axis=0)-wineX.min(axis=0))
    return wineX,wineY


# glass  -------------------------------------------------------------------------------------------------
def loadGlass():
    glassData = pd.read_csv('data/glass.data',header=None)
    glassY = glassData[10].apply(lambda x: x-1 if x<4 else x-2).values
    glassX = glassData.drop([0,10],axis=1)
    glassX = glassX.values
    return glassX,glassY
    
# sonar  -------------------------------------------------------------------------------------------------
def loadSonar():
    sonarData = pd.read_csv('data/sonar.data',header=None)
    sonarY = (sonarData[60].apply(lambda x: 0 if x=='M' else 1)).values
    sonarX = sonarData.drop(60,axis=1)
    sonarX = sonarX.values
    return sonarX,sonarY

# iris  -------------------------------------------------------------------------------------------------
def irisLableSet(name):
    names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
    for i in xrange(3):
        if names[i] == name:
            return i
def loadIris():
    irisData = pd.read_csv('data/iris.data',header=None)
    irisY = irisData[4].apply(irisLableSet).values
    irisX = irisData.drop(4,axis=1)
    irisX = irisX.values
    return irisX,irisY
    
# mnist   -------------------------------------------------------------------------------------------------
import scipy.io
def loadMnist():
    mnist = scipy.io.loadmat('data/mnistAll.mat')
    trains = []
    labels = []
    tests = []
    for i in range(10):
        trains.append(mnist['train'+str(i)])
        num = mnist['train'+str(i)].shape[0]
        labels.append(i*np.ones(num,dtype=int))
    mnistX = np.concatenate(trains)
    mnistY = np.concatenate(labels)
    return mnistX,mnistY
    
# breast cancer  -------------------------------------------------------------------------------------------------
def loadWdbc():
    wdbcData = pd.read_csv('data/wdbc.data',header=None)
    wdbcY = (wdbcData[1].apply(lambda x: 0 if x=='M' else 1)).values
    wdbcX = wdbcData.drop([0,1],axis=1)
    wdbcX = wdbcX.values
    return wdbcX,wdbcY

# usps   -------------------------------------------------------------------------------------------------
def loadUsps():
    with open('data/usps/usps_train.jf','r') as f:
        i = 0
        lines = f.readlines()
        uspsX = np.zeros((len(lines),256))
        uspsY = np.zeros(len(lines),dtype=int)
        for line in lines:
            splitedLine = line.split()
            features = map(float,splitedLine[1:])
            uspsY[i] = int(splitedLine[0])
            for j in xrange(256):
                uspsX[i,j] = features[j]
            i += 1
    return uspsX,uspsY

# waveform  -------------------------------------------------------------------------------------------------
def loadWaveform():
    waveformData = pd.read_csv('data/waveForm/waveform.data',header=None)
    waveformY = waveformData[21].values
    waveformX = waveformData.drop(21,axis=1)
    waveformX = waveformX.values
    return waveformX,waveformY   


def generateTwoGaussian():
    mean1 = [-2, -2]
    cov1 = [[1, 0], [0, 1]]  # diagonal covariance
    mean2 = [2,2]
    cov2 = [[1, 0], [0, 1]]  # diagonal covariance

    # data
    X = np.zeros((200,2))
    y = np.zeros(200,dtype=int)
    X[:100] = np.random.multivariate_normal(mean1, cov1, 100)
    X[100:] = np.random.multivariate_normal(mean2, cov2, 100)
    y[100:] = np.ones(100)
    return X,y

def generateXOR():
    mean1 = [-2, -2]
    cov1 = [[1, 0], [0, 1]]  # diagonal covariance
    mean2 = [2,2]
    cov2 = [[1, 0], [0, 1]]  # diagonal covariance

    # data
    X = np.zeros((400,2))
    y = np.zeros(400,dtype=int)
    X[:100] = np.random.multivariate_normal(mean1, cov1, 100)
    X[100:200] = np.random.multivariate_normal(mean2, cov2, 100)
    
    
    mean3 = [2, -2]
    cov3 = [[1, 0], [0, 1]]  # diagonal covariance
    mean4 = [-2, 2]
    cov4 = [[1, 0], [0, 1]]  # diagonal covariance

    X[200:300] = np.random.multivariate_normal(mean3, cov3, 100)
    X[300:] = np.random.multivariate_normal(mean4, cov4, 100)
    y[200:] = np.ones(200)
    
    return X,y

def generateTwoParabola():
    x1 = np.linspace(-10, 10, 40)
    y1 = .22*x1**2 + .024*x1 + .04  

    x2 = np.linspace(0, 20, 40)
    y2 = -.2*(x2-10)**2 - .024*(x2-10) + 35

    size = x1.shape[0]
    EACH = 10
    X = np.zeros((2*size*EACH,2))
    y = np.zeros(2*size*EACH,dtype=int)
    y[size*EACH:] = np.ones(size*EACH)

    cov = [[1, 0], [0, 1]]  # diagonal covariance
    for i in range(size):
        X[i*EACH:(i+1)*EACH] = np.random.multivariate_normal([x1[i],y1[i]], cov, EACH)
        X[(i+size)*EACH:(i+size+1)*EACH] = np.random.multivariate_normal([x2[i],y2[i]], cov, EACH)
    return X,y
def generateTwoDisks():
    angle = np.arange(0, np.pi*2, 0.15)

    r1 = 50
    r2 = 20
    size = angle.shape[0]

    x1 = r1 * np.cos(angle)
    y1 = r1 * np.sin(angle)

    x2 = r2 * np.cos(angle)
    y2 = r2 * np.sin(angle)


    EACH = 10
    X = np.zeros((2*size*EACH,2))
    Y = np.zeros(2*size*EACH,dtype=int)
    Y[size*EACH:] = np.ones(size*EACH)

    cov = [[5, 0], [0, 5]]  # diagonal covariance
    for i in range(size):
        X[i*EACH:(i+1)*EACH] = np.random.multivariate_normal([x1[i],y1[i]], cov, EACH)
        X[(i+size)*EACH:(i+size+1)*EACH] = np.random.multivariate_normal([x2[i],y2[i]], cov, EACH)
    return X,Y
import scipy.io as io
def generateConcentric():
    concentric = io.loadmat("Concentric_rings.mat")
    X = concentric['X'][:,[0,1]]
    Y = concentric['Y'][:,0]
    Y = (Y==1)
    return X,Y
#----------------------------------------------------------------------------------------------------------------

def loadHillValey():
    hill = pd.read_csv('data/new/Hill_Valley_with_noise_Training.data')
    hillT = pd.read_csv('data/new/Hill_Valley_with_noise_Testing.data')
    h = pd.concat([hill,hillT])
#     h.index = range(h.shape[0])
    hillX = h.drop('class',axis=1).values
    hillY = h['class'].values
    return hillX,hillY
def loadIonosphere():
    ionosphere = pd.read_csv('data/new/ionosphere.data',header=None)
    ionX = ionosphere.drop(34,axis=1).values
    ionY = ionosphere[34].apply(lambda x: 0 if x=='g' else 1).values
    return ionX,ionY

In [125]:
datasets = [generateTwoDisks,generateXOR,generateTwoParabola, generateConcentric,
            loadWineNorm,loadGlass,loadIris,loadWdbc,loadSonar,loadWaveform,loadUsps]
for data in datasets:
    X,y = data()
    print data.__name__
    print X.shape
    print np.unique(sorted(y))

generateTwoDisks
(840L, 2L)
[0 1]
generateXOR
(400L, 2L)
[0 1]
generateTwoParabola
(800L, 2L)
[0 1]
generateConcentric
(313L, 2L)
[False  True]
loadWineNorm
(178L, 13L)
[0 1 2]
loadGlass
(214L, 9L)
[0 1 2 3 4 5]
loadIris
(150L, 4L)
[0 1 2]
loadWdbc
(569L, 30L)
[0 1]
loadSonar
(208L, 60L)
[0 1]
loadWaveform
(5000L, 21L)
[0 1 2]
loadUsps
(7291L, 256L)
[0 1 2 3 4 5 6 7 8 9]


In [126]:
heartX,heartY = loadHeart2()
print heartX.shape
print np.unique(sorted(heartY))

(302L, 14L)
[0 1]


In [74]:
hill = pd.read_csv('data/new/Hill_Valley_with_noise_Training.data')#,header=None)
hill.tail()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X92,X93,X94,X95,X96,X97,X98,X99,X100,class
601,2064.63,2174.44,2249.02,2249.6,2307.61,2328.84,2071.9,2107.05,2242.68,2183.03,...,2202.95,2248.56,2185.22,2145.38,2288.64,2288.42,2096.08,2114.25,2281.91,0
602,9.42,9.3,9.66,9.63,9.15,9.91,9.45,9.84,9.36,9.15,...,9.85,9.76,9.82,9.64,9.4,9.53,9.23,9.82,9.57,0
603,5.31,5.03,5.17,5.02,5.2,5.16,5.11,5.27,5.19,5.13,...,5.19,5.06,5.28,5.28,5.19,5.27,5.14,5.12,5.04,0
604,1.03,1.02,1.09,1.06,1.03,1.03,0.98,0.97,1.13,0.97,...,1.11,1.06,0.96,1.06,1.01,1.07,1.1,1.11,1.07,0
605,61.77,58.63,63.06,58.44,57.66,67.63,58.16,61.59,60.71,58.07,...,59.52,58.88,63.51,57.27,60.08,62.75,65.13,59.74,63.86,0


In [133]:
spect = pd.read_csv('data/new/SPECT.train',header=None)
spect.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,1,0,0,0,1,0,0,0,1,1,...,1,1,0,0,0,0,0,0,0,0
1,1,0,0,1,1,0,0,0,1,1,...,1,1,0,0,0,0,0,0,0,1
2,1,1,0,1,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
4,1,0,0,0,0,0,0,0,1,0,...,1,0,1,1,0,0,0,0,0,0


In [134]:
for col in spect.columns:
    print col,len(np.unique(sorted(spect[col])))

0 2
1 2
2 2
3 2
4 2
5 2
6 2
7 2
8 2
9 2
10 2
11 2
12 2
13 2
14 2
15 2
16 2
17 2
18 2
19 2
20 2
21 2
22 2


In [146]:
D = wdbcData = pd.read_csv('data/Categorical/abalone.data',header=None)
for col in D.columns:
    print col,len(np.unique(sorted(D[col])))

0 3
1 134
2 111
3 51
4 2429
5 1515
6 880
7 926
8 28


In [147]:
D.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [156]:
tmp = D[(D[8]>3) & (D[8]<18)]

In [207]:
def abF(x):
    if x=='F':
        return 0
    if x=='I':
        return 1
    return 2
def loadAbalone():
    abalone = pd.read_csv('data/Categorical/abalone.data',header=None)
    abaloneNum = abalone.drop([0,8],axis=1).values
    abalone.loc[:, 0] = abalone[0].apply(abF)
    abaloneCat = abalone[[0]].values
    abaloneY = abalone[8].values
    return abaloneCat,abaloneNum,abaloneY-4

In [208]:
xc,xn,y = loadAbalone()

In [209]:
xn.shape

(4177L, 7L)

In [210]:
np.unique(xc)

array([0, 1, 2], dtype=int64)

In [213]:
# for i in xrange(10):
#     mn,st = unsupervised10Fold(xc,xn,y,30,3)
#     print mn,st
# 0.209705345771 0.0231908034737
# 0.220973460466 0.0215412547646
# 0.226713366149 0.0164533945549
# 0.217387812238 0.0226175162089
# 0.219538053768 0.018206412879
# 0.208280839443 0.0179660953986
# 0.21379585327 0.0101162809606

In [216]:
x

array([[ 67.,   1.,   2., ...,   3.,   2.,   2.],
       [ 63.,   1.,   1., ...,   0.,   1.,   1.],
       [ 67.,   1.,   2., ...,   2.,   3.,   2.],
       ..., 
       [ 49.,   1.,   3., ...,   3.,   2.,   2.],
       [ 74.,   2.,   4., ...,   1.,   2.,   1.],
       [ 54.,   2.,   3., ...,   1.,   2.,   1.]])

In [217]:
x.shape

(302L, 14L)

In [234]:
for col in heartCat.columns:
    print col,np.unique(sorted(heartCat[col]))

1 [ 0.  1.]
2 [ 0.  1.  2.  3.]
5 [ 0.  1.]
6 [ 0.  1.  2.]
8 [ 0.  1.]
10 [ 0.  1.  2.]
12 [0 1 2]


In [250]:

approval = pd.read_csv('data/Categorical/Credit approval.data',header=None)
categric = [0,3,4,5,6,7,8,9,10,11,12]
numeric = [1,2,7,10,13,14]
approval.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [287]:
xc,xn,y = loadGerman()

In [289]:
len(y),y.sum()

(1000, 300)

In [275]:
german = loadGerman()
german.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0,6,4,4,1169,4,4,3,2,0,...,0,67,2,1,1,2,0,1,0,1
1,1,48,2,4,5951,0,2,1,1,0,...,0,22,2,1,0,2,0,0,0,2
2,3,12,4,7,2096,0,3,1,2,0,...,0,49,2,1,0,1,1,0,0,1
3,0,42,2,3,7882,0,3,1,2,2,...,1,45,2,2,0,2,1,0,0,1
4,0,24,3,0,4870,0,2,2,2,0,...,3,53,2,2,1,2,1,0,0,2


In [269]:
german.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0,6,4,4,1169,4,4,3,2,0,...,0,67,2,1,1,2,0,1,0,1
1,1,48,2,4,5951,0,2,1,1,0,...,0,22,2,1,0,2,0,0,0,2
2,3,12,4,7,2096,0,3,1,2,0,...,0,49,2,1,0,1,1,0,0,1
3,0,42,2,3,7882,0,3,1,2,2,...,1,45,2,2,0,2,1,0,0,1
4,0,24,3,0,4870,0,2,2,2,0,...,3,53,2,2,1,2,1,0,0,2


In [266]:
for col in numericG:
#     print (approval[col]==np.nan).sum()
    print col,':'
    print np.unique(sorted(german[col]))

1 :
[ 4  5  6  7  8  9 10 11 12 13 14 15 16 18 20 21 22 24 26 27 28 30 33 36 39
 40 42 45 47 48 54 60 72]
4 :
[  250   276   338   339   343   362   368   385   392   409   426   428
   433   448   454   458   484   518   522   571   585   590   601   609
   618   625   626   629   639   640   652   654   660   662   666   672
   674   682   683   684   685   691   697   700   701   707   708   709
   717   719   727   730   731   741   745   750   753   754   759   760
   763   766   776   781   783   790   795   797   802   804   806   836
   841   846   860   866   874   882   884   886   888   894   900   902
   907   909   915   918   926   929   930   931   932   936   937   939
   947   950   951   958   959   960   975   976   983   996   999  1007
  1024  1028  1037  1038  1042  1047  1048  1049  1050  1053  1055  1056
  1068  1076  1082  1092  1098  1101  1103  1107  1108  1113  1123  1126
  1131  1136  1138  1149  1154  1155  1158  1163  1164  1168  1169  1175
  1185  1188  

In [290]:
soybean = pd.read_csv('data/Categorical/soyBean.data',header=None)
soybean.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,diaporthe-stem-canker,6,0,2,1,0,1,1,1,0,...,0,0,0,4,0,0,0,0,0,0
1,diaporthe-stem-canker,4,0,2,1,0,2,0,2,1,...,0,0,0,4,0,0,0,0,0,0
2,diaporthe-stem-canker,3,0,2,1,0,1,0,2,1,...,0,0,0,4,0,0,0,0,0,0
3,diaporthe-stem-canker,3,0,2,1,0,1,0,2,0,...,0,0,0,4,0,0,0,0,0,0
4,diaporthe-stem-canker,6,0,2,1,0,2,0,1,0,...,0,0,0,4,0,0,0,0,0,0


In [291]:
for col in soybean.columns:
#     print (approval[col]==np.nan).sum()
    print col,':'
    print np.unique(sorted(soybean[col]))

0 :
['alternarialeaf-spot' 'anthracnose' 'bacterial-blight' 'bacterial-pustule'
 'brown-spot' 'brown-stem-rot' 'charcoal-rot' 'diaporthe-stem-canker'
 'downy-mildew' 'frog-eye-leaf-spot' 'phyllosticta-leaf-spot'
 'powdery-mildew' 'purple-seed-stain' 'rhizoctonia-root-rot']
1 :
[0 1 2 3 4 5 6]
2 :
[0 1]
3 :
[0 1 2]
4 :
[0 1 2]
5 :
[0 1]
6 :
[0 1 2 3]
7 :
[0 1 2 3]
8 :
[0 1 2]
9 :
[0 1 2]
10 :
[0 1 2]
11 :
[0 1]
12 :
[0 1]
13 :
[0 1 2]
14 :
[0 1 2]
15 :
[0 1 2]
16 :
[0 1]
17 :
[0 1]
18 :
[0 1 2]
19 :
[0 1]
20 :
[0 1]
21 :
[0 1 2 3]
22 :
[0 1 2 3]
23 :
[0 1]
24 :
[0 1]
25 :
[0 1]
26 :
[0 1 2]
27 :
[0 1]
28 :
[0 1 3]
29 :
[0 1 2 4]
30 :
[0 1]
31 :
[0 1]
32 :
[0 1]
33 :
[0 1]
34 :
[0 1]
35 :
[0 1 2]


In [299]:
x,y = loadSoybean()
x

array([[7, 6, 0, ..., 0, 0, 0],
       [7, 4, 0, ..., 0, 0, 0],
       [7, 3, 0, ..., 0, 0, 0],
       ..., 
       [9, 5, 0, ..., 0, 0, 0],
       [9, 5, 0, ..., 0, 0, 0],
       [9, 5, 1, ..., 0, 0, 0]], dtype=int64)

array([ 7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,
        6,  6,  6, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 11,  8,  8,  8,  8,  8,  8,  8,  8,
        8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
        4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
        4,  4,  4,  4,  4,  4,  4,  4,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3, 12, 12, 12, 12, 12, 12,
       12, 12, 12, 12,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9

In [None]:

heart = pd.read_csv('data/new/Heart_disease.data',header=None)
numeric = [0,3,4,7,9,11]
nominal = [1,2,5,6,8,10,12]
heartNum = heart[numeric].values
heartCat = heart[nominal].values-1
heartY = heart[13].values
return heartCat,heartNum,heartY-1

In [371]:
def loadHeart2():
    heart = pd.read_csv('data/new/Heart_disease.data',header=None)
    numeric = [0,3,4,7,9,11]
    nominal = [1,2,5,6,8,10,12]
    for col in nominal:
        values = np.unique(sorted(heart[col]))
        if len(values)<=2:
            continue
        for val in values:
            name = str(col)+'_'+str(val)
            heart[name] = heart[col].apply(lambda x: 1 if x==val else 0)
        heart = heart.drop([col],axis=1)
    heartX = heart.drop(13).values
    heartY = heart[13].values-1
    return heartX,heartY

In [372]:
x,y = loadHeart2()

In [373]:
x

array([[  67.,    1.,  160., ...,    0.,    1.,    0.],
       [  63.,    1.,  145., ...,    1.,    0.,    0.],
       [  67.,    1.,  120., ...,    0.,    0.,    1.],
       ..., 
       [  49.,    1.,  118., ...,    0.,    1.,    0.],
       [  74.,    2.,  120., ...,    0.,    1.,    0.],
       [  54.,    2.,  160., ...,    0.,    1.,    0.]])

In [374]:
y

array([1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 1,

In [416]:
dermatY

array([1, 0, 2, 0, 2, 1, 4, 2, 3, 3, 0, 1, 1, 0, 2, 3, 1, 0, 2, 4, 5, 1, 4,
       2, 4, 0, 5, 4, 1, 2, 0, 1, 0, 1, 2, 0, 1, 3, 0, 1, 4, 2, 3, 5, 1, 2,
       2, 3, 0, 0, 4, 0, 1, 2, 3, 1, 5, 0, 4, 0, 1, 2, 0, 3, 4, 0, 1, 5, 2,
       4, 3, 1, 1, 0, 2, 4, 0, 1, 1, 1, 4, 0, 0, 2, 0, 3, 1, 1, 4, 0, 2, 3,
       1, 4, 0, 5, 1, 4, 0, 1, 1, 0, 3, 0, 2, 0, 0, 2, 4, 2, 2, 4, 1, 2, 3,
       0, 1, 4, 5, 0, 0, 1, 5, 2, 4, 3, 0, 0, 2, 4, 4, 0, 3, 1, 2, 0, 1, 0,
       0, 2, 2, 2, 1, 4, 3, 1, 1, 0, 0, 0, 4, 2, 1, 2, 1, 1, 3, 1, 2, 5, 1,
       0, 0, 2, 3, 2, 2, 0, 0, 0, 2, 0, 0, 1, 2, 2, 0, 0, 0, 0, 5, 1, 1, 1,
       1, 0, 2, 2, 2, 0, 0, 1, 2, 1, 1, 1, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0,
       0, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 3,
       3, 3, 3, 3, 3, 5, 5, 5, 3, 3, 3, 0, 0, 0, 0, 0, 1, 1, 3, 3, 3, 0, 0,
       1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 4, 4, 4, 4, 4, 2, 2, 2, 3, 0, 0, 3,
       3, 3,

In [409]:
for col in dermat.columns:
    print col, np.unique(dermat[col])

0 [0 1 2 3]
1 [0 1 2 3]
2 [0 1 2 3]
3 [0 1 2 3]
4 [0 1 2 3]
5 [0 1 2 3]
6 [0 1 2 3]
7 [0 1 2 3]
8 [0 1 2 3]
9 [0 1 2 3]
10 [0 1]
11 [0 1 2 3]
12 [0 1 2]
13 [0 1 2 3]
14 [0 1 2 3]
15 [0 1 2 3]
16 [0 1 2 3]
17 [0 1 2 3]
18 [0 1 2 3]
19 [0 1 2 3]
20 [0 1 2 3]
21 [0 1 2 3]
22 [0 1 2 3]
23 [0 1 2 3]
24 [0 1 2 3]
25 [0 1 2 3]
26 [0 1 2 3]
27 [0 1 2 3]
28 [0 1 2 3]
29 [0 1 2 3]
30 [0 1 2 3]
31 [0 1 2 3]
32 [0 1 2 3]
33 [ 0  7  8  9 10 12 13 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 55 56 57 58
 60 61 62 63 64 65 67 68 70 75]
34 [1 2 3 4 5 6]


In [None]:
def loadHeart2():
    heart = pd.read_csv('data/new/Heart_disease.data',header=None)
    numeric = [0,3,4,7,9,11]
    nominal = [1,2,5,6,8,10,12]
    for col in nominal:
        values = np.unique(sorted(heart[col]))
        if len(values)<=2:
            continue
        for val in values:
            name = str(col)+'_'+str(val)
            heart[name] = heart[col].apply(lambda x: 1 if x==val else 0)
        heart = heart.drop([col],axis=1)
    heartX = heart.drop(13).values
    heartY = heart[13].values-1
    return heartX,heartY

In [16]:
def loadDermat2():
    dermat = pd.read_csv('data/Categorical/dermatology.data',header=None)
    for col in xrange(33):
        values = np.unique(sorted(dermat[col]))
        if len(values)<=2:
            continue
        for val in values:
            name = str(col)+'_'+str(val)
            dermat[name] = dermat[col].apply(lambda x: 1 if x==val else 0)
        dermat = dermat.drop([col],axis=1)
    
    dermatY = dermat[34].values-1
    dermat = dermat.drop([34],axis=1).values
    return dermat,dermatY

In [12]:
xc,xn,y = loadDermat2()

In [13]:
xc

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ..., 
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
xc.shape

(358L, 128L)

In [431]:
dermat = pd.read_csv('data/Categorical/dermatology.data',header=None)
dermatCat = dermat.drop([33,34],axis=1)
dermatCat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
0,2,2,0,3,0,0,0,0,1,0,...,0,0,0,0,3,0,0,0,1,0
1,3,3,3,2,1,0,0,0,1,1,...,1,0,0,0,0,0,0,0,1,0
2,2,1,2,3,1,3,0,3,0,0,...,0,2,0,2,3,2,0,0,2,3
3,2,2,2,0,0,0,0,0,3,2,...,0,0,3,0,0,0,0,0,3,0
4,2,3,2,2,2,2,0,2,0,0,...,0,2,2,3,2,3,0,0,2,3


In [16]:
dermat = pd.read_csv('data/Categorical/dermatology.data',header=None)
dermatX = dermat.drop([34],axis=1).values
dermatY = dermat[34].values-1
return dermatX,dermatY

In [31]:
cat,num,y = loadDermat()

In [32]:
cat.shape, num.shape

((358L, 33L), (358L, 1L))

In [33]:
np.unique(y)

array([0, 1, 2, 3, 4, 5], dtype=int64)