In [None]:
#importing necessary modules
import numpy as np
from sklearn import svm
from sklearn.base import BaseEstimator, ClassifierMixin
import sklearn.preprocessing as skp
import random
import time

In [None]:
# Column layout and categorical encodings, based on the definitions in "CS15BTECH11044_svm".
# NOTE(review): the marital-status codes are made unique here (previously "Widowed" and
# "Married-spouse-absent" shared code 4) - keep the other notebook in sync if it is reused.

# Column names, in file order; the last column is the class label.
list_columns = ["age","workclass","fnlgwt","education","education_num","marital_status","occupation","relationship","race","sex","capital_gain","capital_loss","hours_per_week","native_country","target"]

# Names of the categorical (string-valued) columns.
discrete = ["workclass","education","marital_status","occupation","relationship","race","sex","native_country"]

# Column name -> positional index in a data row.
dict_columns = {name: i for i, name in enumerate(list_columns)}

# Category -> integer code tables, one per categorical column.
# Every table uses the contiguous codes 0..len(table)-1, one code per category.
dict_workclass = {"Private":0, "Self-emp-not-inc":1, "Self-emp-inc":2, "Federal-gov":3, "Local-gov":4, "State-gov":5, "Without-pay":6, "Never-worked":7}
dict_education = {"Bachelors":0, "Some-college":1, "11th":2, "HS-grad":3, "Prof-school":4, "Assoc-acdm":5, "Assoc-voc":6, "9th":7, "7th-8th":8, "12th":9, "Masters":10, "1st-4th":11, "10th":12, "Doctorate":13,"5th-6th":14,"Preschool":15}
dict_marry = {"Married-civ-spouse":0, "Divorced":1, "Never-married":2, "Separated":3, "Widowed":4, "Married-spouse-absent":5, "Married-AF-spouse":6}
dict_occ = {"Tech-support":0, "Craft-repair":1, "Other-service":2, "Sales":3, "Exec-managerial":4, "Prof-specialty":5, "Handlers-cleaners":6, "Machine-op-inspct":7, "Adm-clerical":8, "Farming-fishing":9, "Transport-moving":10, "Priv-house-serv":11, "Protective-serv":12, "Armed-Forces":13}
dict_relation = {"Wife":0, "Own-child":1, "Husband":2, "Not-in-family":3, "Other-relative":4, "Unmarried":5}
dict_race = {"White":0, "Asian-Pac-Islander":1, "Amer-Indian-Eskimo":2, "Other":3, "Black":4}
dict_sex = {"Female":0,"Male":1}
dict_native_country = {"France":0,"United-States":1,"Cambodia":2,"England":3,"Puerto-Rico":4,"Canada":5,"Germany":6,"Outlying-US(Guam-USVI-etc)":7,"India":8,"Japan":9,"Greece":10,"South":11,"China":12,"Cuba":13,"Iran":14,"Honduras":15,"Philippines":16,"Italy":17,"Poland":18,"Jamaica":19,"Vietnam":20,"Mexico":21,"Portugal":22,"Ireland":23,"Dominican-Republic":24,"Laos":25,"Ecuador":26,"Taiwan":27,"Haiti":28,"Columbia":29,"Hungary":30,"Guatemala":31,"Nicaragua":32,"Scotland":33,"Thailand":34,"Yugoslavia":35,"El-Salvador":36,"Trinadad&Tobago":37,"Peru":38, "Hong":39,"Holand-Netherland":40}

# Categorical column name -> its category/code table.
dict_data = {"workclass":dict_workclass,"education":dict_education,"marital_status":dict_marry,"occupation":dict_occ,"relationship":dict_relation,"race":dict_race,"sex":dict_sex,"native_country":dict_native_country}


In [None]:
# Class which implements the Gaussian (RBF) kernel function
class RBF(object):
    """Gaussian kernel ``exp(-gamma * ||x - y||^2)`` evaluated for every pair of rows.

    Parameters
    ----------
    gamma : float
        Width parameter multiplying the squared Euclidean distance.
    """

    def __init__(self,gamma):
        self.gamma = gamma

    def __call__(self,X,Y=None):
        """Return the kernel matrix between the rows of ``X`` and the rows of ``Y``.

        Parameters
        ----------
        X : 2-D array, one sample per row.
        Y : 2-D array with the same number of columns as ``X``, or ``None``
            to compare ``X`` with itself.

        Returns
        -------
        Array of shape ``(len(X), len(Y))``.
        """
        # Squared norms of the rows, shaped so that XX + YY broadcasts to (len(X), len(Y)).
        XX = np.sum(X*X,axis=1)[:,np.newaxis]
        if Y is None:
            Y = X
            YY = np.transpose(XX)
        else:
            YY = np.sum(Y*Y,axis=1)[np.newaxis,:]
        # ||x - y||^2 = ||x||^2 + ||y||^2 - 2 <x, y>
        dist = XX + YY
        dist -= 2*np.dot(X,np.transpose(Y))
        # A squared distance cannot be negative; clamp values that went below zero
        # through floating-point cancellation.
        dist = np.maximum(dist,0)
        return np.exp(-self.gamma*dist)

# Linear kernel: the matrix of pairwise inner products
def linear(X, Y=None):
    """Return ``X . Y^T``; when ``Y`` is omitted, ``X`` is compared with itself."""
    other = X if Y is None else Y
    gram = np.dot(X, np.transpose(other))
    return gram

# Callable class implementing the polynomial kernel (<x, y> + 1) ** q
class polynomial(object):
    """Polynomial kernel of degree ``q`` with a constant offset of 1."""

    def __init__(self, q):
        # Exponent applied to the shifted inner products.
        self.q = q

    def __call__(self, X, Y=None):
        """Return ``(X . Y^T + 1) ** q``; ``Y`` defaults to ``X``."""
        other = X if Y is None else Y
        inner = np.dot(X, np.transpose(other))
        return (inner + 1) ** self.q


## Implementation of the multiple-kernel method, using a heuristic to determine the kernel weights

In [1]:
# Class which implements the multi-kernel function (weighted sum of the base kernels).
# The weights are decided heuristically the first time the kernel is evaluated.
class MultiKernelheuristic(object):
    """Weighted combination of several base kernels.

    Parameters
    ----------
    kernels : sequence of callables
        Each is called as ``kernel(X, Y)`` and returns a kernel matrix.
    X : array or None
        Training data. When the kernel is later evaluated on this very object
        (identity check, not equality) the per-kernel matrices are computed once
        and cached in ``Ks``. The cache is not invalidated if ``X`` is modified
        in place, and it keeps one full matrix per base kernel in memory.

    Attributes
    ----------
    flag : int
        0 until the weights have been computed, 1 afterwards.
    gamma : list of float or None
        Kernel weights (they sum to 1), fixed on the first evaluation.
    Ks : list of arrays or None
        Cached base-kernel matrices for the training data.
    """

    # Initializer for the class
    def __init__(self,kernels,X=None):
        self.kernels = kernels
        self.X = X
        # State that __call__ relies on; it must exist before the first call.
        self.flag = 0
        self.gamma = None
        self.Ks = None

    # Turns already-evaluated kernel matrices into normalised weights.
    def _weights_from(self,kernel_mats,Y):
        # NOTE(review): the reference matrix is built from the second data argument
        # (Y . Y^T), not from class labels - confirm this is the intended heuristic.
        y = np.dot(Y,np.transpose(Y))
        scores = []
        for k in kernel_mats:
            ai = np.sum(np.multiply(k,y))
            aj = np.sum(np.multiply(k,k))
            scores.append(float(ai)/np.sqrt(aj))
        if not scores:
            return []
        total = sum(scores)
        if total == 0 or not np.isfinite(total):
            # The scores cannot be normalised; fall back to equal weights
            # instead of propagating NaN/inf into the kernel matrix.
            return [1.0/len(scores)]*len(scores)
        return [float(s)/total for s in scores]

    # function which calculates gammas heuristically
    def getGammas(self,X,Y):
        """Return one weight per base kernel, normalised to sum to 1.

        ``Y=None`` is treated as ``Y = X``. The element-wise products require
        ``kernel(X, Y)`` and ``Y . Y^T`` to have compatible shapes, which holds
        when ``X`` and ``Y`` contain the same number of rows.
        """
        if Y is None:
            Y = X
        # A generator keeps only one base-kernel matrix alive at a time.
        return self._weights_from((kernel(X,Y) for kernel in self.kernels),Y)

    #method which is used while calling the class
    def __call__(self, X, Y=None):
        """Evaluate the combined kernel between ``X`` and ``Y`` (``Y`` defaults to ``X``).

        The weights are computed on the first call and reused afterwards.
        Only kernels with a strictly positive weight contribute; if none does,
        the integer 0 is returned.
        """
        K = 0
        if X is self.X and (Y is X or Y is None):
            # Training data: evaluate every base kernel once and keep the result.
            if self.Ks is None:
                self.Ks = [kernel(X,Y) for kernel in self.kernels]
            if self.flag == 0:
                self.gamma = self._weights_from(self.Ks,X)
                self.flag = 1
            for gammas, Ki in zip(self.gamma,self.Ks):
                if gammas > 0.0:
                    # The first addition allocates a new array, so the cached
                    # matrices are never modified in place.
                    K += gammas * Ki
        else:
            if self.flag == 0:
                self.gamma = self.getGammas(X,Y)
                self.flag = 1
            for gammas, kernel in zip(self.gamma,self.kernels):
                if gammas > 0.0:
                    K += gammas * kernel(X,Y)
        return K


## The MultiKernelSVC class is adapted from the GitHub repository linked in the assignment description

In [None]:
# class which implements the SVC for MultiKernel (fit() and predict() methods are implemented)
class MultiKernelSVC(BaseEstimator, ClassifierMixin):
    """Support-vector classifier whose kernel is a heuristic combination of base kernels.

    Parameters
    ----------
    kernels : sequence of callables
        Base kernels handed to ``MultiKernelheuristic``.
    C : float
        Regularisation parameter forwarded to ``svm.SVC``.
    p, maxit, tol, store_objective
        Stored unchanged but not used by ``fit`` at present; kept so that
        existing callers can keep passing them.
    """

    def __init__(self,kernels,p=1,maxit=10,C=1,tol = 1e-5,store_objective=False):
        self.kernels = kernels
        self.p = p
        self.maxit = maxit
        self.C = C
        self.tol = tol
        self.store_objective = store_objective

    def fit(self,X,y,**params):
        """Fit an ``svm.SVC`` on ``(X, y)`` using the combined kernel.

        Extra keyword arguments are accepted and ignored.

        Returns
        -------
        self, so that calls can be chained in the usual scikit-learn way.
        """
        multi_kernel = MultiKernelheuristic(self.kernels,X)
        svc = svm.SVC(kernel=multi_kernel,C=self.C)
        svc.fit(X,y)
        self._svc = svc
        # Expose the learnt labels under the attribute name scikit-learn classifiers use.
        self.classes_ = svc.classes_
        return self

    def predict(self,X):
        """Return the labels predicted by the fitted SVC for the rows of ``X``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet. This exception is a subclass of
            both ``ValueError`` and ``AttributeError``.
        """
        if not hasattr(self,"_svc"):
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("This MultiKernelSVC instance is not fitted yet; call fit() first.")
        return self._svc.predict(X)


In [None]:
# function to map attribute values to the corresponding integers defined in the pre built dictionaries
def encode_target(data,indices):
    """Return an encoded copy of ``data``; the input array is left untouched.

    Parameters
    ----------
    data : 2-D array of strings, one record per row, columns ordered as in
        ``list_columns``.
    indices : list of str
        Names of the categorical columns. Each value is replaced by the string
        form of its code in ``dict_data[name]``; a ``'?'`` is replaced by a code
        picked at random among the codes of that column.

    Every other column except the last one (the label) is parsed as integers and
    replaced by ``"0"``..``"4"``: the index of the equal-width fifth of the
    column's ``[min, max]`` range the value falls into, with upper edges inclusive.

    Raises
    ------
    KeyError
        If a categorical value is neither ``'?'`` nor a key of its dictionary.
    """
    data2 = np.copy(data)
    if len(data2) == 0:
        return data2

    # --- categorical columns: category -> code -------------------------------
    for name in indices:
        col = dict_columns[name]
        mapping = dict_data[name]
        # Impute from the codes that really exist, so an imputed value is always
        # a valid code even if the table's codes are not exactly 0..len-1.
        valid_codes = list(mapping.values())
        for k in range(len(data2)):
            raw = data2[k][col]
            if raw != '?':
                data2[k][col] = str(mapping[raw])
            else:
                data2[k][col] = str(random.choice(valid_codes))

    # --- numeric columns: five equal-width bins ------------------------------
    for col in range(len(list_columns) - 1):
        if list_columns[col] in indices:
            continue
        values = np.array([int(v) for v in data2[:, col]])
        lo = int(values.min())
        hi = int(values.max())
        # Upper edges of bins 0..3; everything above the last edge is bin 4.
        edges = [(j * hi + (5 - j) * lo) / 5 for j in range(1, 5)]
        # side='left' gives the bin b with edges[b-1] < value <= edges[b].
        bins = np.searchsorted(edges, values, side='left')
        data2[:, col] = bins.astype(str)
    return data2

In [None]:
# Function to load data from a file
def getData(filename):
    """Read a ``", "``-separated data file and return it encoded by ``encode_target``.

    Parameters
    ----------
    filename : str
        Path of the file; one record per line, fields separated by ``", "``.

    Returns
    -------
    The array returned by ``encode_target`` for the parsed records.

    Raises
    ------
    ValueError
        If one of the integer fields of a non-empty line cannot be parsed.
    IndexError
        If a non-empty line has fewer than 15 fields.
    """
    # Positions of the fields that must hold integers (the label is field 14).
    int_columns = [0,2,4,10,11,12,14]
    data = []
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at the end of the file) carry no record.
            if not line:
                continue
            x = line.split(", ")
            for i in int_columns:
                x[i] = int(x[i])
            data.append(x)
    data = np.array(data)
    indices = ["workclass","education","marital_status","occupation","relationship","race","sex","native_country"]
    data = encode_target(data,indices)
    return data

In [None]:
# function implementing cross validation split
def cross_validation_split(dataset,n_folds):
    """Randomly deal the rows of ``dataset`` into ``n_folds`` equally sized folds.

    Each fold receives ``int(len(dataset) / n_folds)`` rows drawn without
    replacement via ``random.randrange``; rows left over after the last fold
    are not used. ``dataset`` itself is not modified.

    Returns a list of ``n_folds`` lists of rows.
    """
    remaining = list(dataset)
    per_fold = int(len(dataset)/n_folds)
    folds = []
    for _ in range(n_folds):
        current = []
        # Stop early if the pool runs dry before the fold is full.
        while len(current) < per_fold and len(remaining) != 0:
            pick = random.randrange(len(remaining))
            current.append(remaining.pop(pick))
        folds.append(current)
    return folds

In [None]:
# function calculating accuracy by comparing the actual and the predicted label lists
def getAccuracy(classval,result):
    """Return the percentage of positions at which ``classval`` and ``result`` agree.

    If the two sequences differ in length an error is printed and the sentinel
    ``-0.01`` is returned instead of a percentage.
    """
    n_actual = len(classval)
    n_predicted = len(result)
    if n_actual != n_predicted:
        print("ERROR: Unequal Sizes of both arrays")
        print(str(n_actual), str(n_predicted))
        return -0.01
    hits = sum(1 for idx in range(n_actual) if classval[idx] == result[idx])
    return hits / float(n_actual) * 100.0

In [None]:
# wrapper function which executes the whole experiment
def main(filename="train.csv", n_folds=5, C=0.1):
    """Run a k-fold cross-validation of the multi-kernel SVC and print the results.

    Parameters
    ----------
    filename : str
        Data file passed to ``getData``.
    n_folds : int
        Number of cross-validation folds.
    C : float
        Regularisation parameter passed to ``MultiKernelSVC``.

    For every fold the accuracy and the training time are printed; the mean
    accuracy over all folds is printed at the end. Nothing is returned.
    """
    data = getData(filename)
    data = data.astype(int)

    # Cross-validated training: split the data into folds
    folds = cross_validation_split(data,n_folds)
    scores = []
    for fold_no, fold in enumerate(folds, start=1):
        print("Fold: "+str(fold_no))

        # The current fold is the test data; all other folds together form the training data.
        train_rows = [row for other_no, other in enumerate(folds, start=1)
                      if other_no != fold_no for row in other]
        train_set = np.array(train_rows)
        train_set_values = train_set[:,-1]
        train_set = train_set[:,:-1]

        held_out = np.array(fold)
        actual = list(held_out[:,-1])
        test_set = held_out[:,:-1]

        # Normalise the features. The scaler is fitted on the training data only and
        # then applied unchanged to the test data, so both share the same scale.
        scaler = skp.StandardScaler()
        train_set = scaler.fit_transform(train_set)
        test_set = scaler.transform(test_set)

        # creating the multikernel
        kernels = [RBF(0.6), linear, polynomial(2)]

        # training and predicting the labels as done by MultiKernelSVC
        # (the reported time covers fitting only, not prediction)
        start = time.time()
        clf = MultiKernelSVC(kernels=kernels,C=C,maxit=10,tol=1e-5,p=1,store_objective=True)
        clf.fit(train_set,train_set_values)
        end = time.time()
        Z = clf.predict(test_set)

        # calculating accuracy
        accuracy = getAccuracy(actual,Z)
        scores.append(accuracy)
        print("Accuracy: "+str(accuracy))
        print("Time: "+str(end-start)+" seconds")

    if scores:
        print("Mean accuracy: "+str(sum(scores)/len(scores)))

In [None]:
# Run the experiment: load train.csv, then train and score the multi-kernel SVC on each fold.
main()

#### The analysis of the multi-kernel heuristic is in the CS15BTECH11044_svm notebook