In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn import utils
from sklearn.metrics import accuracy_score
#from sklearn.metrics import jaccard_score
import random

def read_file(filename):
    return pd.read_csv(filename)

In [16]:
def normalize_data(data):
    data = data.astype('float64')
    mean = np.mean(data)
    st = np.std(data)
    return (data - mean)/st   

In [17]:
def sigmoid(x):
    return 1.0/ (1 + np.exp(-x))

In [18]:
def calculate_accuracy(X, y, W):
    """ accerted values respect to predict"""
    accerted = 0.0
    num = 14
    total = num
    for i in range(0, num): #place to 14 can be X.shape[0]
        temp = W.values()[len(W)-1]
        temp = int(np.round_(temp[i]))
        if temp == y[i]:
            accerted += 1
    return accerted/total

In [19]:
def X_y(data):
    col = data.shape[1]-1
    x = data[:, :col]
    y = data[:, col:]
    return x, y

In [6]:
def create_kfolds(data, k):
    np.random.shuffle(data)
    size_fold = int(data.shape[0] / k)
    _sz_fold = int(data.shape[0] % k)
    data = data[:data.shape[0]-_sz_fold,:]
    kfolds = []
    idx_row = 0
    for i in range(k):
        X, y = X_y(data[idx_row:idx_row+size_fold, :])
        kfolds.append({"X": X, "y" : y})
        idx_row += size_fold
    return kfolds, size_fold

def kfolds_cross_validation(data, k):
    for i in range(0, k):
        temp_test_data = k_folds[i]
        temp_train_data = np.delete(k_folds, i, axis=0)
        temp_train_data = temp_train_data.reshape(-1, temp_test_data.shape[1])
        np.random.shuffle(temp_train_data)

        x_train_set, y_train_set = get_x_y_data(temp_train_data)
        y_train_set = y_train_set.reshape(y_train_set.shape[0])

        x_test_set, y_test_set = get_x_y_data(temp_test_data)
        y_test_set = y_test_set.reshape(y_test_set.shape[0])

In [7]:
def calculate_cost_function(X, y, W):
    m = X.shape[0]
    pred = sigmoid(X)
    print('pred', pred)
    cross_entropy = np.sum( (y * np.log(pred)) + ((1-y) * np.log(1-pred)) )
    #print("1: ",np.dot(y.T, np.log(pred)))
    #print("2: ", np.dot((1-y).T, np.log(1-pred)))
    return (-1/m) * cross_entropy

In [8]:
def dS(D):
    return sigmoid(D)*(1.0-sigmoid(D))

In [9]:
def forward(X, W):
    A = {}
    i=0
    for key, value in W.items():
        sumt = np.matmul(X, value)
        y = sigmoid(sumt)
        A.setdefault(i, y)
        #other way: A['activation'+str(i)]=y
        i = i+1
        
    return A

In [10]:
def backward(X, A, y, W, learning_rate):
    newW = {}
    
    #last layer
    last_output = (A.values()[len(A)-1])
    real_error = y - last_output
    #print('real error: ', real_error)
    #delta = last_output * (1-last_output) * (1-last_output)
    delta = real_error * dS(1-last_output) 
    
    for key, value in W.items():
        #print('val: ', value)
        #print('val', value.shape[0], value.shape[1])
        mult = learning_rate * A[key] * delta
        result = value + mult
        #print(result)
        newW.setdefault(key, result)
    return newW

In [11]:
def calculate_gradient(X, y, W):
    m = y.shape[0]  
    return (1/m) * np.matmul(X.T, sigmoid(X, W.values()) - y)

def gradient_descent( X, y, W, nb_iterations, learning_rate):
    cost_history = np.zeros(nb_iterations)
    for i in range(nb_iterations):
        error_epoch = 0.0
        A = {}
        for j in range(0, X.shape[0]):
                A = forward(X[j], W)
                last_errors = np.divide(np.power(y - W.values()[len(W)-1], 2), 2)
                error_epoch += np.sum(last_errors)
                backward(X[j], A, y[j], W, learning_rate)
        #cost_history[i] = calculate_cost_function( X, y, W)
    return W, cost_history
                
def _gradient_descent( X, y, W, nb_iterations, learning_rate):
    '''Return the final theta vector and array of cost history over nro of iterations
    nb_iterations: or epochs
    '''
    m = y.shape[0]
    cost_history = np.zeros(nb_iterations)
    A = {}
    for i in range(nb_iterations):
        #prediction = X.dot(W.values())
        
        print('W.values()', W.values()[i])
        prediction = np.matmul(X,W.values()[i])
        print('prediction ',prediction.shape)
        print('y ', y.shape)
        
        if W.values()[len(W)-1]:
            prediction = np.matmul(X,W.values()[i])
            print(prediction - y)
            ro = (X.dot(prediction - y))
            print('ro ', ro)
            
            r = ((1/m) * learning_rate * (X.T.dot(prediction - y)))
        print('r ', r)
        W = W.values() - r
        #cost_history[i] = calculate_cost_function( X, y, W) 
    return W, cost_history   

In [12]:
def optional_create_training_test(data):
    np.random.shuffle(data)
    col = data.shape[1]-1
    k = len(data)
    X_train = data[:int((60* k) / 100), :col]
    y_train = data[:int((60* k) / 100), col]
    X_test = data[int((60* k) / 100):, :col]
    y_test = data[int((60* k) / 100):, col]
        
    return X_train, y_train, X_test, y_test

def create_training_test(data):
    num_rows = data.shape[0]
    train_percentage = 0.6
    row_split_data = int(num_rows * train_percentage)
    training, test = data[:row_split_data, :], data[row_split_data:, :]
    return training, test

In [13]:
def create_W(X, nb_neuron):
    return np.random.rand(X.shape[1], nb_neuron)

### EXPERIMENT I

In [36]:
def experimentI():
    files = ["Iris.csv"]
    learning_rate = 0.2
    epochs = 100 #nb_iterations
    hidde_layers = [1, 2, 3]
    nb_neurons = [5, 6, 7]
    k = 3
    
    for it_file in files:
        print("Archivo: ", it_file)
        result_tb = [hidde_layers]
        data = read_file(it_file)
        X, y = X_y(data.values)
        data_X = normalize_data(X)
        data = np.concatenate((data_X, y), axis=1)
        kfolds, sz_fold = create_kfolds(data, k)
        
            
        for layer in hidde_layers:
            W = {}
            hlayers = []
            for nb_neuron in nb_neurons:
                accuracy_test_total = 0.0
                countw = 0
                for i in range(k):
                    X_train = np.zeros((sz_fold * (k-1), data.shape[1] - 1))
                    y_train = np.zeros((sz_fold * (k-1), 1))
                    X_test = np.zeros((sz_fold, data.shape[1] - 1))
                    y_test = np.zeros((sz_fold, 1))
                    count_sz_fold = 0
                    for j in range(k):
                        if j == i:
                            X_test = kfolds[i]['X']
                            y_test = kfolds[i]['y']
                        else:
                            X_train[count_sz_fold:count_sz_fold+sz_fold, :] = kfolds[j]['X']
                            y_train[count_sz_fold:count_sz_fold+sz_fold, :] = kfolds[j]['y']
                            count_sz_fold += sz_fold

                    y_train = np.reshape(y_train, y_train.shape[0])
                    y_test = np.reshape(y_test, y_test.shape[0])

                    X_train = np.c_[X_train, np.ones(X_train.shape[0])]     #bias
                    X_test = np.c_[X_test, np.ones(X_test.shape[0])]        #bias
                    
                    for i in range(layer):
                        W.setdefault(countw, create_W(X_train, nb_neuron))
                        countw = countw + 1
                    #capa salida
                    W.setdefault(countw, create_W(X_train, 1))
                    countw = countw + 1
                    
                    W, cost_history = gradient_descent(X_train, y_train, W, epochs, learning_rate)
                    accuracy_test = calculate_accuracy(X_test, y_test, W)
                    accuracy_test_total += accuracy_test
                    
                accuracy_test_total /= k
                hlayers.append("%.4f" % accuracy_test_total)
            result_tb.append(hlayers)

        m = np.asarray(result_tb)
        pdObj = pd.DataFrame(m.T[:], columns=['HL|N','5','6','7']) 
        print(pdObj)

experimentI()

Archivo:  Iris.csv


ValueError: could not convert string to float: 'Iris-setosa'

### EXPERIMENT II

In [31]:
#EXPERIMENTO II
#iris>0.9

def experimentII():
    files = ["data/heart.csv", "data/Iris.csv"]
    kind_kernel = ["linear", "poly", "rbf"]
    setC = [3,5,7]
    k = 3
    gamma = 1
    for it_file in files:
        print("Archivo: ", it_file)
        data = read_file(it_file)
        X, y = X_y(data.values)
        data_X = normalize_data(X)
        data = np.concatenate((data_X, y), axis=1)
        kfolds, sz_fold = create_kfolds(data, k)
        y_val = np.unique(y) # values y
        
        result_tb = [setC]
        
        for kind in kind_kernel:
            _setC = []
            for C in setC:
                accuracy = 0.0
                
                for l in range(y_val.shape[0]):
                    for i in range(k):
                        X_train = np.zeros((sz_fold * (k-1), data.shape[1] - 1))
                        y_train = np.zeros((sz_fold * (k-1), 1))
                        X_test = np.zeros((sz_fold, data.shape[1] - 1))
                        y_test = np.zeros((sz_fold, 1))
                        count_sz_fold = 0
                        for j in range(k):
                            if j == i:
                                X_test = kfolds[i]['X']
                                y_test = kfolds[i]['y']
                            else:
                                X_train[count_sz_fold:count_sz_fold+sz_fold, :] = kfolds[j]['X']
                                y_train[count_sz_fold:count_sz_fold+sz_fold, :] = kfolds[j]['y'] == y_val[l]
                                count_sz_fold += sz_fold

                        y_train = np.reshape(y_train, y_train.shape[0])
                        y_test = np.reshape(y_test, y_test.shape[0])

                        X_train = np.c_[X_train, np.ones(X_train.shape[0])]     #bias
                        X_test = np.c_[X_test, np.ones(X_test.shape[0])]        #bias

                        lab_enc = preprocessing.LabelEncoder()
                        training_scores_encoded = lab_enc.fit_transform(y_train)
                        
                        tlab_enc = preprocessing.LabelEncoder()
                        testing_scores_encoded = tlab_enc.fit_transform(y_test)
                        
                        if kind == 'linear':
                            #LINEAL
                            svm_reg = SVC(kernel= kind, C=C)
                            svm_reg.fit(X_train, training_scores_encoded)
                            accuracy_linear = svm_reg.score(X_test, testing_scores_encoded)
                            accuracy = accuracy_linear
                            #print('accuracy linear: ',accuracy_linear)
                        elif kind == 'poly':
                            #POLINOMIAL
                            svm_reg = SVC(kernel= kind, C=C, degree=3, gamma=gamma)
                            svm_reg.fit(X_train, training_scores_encoded)
                            accuracy_poly = svm_reg.score(X_test, testing_scores_encoded)
                            accuracy = accuracy_poly
                            #print('accuracy polinomial: ',accuracy_poly)
                        else:
                            #GAUSSIANO
                            svm_reg = SVC(kernel= kind, C=C, gamma=gamma)
                            svm_reg.fit(X_train, training_scores_encoded)
                            accuracy_gauss = svm_reg.score(X_test, testing_scores_encoded)
                            accuracy = accuracy_gauss 
                            #print('accuracy gaussiano: ',accuracy_gauss)

                _setC.append("%.4f" % accuracy)
            result_tb.append(_setC)

        m = np.asarray(result_tb)
        pdObj = pd.DataFrame(m.T[:], columns=['C|kernel',"linear", "poly", "rbf"]) 
        print(pdObj)          
experimentII()         

Archivo:  data/heart.csv


FileNotFoundError: [Errno 2] File b'data/heart.csv' does not exist: b'data/heart.csv'