In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn import utils
from sklearn.metrics import accuracy_score
#from sklearn.metrics import jaccard_score
import random

def Leer_Datos(filename,atrributes):
    data = pd.read_csv(filename, usecols=atrributes) 
    return data

In [2]:
def Normalizar_Datos(data):
    mean_ = data.mean(axis=0)
    std_ = data.std(axis=0) #estandar
    return (data - mean_)/std_

In [3]:
def sigmoid(x):
    return 1.0/ (1 + np.exp(-x))

In [47]:
def calculate_accuracy(X, y, W):
    """ accerted values respect to predict"""
    accerted = 0.0
    num = 5
    for i in range(0, num): #place to 5 can be X.shape[0]
        temp = list(W.values())[len(W)-1]
        temp = int(np.round_(temp[i]))
        if temp == y[i]:
            accerted += 1
    return accerted/num

In [48]:
def X_y(data, name_col):
    X = data.loc[:, data.columns != name_col]
    y = data[name_col]
    return X.values, y.values

def PrepareXandY(training,test):
    train_X = training[:,:-1]
    train_X = np.concatenate((np.ones([train_X.shape[0], 1]), train_X), axis=1)

    test_X = test[:,:-1]
    test_X = np.concatenate((np.ones([test_X.shape[0], 1]), test_X), axis=1)

    train_y = training[:,-1]
    test_y = test[:,-1]

    return train_X, train_y, test_X, test_y

In [49]:
def create_k_folds(data, k):
    np.random.shuffle(data)
    col_class = data[:,-1]
    #col_class = np.array([0,0,1,0,1,0,1,0,0,0])
    num_rows = col_class.shape[0]
    unique, counts = np.unique(col_class, return_counts=True)
    distribution = dict(zip(unique, counts))
    #print(distribution)
    percent_of_first_class = round((distribution[0.0] * 100) /  num_rows)
   
    percent_of_second_class = round((distribution[1.0] * 100) / num_rows)
   
    num_per_fold = round(num_rows/k)
    
    num_first_class_per_fold = round((num_per_fold * percent_of_first_class)/100)
    num_second_class_per_fold = round((num_per_fold * percent_of_second_class)/100)

    list_indices=[]
    num_0s=[]
    num_1s=[]
    
    for i in range(k):
        list_indices.append([])
        num_0s.append(0)
        num_1s.append(0)

    for i in range(num_rows):
    
        if col_class[i] == 0.0:
            for j in range(k):
                if(num_0s[j]<num_first_class_per_fold):
                    num_0s[j]+=1
                    #print(list_indices[j],list_indices[j].count(0))
                    list_indices[j].append(i)
                    break

        else:
            for j in range(k):
                if(num_1s[j]<num_second_class_per_fold):
                    num_1s[j]+=1
                    list_indices[j].append(i)
                    break

    if num_rows%k!=0:
        list_indices[k-1].append(col_class.shape[0]-1)
    
    return list_indices

In [50]:
def create_kfolds(data, k):
    np.random.shuffle(data)
    size_fold = int(data.shape[0] / k)
    _sz_fold = int(data.shape[0] % k)
    data = data[:data.shape[0]-_sz_fold,:]
    kfolds = []
    idx_row = 0
    for i in range(k):
        X, y = X_y(data[idx_row:idx_row+size_fold, :])
        kfolds.append({"X": X, "y" : y})
        idx_row += size_fold
    return kfolds, size_fold

def kfolds_cross_validation(data, k):
    for i in range(0, k):
        temp_test_data = k_folds[i]
        temp_train_data = np.delete(k_folds, i, axis=0)
        temp_train_data = temp_train_data.reshape(-1, temp_test_data.shape[1])
        np.random.shuffle(temp_train_data)

        x_train_set, y_train_set = get_x_y_data(temp_train_data)
        y_train_set = y_train_set.reshape(y_train_set.shape[0])

        x_test_set, y_test_set = get_x_y_data(temp_test_data)
        y_test_set = y_test_set.reshape(y_test_set.shape[0])

In [51]:
def calculate_cost_function(X, y, W):
    m = X.shape[0]
    pred = sigmoid(X)
    print('pred', pred)
    cross_entropy = np.sum( (y * np.log(pred)) + ((1-y) * np.log(1-pred)) )
    #print("1: ",np.dot(y.T, np.log(pred)))
    #print("2: ", np.dot((1-y).T, np.log(1-pred)))
    return (-1/m) * cross_entropy

In [52]:
def dS(D):
    return sigmoid(D)*(1.0-sigmoid(D))

In [53]:
def forward(X, W):
    A = {}
    i=0
    for key, value in W.items():
        sumt = np.matmul(X, value)
        y = sigmoid(sumt)
        A.setdefault(i, y)
        #other way: A['activation'+str(i)]=y
        i = i+1
        
    return A

In [54]:
def backward(X, A, y, W, learning_rate):
    newW = {}
    
    #last layer
    last_output = (list(A.values())[len(A)-1])
    real_error = y - last_output
    #print('real error: ', real_error)
    #delta = last_output * (1-last_output) * (1-last_output)
    delta = real_error * dS(1-last_output) 
    
    for key, value in W.items():
        #print('val: ', value)
        #print('val', value.shape[0], value.shape[1])
        mult = learning_rate * A[key] * delta
        result = value + mult
        #print(result)
        newW.setdefault(key, result)
    return newW

In [55]:
def calculate_gradient(X, y, W):
    m = y.shape[0]  
    return (1/m) * np.matmul(X.T, sigmoid(X, W.values()) - y)

def gradient_descent( X, y, W, nb_iterations, learning_rate):
    cost_history = np.zeros(nb_iterations)
    for i in range(nb_iterations):
        error_epoch = 0.0
        A = {}
        for j in range(0, X.shape[0]):
                A = forward(X[j], W)
                last_errors = np.divide(np.power(y - list(W.values())[len(W)-1], 2), 2)
                error_epoch += np.sum(last_errors)
                backward(X[j], A, y[j], W, learning_rate)
        #cost_history[i] = calculate_cost_function( X, y, W)
    return W, cost_history
                
def _gradient_descent( X, y, W, nb_iterations, learning_rate):
    '''Return the final theta vector and array of cost history over nro of iterations
    nb_iterations: or epochs
    '''
    m = y.shape[0]
    cost_history = np.zeros(nb_iterations)
    A = {}
    for i in range(nb_iterations):
        #prediction = X.dot(W.values())
        
        print('W.values()', W.values()[i])
        prediction = np.matmul(X,W.values()[i])
        print('prediction ',prediction.shape)
        print('y ', y.shape)
        
        if W.values()[len(W)-1]:
            prediction = np.matmul(X,W.values()[i])
            print(prediction - y)
            ro = (X.dot(prediction - y))
            print('ro ', ro)
            
            r = ((1/m) * learning_rate * (X.T.dot(prediction - y)))
        print('r ', r)
        W = W.values() - r
        #cost_history[i] = calculate_cost_function( X, y, W) 
    return W, cost_history   

In [56]:
def optional_create_training_test(data):
    np.random.shuffle(data)
    col = data.shape[1]-1
    k = len(data)
    X_train = data[:int((60* k) / 100), :col]
    y_train = data[:int((60* k) / 100), col]
    X_test = data[int((60* k) / 100):, :col]
    y_test = data[int((60* k) / 100):, col]
        
    return X_train, y_train, X_test, y_test

def create_training_test(data):
    num_rows = data.shape[0]
    train_percentage = 0.6
    row_split_data = int(num_rows * train_percentage)
    training, test = data[:row_split_data, :], data[row_split_data:, :]
    return training, test

In [57]:
def create_W(X, nb_neuron):
    return np.random.rand(X.shape[1], nb_neuron)

### EXPERIMENT I

In [75]:
def experimentI():
    files = [("titanic_test.csv",["Sex","Age","Fare","Embarked"]), 
              ("gender_submission.csv",["Survived"]),
              ("titanic_train.csv",["Sex","Age","Fare","Embarked","Survived"])]
    
    f_name = files[0][0]
    atrributes = files[0][1]
    k = 3
    data = Leer_Datos(f_name,atrributes)
    data = data.replace(to_replace='female',value=1,regex=True)
    data = data.replace(to_replace='male',value=0,regex=True)
    data = data.replace(to_replace='C',value=0,regex=True)
    data = data.replace(to_replace='S',value=1,regex=True)
    data = data.replace(to_replace='Q',value=2,regex=True)

    f_name = files[1][0]
    atrributes = files[1][1]
    data2 = Leer_Datos(f_name,atrributes)

    data = np.concatenate((data, data2), axis=1)

    f_name = files[2][0]
    atrributes = files[2][1]
    data3 = Leer_Datos(f_name,atrributes)

    data3 = data3[["Sex","Age","Fare","Embarked","Survived"]]

    data3 = data3.replace(to_replace='female',value=1,regex=True)
    data3 = data3.replace(to_replace='male',value=0,regex=True)
    data3 = data3.replace(to_replace='C',value=0,regex=True)
    data3 = data3.replace(to_replace='S',value=1,regex=True)
    data3 = data3.replace(to_replace='Q',value=2,regex=True)

    data = np.concatenate((data, data3), axis=0)
    data = data[~np.isnan(data).any(axis=1)]
    data[:,1:3] = Normalizar_Datos(data[:,1:3])
    indices = create_k_folds(data, k)
    
    fold1 = data[indices[0]]
    fold2 = data[indices[1]]
    fold3 = data[indices[2]]
    
    learning_rate = 0.01
    epochs = 100 #nb_iterations
    hidde_layers = [1, 2, 3]
    nb_neurons = [5, 6, 7]
    k = 3
    
    training1 = np.concatenate((fold1,fold2))
    training2 = np.concatenate((fold3,fold2))
    training3 = np.concatenate((fold3,fold1))
        
    train_X1, train_y1, test_X1, test_y1 = PrepareXandY(training1,fold3)
    train_X2, train_y2, test_X2, test_y2 = PrepareXandY(training2,fold1)
    train_X3, train_y3, test_X3, test_y3 = PrepareXandY(training3,fold2)
    
    result_tb = [hidde_layers]
    for layer in hidde_layers:
            W1 = {}
            W2 = {}
            W3 = {}
            hlayers = []
            for nb_neuron in nb_neurons:
                accuracy_test_total = 0.0
                countw = 0
                
                theta_test1 = np.zeros(train_X1.shape[1])
                theta_test2 = np.zeros(train_X2.shape[1])
                theta_test3 = np.zeros(train_X3.shape[1])
                
                for i in range(layer):
                        W1.setdefault(countw, create_W(train_X1, nb_neuron))
                        W2.setdefault(countw, create_W(train_X2, nb_neuron))
                        W3.setdefault(countw, create_W(train_X3, nb_neuron))
                        countw = countw + 1
                #capa salida
                W1.setdefault(countw, create_W(train_X1, 1))
                W2.setdefault(countw, create_W(train_X2, 1))
                W3.setdefault(countw, create_W(train_X3, 1))
                countw = countw + 1
                W1, cost_history1 = gradient_descent(train_X1, train_y1, W1, epochs, learning_rate)
                W2, cost_history1 = gradient_descent(train_X2, train_y2, W2, epochs, learning_rate)
                W3, cost_history1 = gradient_descent(train_X3, train_y3, W3, epochs, learning_rate)
                
                accuracy_test1 = calculate_accuracy(test_X1, test_y1, W1)
                accuracy_test2 = calculate_accuracy(test_X2, test_y2, W2)
                accuracy_test3 = calculate_accuracy(test_X3, test_y3, W3)
                
                accuracy_test_total = (accuracy_test1 + accuracy_test2 + accuracy_test3)/k
                hlayers.append("%.4f" % accuracy_test_total)
            result_tb.append(hlayers)    
    m = np.asarray(result_tb)
    pdObj = pd.DataFrame(m.T[:], columns=['HL|N','5','6','7']) 
    print(pdObj)
experimentI()       

  HL|N       5       6       7
0    1  0.5333  0.5333  0.4667
1    2  0.5333  0.5333  0.4667
2    3  0.5333  0.5333  0.4667


In [74]:
def experimentI():
    files = [("Iris.csv",["SepalLengthCm","SepalWidthCm","PetalLengthCm","PetalWidthCm","Species"])]
    
    f_name = files[0][0]
    atrributes = files[0][1]
    k = 3
    data = Leer_Datos(f_name,atrributes)
    data = data.replace(to_replace='Iris-setosa',value=0,regex=True)
    data = data.replace(to_replace='Iris-versicolor',value=1,regex=True)
    data = data.replace(to_replace='Iris-virginica',value=2,regex=True)

    data = data[~np.isnan(data).any(axis=1)]
    data = data.values
    data[:,0:4] = Normalizar_Datos(data[:,0:4])
    
    indices = create_k_folds(data, k)

    fold1 = data[indices[0]]
    fold2 = data[indices[1]]
    fold3 = data[indices[2]]
    
    learning_rate = 0.00001
    epochs = 100 #nb_iterations
    hidde_layers = [1, 2, 3]
    nb_neurons = [5, 6, 7]
    k = 3
    
    training1 = np.concatenate((fold1,fold2))
    training2 = np.concatenate((fold3,fold2))
    training3 = np.concatenate((fold3,fold1))
        
    train_X1, train_y1, test_X1, test_y1 = PrepareXandY(training1,fold3)
    train_X2, train_y2, test_X2, test_y2 = PrepareXandY(training2,fold1)
    train_X3, train_y3, test_X3, test_y3 = PrepareXandY(training3,fold2)
    
    result_tb = [hidde_layers]
    for layer in hidde_layers:
            W1 = {}
            W2 = {}
            W3 = {}
            hlayers = []
            for nb_neuron in nb_neurons:
                accuracy_test_total = 0.0
                countw = 0
                
                theta_test1 = np.zeros(train_X1.shape[1])
                theta_test2 = np.zeros(train_X2.shape[1])
                theta_test3 = np.zeros(train_X3.shape[1])
                
                for i in range(layer):
                        W1.setdefault(countw, create_W(train_X1, nb_neuron))
                        W2.setdefault(countw, create_W(train_X2, nb_neuron))
                        W3.setdefault(countw, create_W(train_X3, nb_neuron))
                        countw = countw + 1
                #capa salida
                W1.setdefault(countw, create_W(train_X1, 1))
                W2.setdefault(countw, create_W(train_X2, 1))
                W3.setdefault(countw, create_W(train_X3, 1))
                countw = countw + 1
                W1, cost_history1 = gradient_descent(train_X1, train_y1, W1, epochs, learning_rate)
                W2, cost_history1 = gradient_descent(train_X2, train_y2, W2, epochs, learning_rate)
                W3, cost_history1 = gradient_descent(train_X3, train_y3, W3, epochs, learning_rate)
                
                accuracy_test1 = calculate_accuracy(test_X1, test_y1, W1)
                accuracy_test2 = calculate_accuracy(test_X2, test_y2, W2)
                accuracy_test3 = calculate_accuracy(test_X3, test_y3, W3)
                
                accuracy_test_total = (accuracy_test1 + accuracy_test2 + accuracy_test3)/k
                hlayers.append("%.4f" % accuracy_test_total)
            result_tb.append(hlayers)    
    m = np.asarray(result_tb)
    pdObj = pd.DataFrame(m.T[:], columns=['HL|N','5','6','7']) 
    print(pdObj)
experimentI()  

  HL|N       5       6       7
0    1  0.3333  0.2667  0.2667
1    2  0.3333  0.2667  0.2667
2    3  0.3333  0.2667  0.2667


### EXPERIMENT II

In [84]:
#EXPERIMENTO II
#iris>0.9

def experimentII():
    files = ["data/heart.csv", "data/Iris.csv"]
    kind_kernel = ["linear", "poly", "rbf"]
    setC = [3,5,7]
    k = 3
    gamma = 1
    for it_file in files:
        print("Archivo: ", it_file)
        data = read_file(it_file)
        X, y = X_y(data.values)
        data_X = normalize_data(X)
        data = np.concatenate((data_X, y), axis=1)
        kfolds, sz_fold = create_kfolds(data, k)
        y_val = np.unique(y) # values y
        
        result_tb = [setC]
        
        for kind in kind_kernel:
            _setC = []
            for C in setC:
                accuracy = 0.0
                
                for l in range(y_val.shape[0]):
                    for i in range(k):
                        X_train = np.zeros((sz_fold * (k-1), data.shape[1] - 1))
                        y_train = np.zeros((sz_fold * (k-1), 1))
                        X_test = np.zeros((sz_fold, data.shape[1] - 1))
                        y_test = np.zeros((sz_fold, 1))
                        count_sz_fold = 0
                        for j in range(k):
                            if j == i:
                                X_test = kfolds[i]['X']
                                y_test = kfolds[i]['y']
                            else:
                                X_train[count_sz_fold:count_sz_fold+sz_fold, :] = kfolds[j]['X']
                                y_train[count_sz_fold:count_sz_fold+sz_fold, :] = kfolds[j]['y'] == y_val[l]
                                count_sz_fold += sz_fold

                        y_train = np.reshape(y_train, y_train.shape[0])
                        y_test = np.reshape(y_test, y_test.shape[0])

                        X_train = np.c_[X_train, np.ones(X_train.shape[0])]     #bias
                        X_test = np.c_[X_test, np.ones(X_test.shape[0])]        #bias

                        lab_enc = preprocessing.LabelEncoder()
                        training_scores_encoded = lab_enc.fit_transform(y_train)
                        
                        tlab_enc = preprocessing.LabelEncoder()
                        testing_scores_encoded = tlab_enc.fit_transform(y_test)
                        
                        if kind == 'linear':
                            #LINEAL
                            svm_reg = SVC(kernel= kind, C=C)
                            svm_reg.fit(X_train, training_scores_encoded)
                            accuracy_linear = svm_reg.score(X_test, testing_scores_encoded)
                            accuracy = accuracy_linear
                            #print('accuracy linear: ',accuracy_linear)
                        elif kind == 'poly':
                            #POLINOMIAL
                            svm_reg = SVC(kernel= kind, C=C, degree=3, gamma=gamma)
                            svm_reg.fit(X_train, training_scores_encoded)
                            accuracy_poly = svm_reg.score(X_test, testing_scores_encoded)
                            accuracy = accuracy_poly
                            #print('accuracy polinomial: ',accuracy_poly)
                        else:
                            #GAUSSIANO
                            svm_reg = SVC(kernel= kind, C=C, gamma=gamma)
                            svm_reg.fit(X_train, training_scores_encoded)
                            accuracy_gauss = svm_reg.score(X_test, testing_scores_encoded)
                            accuracy = accuracy_gauss 
                            #print('accuracy gaussiano: ',accuracy_gauss)

                _setC.append("%.4f" % accuracy)
            result_tb.append(_setC)

        m = np.asarray(result_tb)
        pdObj = pd.DataFrame(m.T[:], columns=['C|kernel',"linear", "poly", "rbf"]) 
        print(pdObj)          
experimentII()         

('Archivo: ', 'data/heart.csv')
  C|kernel  linear    poly     rbf
0        3  0.7327  0.8020  0.7030
1        5  0.7327  0.8020  0.7030
2        7  0.7228  0.8020  0.7030
('Archivo: ', 'data/Iris.csv')
  C|kernel  linear    poly     rbf
0        3  0.3400  0.3600  0.3400
1        5  0.3400  0.3400  0.3400
2        7  0.3400  0.3400  0.3400
