In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn import utils
from sklearn.metrics import accuracy_score
#from sklearn.metrics import jaccard_score
import random

def Leer_Datos(filename,atrributes):
    data = pd.read_csv(filename, usecols=atrributes) 
    return data

In [2]:
def Normalizar_Datos(data):
    mean_ = data.mean(axis=0)
    std_ = data.std(axis=0) #estandar
    return (data - mean_)/std_

In [3]:
def sigmoid(x):
    return 1.0/ (1 + np.exp(-x))

In [4]:
def calculate_accuracy(X, y, W):
    """ accerted values respect to predict"""
    accerted = 0.0
    num = 5
    for i in range(0, num): #place to 5 can be X.shape[0]
        temp = list(W.values())[len(W)-1]
        temp = int(np.round_(temp[i]))
        if temp == y[i]:
            accerted += 1
    return accerted/num

In [5]:
def X_y(data, name_col):
    X = data.loc[:, data.columns != name_col]
    y = data[name_col]
    return X.values, y.values

def PrepareXandY(training,test):
    train_X = training[:,:-1]
    train_X = np.concatenate((np.ones([train_X.shape[0], 1]), train_X), axis=1)

    test_X = test[:,:-1]
    test_X = np.concatenate((np.ones([test_X.shape[0], 1]), test_X), axis=1)

    train_y = training[:,-1]
    test_y = test[:,-1]

    return train_X, train_y, test_X, test_y

In [47]:
def create_k_folds(data, k):
    np.random.shuffle(data)
    col_class = data[:,-1]
    #col_class = np.array([0,0,1,0,1,0,1,2,2,2,0])
    num_rows = col_class.shape[0]
    unique, counts = np.unique(col_class, return_counts=True)
    distribution = dict(zip(unique, counts))
   
    percent_per_class = {}

    classes = []
    for key in distribution:
        percent_per_class[key]=round((distribution[key] * 100) /  num_rows)
        classes.append(key)
   
    num_per_fold = round(num_rows/k)

    num_samples_by_class_per_fold = {}

    for key in percent_per_class:
        num_samples_by_class_per_fold[key]=round((num_per_fold * percent_per_class[key])/100)
    

    list_indices=[]
    count_classes={}

    for key in distribution:
        list_class = []
        for i in range(k):
            list_class.append(0)
        count_classes[key]=list_class
    
    for i in range(k):
        list_indices.append([])

    extra = 0
    for i in range(num_rows):
        added = False
        for key in distribution:
            if col_class[i] == key:
                for j in range(k):
                    if(count_classes[key][j]<num_samples_by_class_per_fold[key]):
                        count_classes[key][j]+=1
                        #print(list_indices[j],list_indices[j].count(0))
                        list_indices[j].append(i)
                        added = True
                        break
                if added:
                    break
        if not added:
            list_indices[extra].append(i)
            extra = (extra+1)%k
    
    return data, list_indices



In [7]:
def create_kfolds(data, k):
    np.random.shuffle(data)
    size_fold = int(data.shape[0] / k)
    _sz_fold = int(data.shape[0] % k)
    data = data[:data.shape[0]-_sz_fold,:]
    kfolds = []
    idx_row = 0
    for i in range(k):
        X, y = X_y(data[idx_row:idx_row+size_fold, :])
        kfolds.append({"X": X, "y" : y})
        idx_row += size_fold
    return kfolds, size_fold

def kfolds_cross_validation(data, k):
    for i in range(0, k):
        temp_test_data = k_folds[i]
        temp_train_data = np.delete(k_folds, i, axis=0)
        temp_train_data = temp_train_data.reshape(-1, temp_test_data.shape[1])
        np.random.shuffle(temp_train_data)

        x_train_set, y_train_set = get_x_y_data(temp_train_data)
        y_train_set = y_train_set.reshape(y_train_set.shape[0])

        x_test_set, y_test_set = get_x_y_data(temp_test_data)
        y_test_set = y_test_set.reshape(y_test_set.shape[0])

In [8]:
def calculate_cost_function(X, y, W):
    m = X.shape[0]
    pred = sigmoid(X)
    print('pred', pred)
    cross_entropy = np.sum( (y * np.log(pred)) + ((1-y) * np.log(1-pred)) )
    #print("1: ",np.dot(y.T, np.log(pred)))
    #print("2: ", np.dot((1-y).T, np.log(1-pred)))
    return (-1/m) * cross_entropy

In [9]:
def dS(D):
    return sigmoid(D)*(1.0-sigmoid(D))

In [10]:
def forward(X, W):
    A = {}
    i=0
    for key, value in W.items():
        sumt = np.matmul(X, value)
        y = sigmoid(sumt)
        A.setdefault(i, y)
        #other way: A['activation'+str(i)]=y
        i = i+1
        
    return A

In [11]:
def backward(X, A, y, W, learning_rate):
    newW = {}
    
    #last layer
    last_output = (list(A.values())[len(A)-1])
    real_error = y - last_output
    #print('real error: ', real_error)
    #delta = last_output * (1-last_output) * (1-last_output)
    delta = real_error * dS(1-last_output) 
    
    for key, value in W.items():
        #print('val: ', value)
        #print('val', value.shape[0], value.shape[1])
        mult = learning_rate * A[key] * delta
        result = value + mult
        #print(result)
        newW.setdefault(key, result)
    return newW

In [12]:
def calculate_gradient(X, y, W):
    m = y.shape[0]  
    return (1/m) * np.matmul(X.T, sigmoid(X, W.values()) - y)

def gradient_descent( X, y, W, nb_iterations, learning_rate):
    cost_history = np.zeros(nb_iterations)
    for i in range(nb_iterations):
        error_epoch = 0.0
        A = {}
        for j in range(0, X.shape[0]):
                A = forward(X[j], W)
                last_errors = np.divide(np.power(y - list(W.values())[len(W)-1], 2), 2)
                error_epoch += np.sum(last_errors)
                backward(X[j], A, y[j], W, learning_rate)
        #cost_history[i] = calculate_cost_function( X, y, W)
    return W, cost_history
                
def _gradient_descent( X, y, W, nb_iterations, learning_rate):
    '''Return the final theta vector and array of cost history over nro of iterations
    nb_iterations: or epochs
    '''
    m = y.shape[0]
    cost_history = np.zeros(nb_iterations)
    A = {}
    for i in range(nb_iterations):
        #prediction = X.dot(W.values())
        
        print('W.values()', W.values()[i])
        prediction = np.matmul(X,W.values()[i])
        print('prediction ',prediction.shape)
        print('y ', y.shape)
        
        if W.values()[len(W)-1]:
            prediction = np.matmul(X,W.values()[i])
            print(prediction - y)
            ro = (X.dot(prediction - y))
            print('ro ', ro)
            
            r = ((1/m) * learning_rate * (X.T.dot(prediction - y)))
        print('r ', r)
        W = W.values() - r
        #cost_history[i] = calculate_cost_function( X, y, W) 
    return W, cost_history   

In [13]:
def optional_create_training_test(data):
    np.random.shuffle(data)
    col = data.shape[1]-1
    k = len(data)
    X_train = data[:int((60* k) / 100), :col]
    y_train = data[:int((60* k) / 100), col]
    X_test = data[int((60* k) / 100):, :col]
    y_test = data[int((60* k) / 100):, col]
        
    return X_train, y_train, X_test, y_test

def create_training_test(data):
    num_rows = data.shape[0]
    train_percentage = 0.6
    row_split_data = int(num_rows * train_percentage)
    training, test = data[:row_split_data, :], data[row_split_data:, :]
    return training, test

In [14]:
def create_W(X, nb_neuron):
    return np.random.rand(X.shape[1], nb_neuron)

In [15]:

def AccSVMfold(fold,model):
    model.fit(fold[0], fold[1])
    pred_linear = model.predict(fold[2])
    return accuracy_score(fold[3],pred_linear)

def getAvgAccSMV(folds,svm_reg):
    accuracy = AccSVMfold(folds[0],svm_reg)
    accuracy1 = AccSVMfold(folds[1],svm_reg)
    accuracy2 = AccSVMfold(folds[2],svm_reg)

    return (accuracy + accuracy1 + accuracy2)/3

In [16]:
def GetTitanicData():
    files = [("../titanic_test.csv",["Sex","Age","Fare","Embarked"]), 
              ("../gender_submission.csv",["Survived"]),
              ("../titanic_train.csv",["Sex","Age","Fare","Embarked","Survived"])]
    
    f_name = files[0][0]
    atrributes = files[0][1]

    data = Leer_Datos(f_name,atrributes)

    data = data.replace(to_replace='female',value=1,regex=True)
    data = data.replace(to_replace='male',value=0,regex=True)
    data = data.replace(to_replace='C',value=0,regex=True)
    data = data.replace(to_replace='S',value=1,regex=True)
    data = data.replace(to_replace='Q',value=2,regex=True)


    f_name = files[1][0]
    atrributes = files[1][1]
    data2 = Leer_Datos(f_name,atrributes)

    data = np.concatenate((data, data2), axis=1)

    f_name = files[2][0]
    atrributes = files[2][1]
    data3 = Leer_Datos(f_name,atrributes)

    data3 = data3[["Sex","Age","Fare","Embarked","Survived"]]

    data3 = data3.replace(to_replace='female',value=1,regex=True)
    data3 = data3.replace(to_replace='male',value=0,regex=True)
    data3 = data3.replace(to_replace='C',value=0,regex=True)
    data3 = data3.replace(to_replace='S',value=1,regex=True)
    data3 = data3.replace(to_replace='Q',value=2,regex=True)

    data = np.concatenate((data, data3), axis=0)
    data = data[~np.isnan(data).any(axis=1)]
    data[:,1:3] = Normalizar_Datos(data[:,1:3])
    return data

def getIrisData():
    files = [("../Iris.csv",["SepalLengthCm","SepalWidthCm","PetalLengthCm","PetalWidthCm","Species"])]
    
    f_name = files[0][0]
    atrributes = files[0][1]
    
    data = Leer_Datos(f_name,atrributes)
    data = data.replace(to_replace='Iris-setosa',value=0,regex=True)
    data = data.replace(to_replace='Iris-versicolor',value=1,regex=True)
    data = data.replace(to_replace='Iris-virginica',value=2,regex=True)

    data = data[~np.isnan(data).any(axis=1)]
    data = data.values
    data[:,0:4] = Normalizar_Datos(data[:,0:4])
    return data

### EXPERIMENT I

In [19]:
def experimentI():
    data = GetTitanicData()
    k = 3
    data, indices = create_k_folds(data, k)
    
    fold1 = data[indices[0]]
    fold2 = data[indices[1]]
    fold3 = data[indices[2]]
    
    learning_rate = 0.01
    epochs = 100 #nb_iterations
    hidde_layers = [1, 2, 3]
    nb_neurons = [5, 6, 7]
    
    training1 = np.concatenate((fold1,fold2))
    training2 = np.concatenate((fold3,fold2))
    training3 = np.concatenate((fold3,fold1))
        
    train_X1, train_y1, test_X1, test_y1 = PrepareXandY(training1,fold3)
    train_X2, train_y2, test_X2, test_y2 = PrepareXandY(training2,fold1)
    train_X3, train_y3, test_X3, test_y3 = PrepareXandY(training3,fold2)
    
    result_tb = [hidde_layers]
    for layer in hidde_layers:
            W1 = {}
            W2 = {}
            W3 = {}
            hlayers = []
            for nb_neuron in nb_neurons:
                accuracy_test_total = 0.0
                countw = 0
                
                theta_test1 = np.zeros(train_X1.shape[1])
                theta_test2 = np.zeros(train_X2.shape[1])
                theta_test3 = np.zeros(train_X3.shape[1])
                
                for i in range(layer):
                        W1.setdefault(countw, create_W(train_X1, nb_neuron))
                        W2.setdefault(countw, create_W(train_X2, nb_neuron))
                        W3.setdefault(countw, create_W(train_X3, nb_neuron))
                        countw = countw + 1
                #capa salida
                W1.setdefault(countw, create_W(train_X1, 1))
                W2.setdefault(countw, create_W(train_X2, 1))
                W3.setdefault(countw, create_W(train_X3, 1))
                countw = countw + 1
                W1, cost_history1 = gradient_descent(train_X1, train_y1, W1, epochs, learning_rate)
                W2, cost_history1 = gradient_descent(train_X2, train_y2, W2, epochs, learning_rate)
                W3, cost_history1 = gradient_descent(train_X3, train_y3, W3, epochs, learning_rate)
                
                accuracy_test1 = calculate_accuracy(test_X1, test_y1, W1)
                accuracy_test2 = calculate_accuracy(test_X2, test_y2, W2)
                accuracy_test3 = calculate_accuracy(test_X3, test_y3, W3)
                
                accuracy_test_total = (accuracy_test1 + accuracy_test2 + accuracy_test3)/k
                hlayers.append("%.4f" % accuracy_test_total)
            result_tb.append(hlayers)    
    m = np.asarray(result_tb)
    pdObj = pd.DataFrame(m.T[:], columns=['HL|N','5','6','7']) 
    print(pdObj)
experimentI()       

UnboundLocalError: local variable 'k' referenced before assignment

In [18]:
def experimentI():
    k = 3
    data=getIrisData()
    
    data, indices = create_k_folds(data, k)

    fold1 = data[indices[0]]
    fold2 = data[indices[1]]
    fold3 = data[indices[2]]
    
    learning_rate = 0.00001
    epochs = 100 #nb_iterations
    hidde_layers = [1, 2, 3]
    nb_neurons = [5, 6, 7]
    k = 3
    
    training1 = np.concatenate((fold1,fold2))
    training2 = np.concatenate((fold3,fold2))
    training3 = np.concatenate((fold3,fold1))
        
    train_X1, train_y1, test_X1, test_y1 = PrepareXandY(training1,fold3)
    train_X2, train_y2, test_X2, test_y2 = PrepareXandY(training2,fold1)
    train_X3, train_y3, test_X3, test_y3 = PrepareXandY(training3,fold2)
    
    result_tb = [hidde_layers]
    for layer in hidde_layers:
            W1 = {}
            W2 = {}
            W3 = {}
            hlayers = []
            for nb_neuron in nb_neurons:
                accuracy_test_total = 0.0
                countw = 0
                
                theta_test1 = np.zeros(train_X1.shape[1])
                theta_test2 = np.zeros(train_X2.shape[1])
                theta_test3 = np.zeros(train_X3.shape[1])
                
                for i in range(layer):
                        W1.setdefault(countw, create_W(train_X1, nb_neuron))
                        W2.setdefault(countw, create_W(train_X2, nb_neuron))
                        W3.setdefault(countw, create_W(train_X3, nb_neuron))
                        countw = countw + 1
                #capa salida
                W1.setdefault(countw, create_W(train_X1, 1))
                W2.setdefault(countw, create_W(train_X2, 1))
                W3.setdefault(countw, create_W(train_X3, 1))
                countw = countw + 1
                W1, cost_history1 = gradient_descent(train_X1, train_y1, W1, epochs, learning_rate)
                W2, cost_history1 = gradient_descent(train_X2, train_y2, W2, epochs, learning_rate)
                W3, cost_history1 = gradient_descent(train_X3, train_y3, W3, epochs, learning_rate)
                
                accuracy_test1 = calculate_accuracy(test_X1, test_y1, W1)
                accuracy_test2 = calculate_accuracy(test_X2, test_y2, W2)
                accuracy_test3 = calculate_accuracy(test_X3, test_y3, W3)
                
                accuracy_test_total = (accuracy_test1 + accuracy_test2 + accuracy_test3)/k
                hlayers.append("%.4f" % accuracy_test_total)
            result_tb.append(hlayers)    
    m = np.asarray(result_tb)
    pdObj = pd.DataFrame(m.T[:], columns=['HL|N','5','6','7']) 
    print(pdObj)
experimentI()  

[['1' '2' '3']
 ['0.3333' '0.3333' '0.3333']
 ['0.2667' '0.2667' '0.2667']
 ['0.4000' '0.4000' '0.4000']]
  HL|N       5       6       7
0    1  0.3333  0.2667  0.4000
1    2  0.3333  0.2667  0.4000
2    3  0.3333  0.2667  0.4000


### EXPERIMENT II

In [52]:

def experimentII(function_data):
    k = 3
    data=function_data()
    data, indices = create_k_folds(data, k)
    
    fold1 = data[indices[0]]
    fold2 = data[indices[1]]
    fold3 = data[indices[2]]

    training1 = np.concatenate((fold1,fold2))
    training2 = np.concatenate((fold3,fold2))
    training3 = np.concatenate((fold3,fold1))
        
    train_X1, train_y1, test_X1, test_y1 = PrepareXandY(training1,fold3)
    train_X2, train_y2, test_X2, test_y2 = PrepareXandY(training2,fold1)
    train_X3, train_y3, test_X3, test_y3 = PrepareXandY(training3,fold2)

    folds = [(train_X1, train_y1, test_X1, test_y1),
    (train_X2, train_y2, test_X2, test_y2),
    (train_X3, train_y3, test_X3, test_y3)]

    kind_kernel = ["linear", "poly", "rbf"]
    setC = [1,3,5,100]
   
    m=[]
    for C in setC:
        row = [C]

        svm_reg_l = SVC(kernel='linear',C=C,gamma=1, decision_function_shape='ovo')
        avg_acc_linear = getAvgAccSMV(folds,svm_reg_l)

        row.append(avg_acc_linear)
            
        
        svm_reg_p = SVC(kernel='poly',C=C,degree=3,gamma=1,decision_function_shape='ovo')
        avg_acc_poly = getAvgAccSMV(folds,svm_reg_p)

        row.append(avg_acc_poly)
            
        svm_reg_g = SVC(kernel='rbf', C=C, gamma=1, decision_function_shape='ovo')
        avg_acc_gauss = getAvgAccSMV(folds,svm_reg_g) 
            
        row.append(avg_acc_gauss)
        m.append(row)

    m = np.asarray(m)
    #pdObj = pd.DataFrame(m.T[:], columns=['num','C','linear','poly','gauss']) 
    print(m)
    
print("TITANIC")               
experimentII(GetTitanicData)

print("IRIS")               
experimentII(getIrisData)

TITANIC
[[  1.           0.84951331   0.84951605   0.83896579]
 [  3.           0.84951331   0.84951605   0.83319373]
 [  5.           0.84951331   0.84951605   0.83126969]
 [100.           0.84951331   0.84951605   0.81402258]]
IRIS
[[  1.           0.96         0.96         0.96666667]
 [  3.           0.96666667   0.94666667   0.96666667]
 [  5.           0.97333333   0.94666667   0.96      ]
 [100.           0.95333333   0.94         0.94666667]]
