In [156]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Pasos
- Cargar datos 
- Normalizar datos 
- Agregar una columna de unos
- Calcular la prediccion (theta tanspuesta por X vector) 
- Calcular el costo (error)
- Dos formas para calcular los parámetros (thetas)
    - Ecuacion normal (X entrenaminto * producto matricil-< inversa - Xt entrenamiento Y(años que vivere))
    - Gradiente Descendiente 

In [157]:
def Leer_Datos(filename):
    pd = pd.read_csv(filename, delim_whitespace=True)
    return np.array(pd)

In [158]:
def Normalizar_Datos(data):
    mean_ = np.mean(data) 
    std_ = np.std(data) #data.std(axis=0) #estandar
    return (data - mean_)/std_

In [159]:
def Sigmoidal(theta,X):
    return 1 / (1 + np.exp( -np.dot(X,theta) ) )        
    #return 1 / (1 + np.exp( -np.dot(X,theta.T) ) )    

In [160]:
def train_test(data): 
    porcentage = 0.70
    np.random.shuffle(data)
    rows = int(porcentage * len(data))
    #int((70*len(data))/100)
    train = data[:rows, :]
    test = data[rows:, :]
    #X_train = data[:rows, :col]
    #y_train = data[:rows, col]
    #X_test = data[rows:, :col]
    #y_test = data[rows:, col]
    return train, test

In [161]:
# Add a column of ones (bias)
def add_ones(X_train, y_train, X_test, y_test):
    n_exa_train = len(y_train)
    n_exa_test = len(y_test)

    X_train = np.concatenate((np.ones([n_exa_train, 1]), X_train), axis=1)
    X_test = np.concatenate((np.ones([n_exa_test, 1]), X_test), axis=1)
    
    return X_train, X_test

In [162]:
def calcular_funcion_costo(X,y, theta):
    m = y.shape[0]
    predictions = Sigmoidal(theta,X)
    error = (y * np.log(predictions)) - ((1-y) * np.log(1-predictions))
    #error = (y.dot(np.log(predictions))) + ((1-y) * np.log(1-predictions))
    return -1/m * (np.sum(error))

In [163]:
def gradient_descent(X, y, theta, nro_iter, learning_rate): 
    m = X.shape[0] #nbr of training data
    #cost_history = np.empty(nro_iter, dtype=float) 
    cost_history = np.zeros(nro_iter) 
    for i in range(nro_iter):
        pred = Sigmoidal(theta, X)
        pred = pred - y   
        cost_history[i] = calcular_funcion_costo( X, y, theta) 
        theta = theta - (learning_rate * ((np.matmul(X.T,pred))/m))
        #theta = theta - (learning_rate * ((np.matmul(pred,X))/m))
    return theta, cost_history

#### EXPERIMENTO III

One vs One

In [185]:
def create_k_folds(data, k):
    name_col = data.columns[data.shape[1]-1]
    cant_x_class = data[name_col].value_counts()
    
    data = np.array(data) 
    #X, y = divide_X_y(data)
    i = 0   
    k_fold = []
    for c in cant_x_class:
        while ( c%k != 0 ):
            c=c-1
        #k_data_X.append(np.split(X[i:c+i, :], k))
        #k_data_y.append(np.split(y[i:c+i, :], k))
        
        k_data = np.split(data[i:c+i, :], k)
        X, y = divide_X_y(data)
        k_fold.append({"X": X, "y" : y})
        i = c+i
    #return k_data_X, k_data_y    
    return k_fold

In [165]:
def accuracy(X, y, theta):
    predict = Sigmoidal(theta, X)
    #print("predict: ",predict)
    probab_threshold = 0.5  
    predicted_classes = (predict >= probab_threshold)
    result = np.logical_xor(np.logical_not(predicted_classes), y)
    return np.sum(result) / y.shape[0]

In [166]:
def divide_X_y(data):
    col = data.shape[1]-1
    X = data[:, :col]
    y = data[:, col:]
    return X, y

In [222]:
def multiclass1():
    #data = Leer_Datos('Iris.csv')
    filename = 'Iris.csv'
    data = pd.read_csv(filename)
    cant_x_class = data[data.columns[data.shape[1]-1]].value_counts()

    learn_rate = 0.1 # DEFAULT
    nb_iterations = 1000
    W_array = []    
    accuracy_array_test = []
    
    data = np.array(data)     
    X, y = divide_X_y(data)
    y_classes = np.unique(y)
    X_normalizada = Normalizar_Datos(X)
    data_normalizada = np.concatenate((X_normalizada, y), axis=1)
    train, test = train_test(data_normalizada) # verificar la division random de train and test
    categories_train = []
    for i in range(0, y_classes.shape[0]):
        category = train[train[:, train.shape[1]-1] == y_classes[i]]
        categories_train.append(category)   
    
    for i in range(0, y_classes.shape[0]):
        for j in range(i+1, y_classes.shape[0]):
            tmp_data = np.concatenate((categories_train[i], categories_train[j]), axis=0)
            #tmp_data = np.concatenate([np.expand_dims(i,axis=0) for i in [y_classes[i],y_classes[j]]])
            
            posiciones_a_cambiar = np.where(tmp_data == y_classes[i]) 
            tmp_data = np.c_[tmp_data[:, :tmp_data.shape[1]-1], np.zeros(tmp_data.shape[0])]
            tmp_data[posiciones_a_cambiar] = 1
            np.random.shuffle(tmp_data)
            
            set_train, set_test = train_test(tmp_data)
            X_train, y_train = divide_X_y(set_train)
            X_test, y_test = divide_X_y(set_test)
            y_train = np.reshape(y_train, y_train.shape[0])
            y_test = np.reshape(y_test, y_test.shape[0])
        
            X_train = np.c_[X_train, np.ones(X_train.shape[0])]     #bias
            X_test = np.c_[X_test, np.ones(X_test.shape[0])]        #bias
        
            theta = np.random.rand(X_train.shape[1])
            X_train = X_train.astype(float)
            y_train = y_train.astype(int)
            W, cost_history = gradient_descent(X_train, y_train, theta, nb_iterations, learn_rate)
            W_array.append(W)
            
            X_test = X_test.astype('float')
            y_test = y_test.astype(int)
            accuracy_test = accuracy(X_test, y_test, W)
            accuracy_array_test.append(accuracy_test)
            
    #print(W_array)
    print('Iris-setosa vs Iris-versicolor, Iris-setosa vs Iris-virginica, Iris-versicolor vs virginica')
    print(accuracy_array_test)
    
multiclass1()

Iris-setosa vs Iris-versicolor, Iris-setosa vs Iris-virginica, Iris-versicolor vs virginica
[1.0, 1.0, 0.9523809523809523]


In [231]:
def multiclass1_cross_validation():
    filename = 'Iris.csv'
    data = pd.read_csv(filename)

    learn_rate = 0.1 # DEFAULT
    nb_iterations = 1000
    W_array = [] 
    accuracy_array_test = []
    k = 3

    X, y = divide_X_y(data.values)
    y_classes = np.unique(y)
    X_normalizada = Normalizar_Datos(X)
    data_normalizada  = np.concatenate((X_normalizada , y), axis=1)
    train, test = train_test(data_normalizada) # verificar la division random de train and test
    categories_train = []
    for i in range(0, y_classes.shape[0]):
        category = train[train[:, train.shape[1]-1] == y_classes[i]]
        categories_train.append(category)  
        
    # k_set_X[0], k_set_y[0] return first element of e/array
    #k_set_X, k_set_y = create_k_folds(pd.DataFrame(data_normalizada), k)    
    #k_set = create_k_folds(pd.DataFrame(data_normalizada), k)
    
    for c in range(y_classes.shape[0]):
        for c1 in range(i+1, y_classes.shape[0]):
            tmp_data = np.concatenate((categories_train[c], categories_train[c1]), axis=0)
            k_set = create_k_folds(pd.DataFrame(tmp_data), k)
            print('...')
            total_accuracy_test = 0
            for i_test in range(0, k): 
                X_test = np.zeros ( (0, np.size(k_set[0]['X'])) )
                y_test = np.zeros ( (0, np.size(k_set[0]['y'])) )
                X_train = np.zeros( (0, (np.size(k_set[0]['X'])) * (y_classes.shape[0]-1)) )
                y_train = np.zeros( (0, (np.size(k_set[0]['y'])) * (y_classes.shape[0]-1)) )
                print(',,,,,,,,,,')
                print(X_train)
                print(y_train)
                for j in range(0, k): 
                    if (i_test == j):
                        X_test = k_set[i_test]['X']
                        y_test = k_set[i_test]['y'] == y_classes[c]
                        #y_train[ np.where( k_set[i_test]['y'] == y_classes[c]) ] = 1
                    else:
                        X_train = k_set[j]['X']
                        y_train = k_set[j]['y'] == y_classes[c]  
                print(',,,,,,,,,,')
                print(X_train)
                print(y_train)
                y_train = np.reshape(y_train, y_train.shape[0])
                y_test = np.reshape(y_test, y_test.shape[0])

                X_train = np.c_[X_train, np.ones(X_train.shape[0])]     #bias
                X_test = np.c_[X_test, np.ones(X_test.shape[0])]        #bias

                theta = np.random.rand(np.size(X_train[0]))
                X_train = X_train.astype(float)
                y_train = y_train.astype(int)
                W, cost_history = gradient_descent(X_train, y_train, theta, nb_iterations, learn_rate)
                W_array.append(W)

                X_test = X_test.astype('float')
                y_test = y_test.astype(int)
                accuracy_test = accuracy(X_test, y_test, W)
                total_accuracy_test += accuracy_test

            total_accuracy_test = total_accuracy_test / k
            accuracy_array_test.append(accuracy_test)

    print('Iris-setosa, Iris-versicolor, Iris-virginica')
    print(accuracy_array_test)
    
multiclass1_cross_validation()    

Iris-setosa, Iris-versicolor, Iris-virginica
[]


One vs All

In [169]:
def multiclass2():
    filename = 'Iris.csv'
    data = pd.read_csv(filename)

    learn_rate = 0.1 # DEFAULT
    nb_iterations = 1000
    data = data.sample(frac=1)
    k = 3

    X, y = divide_X_y(data.values)
    X_normalizada = Normalizar_Datos(X)
    data_normalizada  = np.concatenate((X_normalizada , y), axis=1)
    train, test = train_test(data_normalizada) 

    W_array = [] 
    y_classes = np.unique(y)
    accuracy_array_test = []

    for c in range(y_classes.shape[0]):  
        tmp_data = np.c_[train[:, :train.shape[1]-1], np.zeros(train.shape[0])]
        tmp_data[np.where(train == y_classes[c])] = 1
        
        set_train, set_test = train_test(tmp_data)
        X_train, y_train = divide_X_y(set_train)
        X_test, y_test = divide_X_y(set_test)
        y_train = np.reshape(y_train, y_train.shape[0])
        y_test = np.reshape(y_test, y_test.shape[0])
        
        X_train = np.c_[X_train, np.ones(X_train.shape[0])]     #bias
        X_test = np.c_[X_test, np.ones(X_test.shape[0])]        #bias
        
        theta = np.random.rand(X_train.shape[1])
        X_train = X_train.astype(float)
        y_train = y_train.astype(int)
        W, cost_history = gradient_descent(X_train, y_train, theta, nb_iterations, learn_rate)
        W_array.append(W)
            
        X_test = X_test.astype('float')
        y_test = y_test.astype(int)
        accuracy_test = accuracy(X_test, y_test, W)
        accuracy_array_test.append(accuracy_test)
        
    print('Iris-setosa, Iris-versicolor, Iris-virginica')
    print(accuracy_array_test)
    
multiclass2()    

Iris-setosa, Iris-versicolor, Iris-virginica
[1.0, 0.78125, 0.96875]


In [227]:
def multiclass2_cross_validation():
    filename = 'Iris.csv'
    data = pd.read_csv(filename)

    learn_rate = 0.1 # DEFAULT
    nb_iterations = 1000
    data = data.sample(frac=1)
    k = 3

    X, y = divide_X_y(data.values)
    X_normalizada = Normalizar_Datos(X)
    data_normalizada  = np.concatenate((X_normalizada , y), axis=1)
        
    k_set = create_k_folds(pd.DataFrame(data_normalizada), k)
   
    W_array = [] 
    y_classes = np.unique(y)
    accuracy_array_test = []
    
    for c in range(y_classes.shape[0]):
        total_accuracy_test = 0
        for i_test in range(0, k): 
            X_test = np.zeros ( (0, np.size(k_set[0]['X'])) )
            y_test = np.zeros ( (0, np.size(k_set[0]['y'])) )
            X_train = np.zeros( (0, (np.size(k_set[0]['X'])) * (y_classes.shape[0]-1)) )
            y_train = np.zeros( (0, (np.size(k_set[0]['y'])) * (y_classes.shape[0]-1)) )
     
            for j in range(0, k): 
                if (i_test == j):
                    X_test = k_set[i_test]['X']
                    y_test = k_set[i_test]['y'] == y_classes[c]
                    #y_train[ np.where( k_set[i_test]['y'] == y_classes[c]) ] = 1
                else:
                    X_train = k_set[j]['X']
                    y_train = k_set[j]['y'] == y_classes[c]                   

            y_train = np.reshape(y_train, y_train.shape[0])
            y_test = np.reshape(y_test, y_test.shape[0])
            
            X_train = np.c_[X_train, np.ones(X_train.shape[0])]     #bias
            X_test = np.c_[X_test, np.ones(X_test.shape[0])]        #bias
          
            theta = np.random.rand(np.size(X_train[0]))
            X_train = X_train.astype(float)
            y_train = y_train.astype(int)
            W, cost_history = gradient_descent(X_train, y_train, theta, nb_iterations, learn_rate)
            W_array.append(W)

            X_test = X_test.astype('float')
            y_test = y_test.astype(int)
            accuracy_test = accuracy(X_test, y_test, W)
            total_accuracy_test += accuracy_test
            
        total_accuracy_test = total_accuracy_test / k
        accuracy_array_test.append(accuracy_test)

    print('Iris-setosa, Iris-versicolor, Iris-virginica')
    print(accuracy_array_test)
    
multiclass2_cross_validation()    

Iris-setosa, Iris-versicolor, Iris-virginica
[1.0, 0.7, 0.9533333333333334]


#### EXPERIMENTO I

In [None]:
def GD_find_parameters():
    fdata = load_data('petrol_consumption.csv')
    data = normalization(fdata)
    data = data.values
    
    X_train, y_train, X_test, y_test = train_test(data,data.shape[1]-1)
    X_train, X_test = add_ones(X_train, y_train, X_test, y_test)

    n_features = X_train.shape[1]
    theta = np.zeros(n_features)
    #theta = np.random.rand(n_features)

    epochs = [2,10,50,100] 
    learn_rates = [ 0.1,  0.5 , 1]
    
    result_train = np.empty([len(learn_rates),len(epochs)])
    result_test = np.empty([len(learn_rates),len(epochs)])
    for epoch in epochs:
        for learn_rate in learn_rates:
            theta_gd, cost_history = gradient_descent(X_train, y_train, theta, epoch, learn_rate)
            error_train = calculate_cost(X_train, y_train, theta_gd)
            error_test = calculate_cost(X_test, y_test, theta_gd)

            print("result_train e ", epoch , "lr ", learn_rate,":", error_train)
            
            result_train = error_train
            result_test = error_test
        print("\n")
    #pdObj = pd.DataFrame(result_train, index = learn_rates, columns = epochs) 
    #pdObj1 = pd.DataFrame(result_test, index = learn_rates, columns = epochs) 
    #return pdObj
GD_find_parameters()

In [None]:
def GD_ploteo():
    fdata = load_data('petrol_consumption.csv')
    data = normalization(fdata)
    data = data.values
    X_train, y_train, X_test, y_test = train_test(data,data.shape[1]-1)
    X_train, X_test = add_ones(X_train, y_train, X_test, y_test)

    n_features = X_train.shape[1]
    #theta = np.zeros(n_features)
    theta = np.random.rand(n_features)

    epochs = [1500,1800,11100,114000,11700,12000,12300,12600,12900,13200]  
    learn_rate = 0.0005

    for epoch in epochs:
        theta_gd, cost_history = gradient_descent(X_train, y_train, theta, epoch, learn_rate)
        theta_gd1, cost_history1 = gradient_descent(X_test, y_test, theta, epoch, learn_rate)
        error_train = calculate_cost(X_train, y_train, theta_gd)
        error_test = calculate_cost(X_test, y_test, theta_gd)
        
        fig, ax = plt.subplots()
        plt.plot(range(len(cost_history)), cost_history)
        plt.plot(range(len(cost_history1)), cost_history1)
        plt.title('TRAIN - TEST '+str(epoch), {'fontsize':10})
        print("Weights of gradient_descent - training data: ", theta_gd, "\n")
        print("Weights of gradient_descent - testing data: ", theta_gd1, "\n")
        ax.grid(True)
        plt.show()
GD_ploteo()

In [None]:
def multiclass2():
    filename = 'Iris.csv'
    data = pd.read_csv(filename)

    learn_rate = 0.1
    num_iter = 10000
    data = data.sample(frac=1)
    k = 3

    X, y = divide_X_y(data.values)
    X_normalizada = Normalizar_Datos(X)
    data_normalizada  = np.concatenate((X_normalizada , y), axis=1)
    k_set_X, k_set_y = crear_k_folds(data_normalizada, k)
    #k_folds, size_fold = crear_k_folds(data_normalizada, k)

    W_vec = []
    y_values = np.unique(y)
    acc_test_total_vec = []

    for l in range(y_values.shape[0]):
        acc_test_total = 0.0
        for i in range(k):
            X_train = np.zeros((size_fold * (k-1), norm_data.shape[1] - 1))
            X_test = np.zeros((size_fold, norm_data.shape[1] - 1))
            y_train = np.zeros((size_fold * (k-1), 1))
            y_test = np.zeros((size_fold, 1))

            count_sz_fold = 0
            for j in range(k):
                if j == i:
                    X_test = k_folds[i]['X']
                    y_test = k_folds[i]['y'] == y_values[l]
                else:
                    X_train[count_sz_fold:count_sz_fold+size_fold, :] = k_folds[j]['X']
                    y_train[count_sz_fold:count_sz_fold+size_fold, :] = k_folds[j]['y'] == y_values[l]
                    count_sz_fold += size_fold

            y_train = np.reshape(y_train, y_train.shape[0])
            y_test = np.reshape(y_test, y_test.shape[0])

            X_train = np.c_[X_train, np.ones(X_train.shape[0])]     #bias
            X_test = np.c_[X_test, np.ones(X_test.shape[0])]        #bias
            W = Crear_Pesos(X_train)
            W, costs = Gradiente_Descendiente(X_train, y_train, W, num_iter, learn_rate)
            X_test = X_test.astype('float')
            acc_test = Calcular_Accuraccy(X_test, y_test, W)
            acc_test_total += acc_test

        acc_test_total /= k
        acc_test_total_vec.append(acc_test_total)

    print("Clasificación multiclase 'uno vs todos'")
    print("Accuracy en los datos de prueba para 'Iris-setosa' 'Iris-versicolor' 'Iris-virginica' respectivamente:")
    print(acc_test_total_vec)
    
multiclass2()    

In [155]:
 category_1 = train[train[:, train.shape[1]-1] == y_classes[0]]
    category_2 = train[train[:, train.shape[1]-1] == y_classes[1]]
    category_3 = train[train[:, train.shape[1]-1] == y_classes[2]]
    s = np.array([category_1, category_2, category_3])
    
    print('s: ', s)

IndentationError: unexpected indent (<ipython-input-155-7acb32f81086>, line 2)