In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Pasos
- Cargar datos 
- Normalizar datos 
- Agregar una columna de unos
- Calcular la prediccion (theta tanspuesta por X vector) 
- Calcular el costo (error)
- Dos formas para calcular los parámetros (thetas)
    - Ecuacion normal (X entrenaminto * producto matricil-< inversa - Xt entrenamiento Y(años que vivere))
    - Gradiente Descendiente 

In [2]:
def Leer_Datos(filename,atrributes):
    data = pd.read_csv(filename, usecols=atrributes) 
    return data

In [3]:
def Normalizar_Datos(data):
    mean_ = data.mean(axis=0)
    std_ = data.std(axis=0) #estandar
    return (data - mean_)/std_

In [4]:
def Sigmoidal(theta,X):
    return 1 / (1 + np.exp( -np.dot(X,theta) ) )        
    #return 1 / (1 + np.exp( -np.dot(X,theta.T) ) )    

In [None]:
def train_test(data): 
    porcentage = 0.70
    np.random.shuffle(data)
    rows = int(porcentage * len(data))
    #int((70*len(data))/100)
    train = data[:rows, :]
    test = data[rows:, :]
    #X_train = data[:rows, :col]
    #y_train = data[:rows, col]
    #X_test = data[rows:, :col]
    #y_test = data[rows:, col]
    return train, test

In [7]:
def divide_X_y(data):
    col = data.shape[1]-1
    X = data[:, :col]
    y = data[:, col:]
    return X, y

In [7]:
# Add a column of ones (bias)
def add_ones(X_train, y_train, X_test, y_test):
    n_exa_train = len(y_train)
    n_exa_test = len(y_test)

    X_train = np.concatenate((np.ones([n_exa_train, 1]), X_train), axis=1)
    X_test = np.concatenate((np.ones([n_exa_test, 1]), X_test), axis=1)
    
    return X_train, X_test

In [None]:
def calcular_funcion_costo(X,y, theta):
    m = y.shape[0]
    predictions = Sigmoidal(theta,X)
    error = (y * np.log(predictions)) - ((1-y) * np.log(1-predictions))
    #error = (y.dot(np.log(predictions))) + ((1-y) * np.log(1-predictions))
    return -1/m * (np.sum(error))

In [None]:
def gradient_descent(X, y, theta, nro_iter, learning_rate): 
    m = X.shape[0] #nbr of training data
    #cost_history = np.empty(nro_iter, dtype=float) 
    cost_history = np.zeros(nro_iter) 
    for i in range(nro_iter):
        pred = Sigmoidal(theta, X)
        pred = pred - y   
        cost_history[i] = calcular_funcion_costo( X, y, theta) 
        theta = theta - (learning_rate * ((np.matmul(X.T,pred))/m))
        #theta = theta - (learning_rate * ((np.matmul(pred,X))/m))
    return theta, cost_history

In [10]:
def accuracy(X, y, theta):
    predict = Sigmoidal(theta, X)
    #print("predict: ",predict)
    probab_threshold = 0.5  
    predicted_classes = (predict >= probab_threshold)
    result = np.logical_xor(np.logical_not(predicted_classes), y)
    return np.sum(result) / y.shape[0]

In [41]:
def create_k_folds(data, k):
    np.random.shuffle(data)
    col_class = data[:,-1]
    #col_class = np.array([0,0,1,0,1,0,1,0,0,0])
    num_rows = col_class.shape[0]
    unique, counts = np.unique(col_class, return_counts=True)
    distribution = dict(zip(unique, counts))

    percent_of_first_class = round((distribution[0.0] * 100) /  num_rows)
   
    percent_of_second_class = round((distribution[1.0] * 100) / num_rows)
   
    num_per_fold = round(num_rows/k)
    
    num_first_class_per_fold = round((num_per_fold * percent_of_first_class)/100)
    num_second_class_per_fold = round((num_per_fold * percent_of_second_class)/100)

    list_indices=[]
    num_0s=[]
    num_1s=[]
    
    for i in range(k):
        list_indices.append([])
        num_0s.append(0)
        num_1s.append(0)

    for i in range(num_rows):
    
        if col_class[i] == 0.0:
            for j in range(k):
                if(num_0s[j]<num_first_class_per_fold):
                    num_0s[j]+=1
                    #print(list_indices[j],list_indices[j].count(0))
                    list_indices[j].append(i)
                    break

        else:
            for j in range(k):
                if(num_1s[j]<num_second_class_per_fold):
                    num_1s[j]+=1
                    list_indices[j].append(i)
                    break

    if num_rows%k!=0:
        list_indices[k-1].append(col_class.shape[0]-1)
    
    return list_indices
    
    #print(list_indices[0])
    #print(list_indices[1])
    #print(list_indices[2])

        
    



#### EXPERIMENTO I

In [40]:
def GD_find_parameters():
    files = [("Data_classification/weatherAUS.csv",["MinTemp","MaxTemp","RainToday"],["RainTomorrow"])]
    for f in files:
        f_name = f[0]
        atrributes = f[1]
        class_att = f[2]
        k = 3
        data = Leer_Datos(f_name,atrributes)
        data = data.replace(to_replace='Yes',value=1,regex=True)
        data = data.replace(to_replace='No',value=0,regex=True)

        data = data.values[~np.isnan(data).any(axis=1)]
        data[:,0:2] = Normalizar_Datos(data[:,0:2])
        
        indices = create_k_folds(data, k)
       

        fold1 = data[indices[0]]
        fold2 = data[indices[1]]
        fold3 = data[indices[2]]

        nb_iterations = [500,1000,1500,2000,2500,3000,3500]
        nb_its_label = nb_iterations.copy()
        learning_rate = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4]
        result_tb = [learning_rate]
        print("FILE: ", f)
        
        for nb_it in nb_iterations:
            rlearning_rate = []
            for learn_rate in learning_rate:
                #print("num_iter = ", nb_it)
                #print("learn_rate = ", learn_rate)
                accuracy_total = 0.0
                for i in range(0,len(set_X)):
                    X_train = np.zeros((set_X[i].shape[0], set_X[i].shape[1] - 1))
                    y_train = np.zeros((set_X[i].shape[0], 1))
                    X_test = np.c_[set_X[i], np.ones(set_X[i].shape[0])]        #bias
                    y_test = set_y[i]
                    for t in range(0,k):
                        if t!=i:
                            #print("set_X[t]: ",set_X[t])
                            X_train = np.c_[set_X[t], np.ones(set_X[t].shape[0])]        #bias
                            #print("X_train: ",X_train)
                            y_train = set_y[t]
                            #set_X_train.append(X_train)
                            #set_y_train.append(y_train)
                    theta = create_theta(X_train)
                    theta, cost_history = gradient_descent(X_train, y_train, theta, nb_it, learn_rate)
                    accuracy_train = calculate_accuracy(X_train, y_train, theta)
                    #print("accuracy train: ",accuracy_train) 
                    cost_train = calculate_cost_function(X_train, y_train, theta)
                    #print("Pesos de Gradiente descendiente: ", theta)
                    #print("Cost training: ", cost_train)
                    cost_test = calculate_cost_function(X_test, y_test, theta)
                    accuracy_test = calculate_accuracy(X_test, y_test, theta) 
                
                    accuracy_total += accuracy_test
                accuracy_total /= k
                rlearning_rate.append(accuracy_total)
                #print("Pesos de Gradiente descendiente: ", theta)
                #print("accuracy test: ",accuracy_test) 
                #print("Costo test: ", cost_test, "\n")
            result_tb.append(rlearning_rate)

        m = np.asarray(result_tb)
        pdObj = pd.DataFrame(m.T[:], columns=['TL|It','500','1000','1500','2000','2500','3000','3500']) 
        print(pdObj)
        
GD_find_parameters()

1
(46695, 4)
(46695, 4)
(46462, 4)


#### EXPERIMENTO II

One vs One

In [None]:
def multiclass1():
    #data = Leer_Datos('Iris.csv')
    filename = 'Iris.csv'
    data = pd.read_csv(filename)
    cant_x_class = data[data.columns[data.shape[1]-1]].value_counts()

    learn_rate = 0.1 # DEFAULT
    nb_iterations = 1000
    W_array = []    
    accuracy_array_test = []
    
    data = np.array(data)     
    X, y = divide_X_y(data)
    y_classes = np.unique(y)
    X_normalizada = Normalizar_Datos(X)
    data_normalizada = np.concatenate((X_normalizada, y), axis=1)
    train, test = train_test(data_normalizada) # verificar la division random de train and test
    categories_train = []
    for i in range(0, y_classes.shape[0]):
        category = train[train[:, train.shape[1]-1] == y_classes[i]]
        categories_train.append(category)   
    
    for i in range(0, y_classes.shape[0]):
        for j in range(i+1, y_classes.shape[0]):
            tmp_data = np.concatenate((categories_train[i], categories_train[j]), axis=0)
            #tmp_data = np.concatenate([np.expand_dims(i,axis=0) for i in [y_classes[i],y_classes[j]]])
            
            posiciones_a_cambiar = np.where(tmp_data == y_classes[i]) 
            tmp_data = np.c_[tmp_data[:, :tmp_data.shape[1]-1], np.zeros(tmp_data.shape[0])]
            tmp_data[posiciones_a_cambiar] = 1
            np.random.shuffle(tmp_data)
            
            set_train, set_test = train_test(tmp_data)
            X_train, y_train = divide_X_y(set_train)
            X_test, y_test = divide_X_y(set_test)
            y_train = np.reshape(y_train, y_train.shape[0])
            y_test = np.reshape(y_test, y_test.shape[0])
        
            X_train = np.c_[X_train, np.ones(X_train.shape[0])]     #bias
            X_test = np.c_[X_test, np.ones(X_test.shape[0])]        #bias
        
            theta = np.random.rand(X_train.shape[1])
            X_train = X_train.astype(float)
            y_train = y_train.astype(int)
            W, cost_history = gradient_descent(X_train, y_train, theta, nb_iterations, learn_rate)
            W_array.append(W)
            
            X_test = X_test.astype('float')
            y_test = y_test.astype(int)
            accuracy_test = accuracy(X_test, y_test, W)
            accuracy_array_test.append(accuracy_test)
            
    #print(W_array)
    print('Iris-setosa vs Iris-versicolor, Iris-setosa vs Iris-virginica, Iris-versicolor vs virginica')
    print(accuracy_array_test)
    
multiclass1()

In [238]:
def multiclass1_cross_validation():
    filename = 'Iris.csv'
    data = pd.read_csv(filename)

    learn_rate = 0.1 # DEFAULT
    nb_iterations = 1000
    W_array = [] 
    accuracy_array_test = []
    k = 3

    X, y = divide_X_y(data.values)
    y_classes = np.unique(y)
    X_normalizada = Normalizar_Datos(X)
    data_normalizada  = np.concatenate((X_normalizada , y), axis=1)
    train, test = train_test(data_normalizada) # verificar la division random de train and test
    categories_train = []
    for i in range(0, y_classes.shape[0]):
        category = train[train[:, train.shape[1]-1] == y_classes[i]]
        categories_train.append(category)  
    # k_set_X[0], k_set_y[0] return first element of e/array
    #k_set_X, k_set_y = create_k_folds(pd.DataFrame(data_normalizada), k)    
    k_set = create_k_folds(pd.DataFrame(data_normalizada), k)
    
    for c in range(y_classes.shape[0]):
        for c1 in range(c+1, y_classes.shape[0]):
            tmp_data = np.concatenate((categories_train[c], categories_train[c1]), axis=0)
            
            total_accuracy_test = 0
            for i_test in range(0, k): 
                X_test = np.zeros ( (0, np.size(k_set[0]['X'])) )
                y_test = np.zeros ( (0, np.size(k_set[0]['y'])) )
                X_train = np.zeros( (0, (np.size(k_set[0]['X'])) * (y_classes.shape[0]-1)) )
                y_train = np.zeros( (0, (np.size(k_set[0]['y'])) * (y_classes.shape[0]-1)) )
                
                for j in range(0, k): 
                    if (i_test == j):
                        X_test = k_set[i_test]['X']
                        y_test = k_set[i_test]['y'] == y_classes[c]
                        #y_train[ np.where( k_set[i_test]['y'] == y_classes[c]) ] = 1
                    else:
                        X_train = k_set[j]['X']
                        y_train = k_set[j]['y'] == y_classes[c]  
                
                y_train = np.reshape(y_train, y_train.shape[0])
                y_test = np.reshape(y_test, y_test.shape[0])

                X_train = np.c_[X_train, np.ones(X_train.shape[0])]     #bias
                X_test = np.c_[X_test, np.ones(X_test.shape[0])]        #bias

                theta = np.random.rand(np.size(X_train[0]))
                X_train = X_train.astype(float)
                y_train = y_train.astype(int)
                W, cost_history = gradient_descent(X_train, y_train, theta, nb_iterations, learn_rate)
                W_array.append(W)

                X_test = X_test.astype('float')
                y_test = y_test.astype(int)
                accuracy_test = accuracy(X_test, y_test, W)
                total_accuracy_test += accuracy_test

            total_accuracy_test = total_accuracy_test / k
            accuracy_array_test.append(accuracy_test)

    print('Iris-setosa, Iris-versicolor, Iris-virginica')
    print(accuracy_array_test)
    
multiclass1_cross_validation()    

Iris-setosa, Iris-versicolor, Iris-virginica
[1.0, 1.0, 0.7]


One vs All

In [169]:
def multiclass2():
    filename = 'Iris.csv'
    data = pd.read_csv(filename)

    learn_rate = 0.1 # DEFAULT
    nb_iterations = 1000
    data = data.sample(frac=1)
    k = 3

    X, y = divide_X_y(data.values)
    X_normalizada = Normalizar_Datos(X)
    data_normalizada  = np.concatenate((X_normalizada , y), axis=1)
    train, test = train_test(data_normalizada) 

    W_array = [] 
    y_classes = np.unique(y)
    accuracy_array_test = []

    for c in range(y_classes.shape[0]):  
        tmp_data = np.c_[train[:, :train.shape[1]-1], np.zeros(train.shape[0])]
        tmp_data[np.where(train == y_classes[c])] = 1
        
        set_train, set_test = train_test(tmp_data)
        X_train, y_train = divide_X_y(set_train)
        X_test, y_test = divide_X_y(set_test)
        y_train = np.reshape(y_train, y_train.shape[0])
        y_test = np.reshape(y_test, y_test.shape[0])
        
        X_train = np.c_[X_train, np.ones(X_train.shape[0])]     #bias
        X_test = np.c_[X_test, np.ones(X_test.shape[0])]        #bias
        
        theta = np.random.rand(X_train.shape[1])
        X_train = X_train.astype(float)
        y_train = y_train.astype(int)
        W, cost_history = gradient_descent(X_train, y_train, theta, nb_iterations, learn_rate)
        W_array.append(W)
            
        X_test = X_test.astype('float')
        y_test = y_test.astype(int)
        accuracy_test = accuracy(X_test, y_test, W)
        accuracy_array_test.append(accuracy_test)
        
    print('Iris-setosa, Iris-versicolor, Iris-virginica')
    print(accuracy_array_test)
    
multiclass2()    

Iris-setosa, Iris-versicolor, Iris-virginica
[1.0, 0.78125, 0.96875]


In [227]:
def multiclass2_cross_validation():
    filename = 'Iris.csv'
    data = pd.read_csv(filename)

    learn_rate = 0.1 # DEFAULT
    nb_iterations = 1000
    data = data.sample(frac=1)
    k = 3

    X, y = divide_X_y(data.values)
    X_normalizada = Normalizar_Datos(X)
    data_normalizada  = np.concatenate((X_normalizada , y), axis=1)
        
    k_set = create_k_folds(pd.DataFrame(data_normalizada), k)
   
    W_array = [] 
    y_classes = np.unique(y)
    accuracy_array_test = []
    
    for c in range(y_classes.shape[0]):
        total_accuracy_test = 0
        for i_test in range(0, k): 
            X_test = np.zeros ( (0, np.size(k_set[0]['X'])) )
            y_test = np.zeros ( (0, np.size(k_set[0]['y'])) )
            X_train = np.zeros( (0, (np.size(k_set[0]['X'])) * (y_classes.shape[0]-1)) )
            y_train = np.zeros( (0, (np.size(k_set[0]['y'])) * (y_classes.shape[0]-1)) )
     
            for j in range(0, k): 
                if (i_test == j):
                    X_test = k_set[i_test]['X']
                    y_test = k_set[i_test]['y'] == y_classes[c]
                    #y_train[ np.where( k_set[i_test]['y'] == y_classes[c]) ] = 1
                else:
                    X_train = k_set[j]['X']
                    y_train = k_set[j]['y'] == y_classes[c]                   

            y_train = np.reshape(y_train, y_train.shape[0])
            y_test = np.reshape(y_test, y_test.shape[0])
            
            X_train = np.c_[X_train, np.ones(X_train.shape[0])]     #bias
            X_test = np.c_[X_test, np.ones(X_test.shape[0])]        #bias
          
            theta = np.random.rand(np.size(X_train[0]))
            X_train = X_train.astype(float)
            y_train = y_train.astype(int)
            W, cost_history = gradient_descent(X_train, y_train, theta, nb_iterations, learn_rate)
            W_array.append(W)

            X_test = X_test.astype('float')
            y_test = y_test.astype(int)
            accuracy_test = accuracy(X_test, y_test, W)
            total_accuracy_test += accuracy_test
            
        total_accuracy_test = total_accuracy_test / k
        accuracy_array_test.append(accuracy_test)

    print('Iris-setosa, Iris-versicolor, Iris-virginica')
    print(accuracy_array_test)
    
multiclass2_cross_validation()    

Iris-setosa, Iris-versicolor, Iris-virginica
[1.0, 0.7, 0.9533333333333334]
