In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Pasos
- Cargar datos 
- Normalizar datos 
- Agregar una columna de unos
- Calcular la predicción: $h_\theta(x) = \theta^T x$ (theta transpuesta por el vector $x$)
- Calcular el costo (error)
- Dos formas para calcular los parámetros (thetas)
    - Ecuación normal: $\theta = (X^T X)^{-1} X^T y$, donde $X$ son los datos de entrenamiento, $X^T$ su transpuesta, $(\cdot)^{-1}$ la inversa e $y$ el vector objetivo (p. ej., los años que viviré)
    - Descenso de gradiente

In [2]:
def load_data(filename):
    """Read a whitespace-delimited text table into a DataFrame.

    Parameters
    ----------
    filename : str, path-like or file-like
        Forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One column per whitespace-separated field.
    """
    # ``delim_whitespace=True`` is deprecated since pandas 2.2; the regex
    # separator below is the documented drop-in replacement.
    return pd.read_csv(filename, sep=r'\s+')

In [3]:
def normalization(data):
    """Standardize each column: subtract its mean and divide by its std.

    Parameters
    ----------
    data : pandas.DataFrame or numpy.ndarray
        Table whose columns (axis 0 statistics) are standardized. The
        standard deviation is whatever ``data.std(axis=0)`` returns, so
        pandas and numpy inputs use their respective library defaults.

    Returns
    -------
    Same container type as ``data``
        ``(data - mean) / std`` per column. Columns with zero standard
        deviation come back as all zeros instead of NaN/inf.
    """
    mean_ = data.mean(axis=0)
    std_ = data.std(axis=0)  # standard deviation
    # A constant column has std == 0; dividing by it would produce NaN/inf
    # that propagates into every later matrix product. Bump those entries to
    # 1 so the (already zero) centred values are returned unchanged.
    std_ = std_ + (std_ == 0).astype(float)
    return (data - mean_) / std_

In [4]:
def train_test(data,col):
    """Split a 2-D array row-wise into train (first 70%) and test (rest).

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; rows are examples.
    col : int
        Index of the target column. Columns before it are the features.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    # Row index where the training portion ends (70% of the rows, truncated).
    cut = int((70*len(data))/100)
    head, tail = data[:cut], data[cut:]
    return head[:, :col], head[:, col], tail[:, :col], tail[:, col]

In [5]:
# Prepend the bias (intercept) column of ones to both feature matrices
def add_ones(X_train, y_train, X_test, y_test):
    """Return copies of X_train and X_test with a leading column of ones.

    The number of rows of each ones-column is taken from the length of the
    matching target vector (``y_train`` / ``y_test``).

    Returns
    -------
    tuple
        ``(X_train_with_bias, X_test_with_bias)``.
    """
    bias_train = np.ones([len(y_train), 1])
    bias_test = np.ones([len(y_test), 1])
    return (
        np.concatenate((bias_train, X_train), axis=1),
        np.concatenate((bias_test, X_test), axis=1),
    )

In [6]:
def prediction(theta, X):
    """Evaluate the linear hypothesis: the dot product of ``X`` with ``theta``.

    Parameters
    ----------
    theta : array-like
        Parameter vector.
    X : array-like
        Design matrix (or a single feature vector).

    Returns
    -------
    Result of ``np.dot(X, theta)``.
    """
    hypothesis = np.dot(X, theta)
    return hypothesis

In [7]:
def calculate_cost(X, y, theta):
    """Half mean-squared-error cost: J(theta) = 1/(2m) * sum((h(x) - y)^2).

    Parameters
    ----------
    X : numpy.ndarray
        Design matrix; ``X.shape[0]`` is the number of examples ``m``.
    y : array-like
        Target values.
    theta : array-like
        Parameter vector passed to ``prediction``.

    Returns
    -------
    The scalar cost.
    """
    n_samples = X.shape[0]  # number of examples
    residuals = prediction(theta, X) - y
    return (1/(2*n_samples)) * np.sum(np.square(residuals))

In [8]:
def gradient_descent(X, y, theta, nro_iter, learning_rate):
    """Run batch gradient descent for a fixed number of iterations.

    Parameters
    ----------
    X : numpy.ndarray
        Design matrix; ``X.shape[0]`` is the number of examples.
    y : array-like
        Target values.
    theta : array-like
        Starting parameter vector (not modified; a new array is built on
        every update).
    nro_iter : int
        Number of update steps.
    learning_rate : float
        Step size applied to the averaged gradient.

    Returns
    -------
    tuple
        ``(theta, cost_history)`` where ``cost_history[i]`` is the cost
        measured right after the i-th update.
    """
    n_samples = X.shape[0]  # number of examples
    cost_history = np.zeros(nro_iter)
    for step in range(nro_iter):
        residuals = prediction(theta, X) - y
        gradient = np.dot(X.T, residuals)/n_samples
        theta = theta - (learning_rate * gradient)
        cost_history[step] = calculate_cost(X, y, theta)
    return theta, cost_history

In [9]:
def normal_equation(X, y):
    """Closed-form least-squares fit: the theta minimizing ||X theta - y||^2.

    For a full-rank ``X`` this equals ``(X^T X)^-1 X^T y``. It is computed
    with ``np.linalg.lstsq`` rather than by forming and inverting ``X^T X``,
    because the explicit inverse raises (or returns garbage) when the matrix
    is singular or badly conditioned, e.g. with collinear features. In the
    rank-deficient case the minimum-norm solution is returned.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Design matrix.
    y : array-like, shape (m,) or (m, k)
        Target values.

    Returns
    -------
    numpy.ndarray
        Fitted parameter vector (or matrix when ``y`` is 2-D).
    """
    theta, _residuals, _rank, _singular_values = np.linalg.lstsq(X, y, rcond=None)
    return theta

#### EXPERIMENTO I

In [10]:
def NE_find_parameters():
    """Fit the petrol-consumption data with the normal equation and print results.

    Loads 'petrol_consumption.csv', standardizes it, splits it 70/30 using
    the last column as the target, adds the bias column, and then, for the
    train split and for the test split separately, fits theta with
    ``normal_equation`` and prints the weights and the cost on that same split.

    NOTE(review): the test split gets its own fit instead of being scored
    with the training weights. Confirm that this is the intended experiment.
    """
    raw = load_data('petrol_consumption.csv')
    data = normalization(raw).values
    target_col = data.shape[1]-1
    X_train, y_train, X_test, y_test = train_test(data, target_col)
    X_train, X_test = add_ones(X_train, y_train, X_test, y_test)

    splits = (
        ("Weight Normal Equation Train: ", X_train, y_train),
        ("Weight Normal Equation Test: ", X_test, y_test),
    )
    for label, X_split, y_split in splits:
        theta = normal_equation(X_split, y_split)
        print(label, theta)
        cost = calculate_cost(X_split, y_split, theta)
        print("Cost: ", cost)

NE_find_parameters()

FileNotFoundError: [Errno 2] No such file or directory: 'petrol_consumption.csv'

In [None]:
def GD_find_parameters():
    """Grid-search gradient descent over epochs and learning rates (petrol data).

    Loads 'petrol_consumption.csv', standardizes it, splits it 70/30 using
    the last column as the target and adds the bias column. For every
    (epoch count, learning rate) pair it runs ``gradient_descent`` from a
    zero theta on the train split, prints the train cost, and records the
    train and test cost.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_costs, test_costs)``; rows are indexed by learning rate and
        columns by epoch count.
    """
    fdata = load_data('petrol_consumption.csv')
    data = normalization(fdata)
    data = data.values

    X_train, y_train, X_test, y_test = train_test(data,data.shape[1]-1)
    X_train, X_test = add_ones(X_train, y_train, X_test, y_test)

    n_features = X_train.shape[1]
    theta = np.zeros(n_features)

    epochs = [2,10,50,100]
    learn_rates = [ 0.1,  0.5 , 1]

    result_train = np.empty([len(learn_rates),len(epochs)])
    result_test = np.empty([len(learn_rates),len(epochs)])
    for col, epoch in enumerate(epochs):
        for row, learn_rate in enumerate(learn_rates):
            theta_gd, _ = gradient_descent(X_train, y_train, theta, epoch, learn_rate)
            error_train = calculate_cost(X_train, y_train, theta_gd)
            error_test = calculate_cost(X_test, y_test, theta_gd)

            print("result_train e ", epoch , "lr ", learn_rate,":", error_train)

            # Store each result in its own cell; assigning to the bare name
            # would replace the whole table with the last scalar.
            result_train[row, col] = error_train
            result_test[row, col] = error_test
        print("\n")

    train_costs = pd.DataFrame(result_train, index=learn_rates, columns=epochs)
    test_costs = pd.DataFrame(result_test, index=learn_rates, columns=epochs)
    return train_costs, test_costs

# Bind the tables so the cell output stays limited to the printed lines.
gd_train_costs, gd_test_costs = GD_find_parameters()

In [None]:
def GD_ploteo(seed=0):
    """Plot gradient-descent cost curves for several epoch counts (petrol data).

    Loads 'petrol_consumption.csv', standardizes it, splits it 70/30 using
    the last column as the target and adds the bias column. For each epoch
    count, gradient descent is run once on the train split and once on the
    test split from the same random initial theta; both cost histories are
    drawn on one labeled figure and the final weights are printed.

    Parameters
    ----------
    seed : int, optional
        Seed for the random initial theta so that re-running the cell
        reproduces the same curves.
    """
    fdata = load_data('petrol_consumption.csv')
    data = normalization(fdata)
    data = data.values
    X_train, y_train, X_test, y_test = train_test(data,data.shape[1]-1)
    X_train, X_test = add_ones(X_train, y_train, X_test, y_test)

    n_features = X_train.shape[1]
    # Seeded generator: uniform values in [0, 1), reproducible across runs.
    theta = np.random.default_rng(seed).random(n_features)

    # NOTE(review): values such as 11100 and 114000 break the otherwise
    # regular progression and look like typos; confirm the intended list.
    epochs = [1500,1800,11100,114000,11700,12000,12300,12600,12900,13200]
    learn_rate = 0.0005

    for epoch in epochs:
        theta_gd, cost_history = gradient_descent(X_train, y_train, theta, epoch, learn_rate)
        theta_gd1, cost_history1 = gradient_descent(X_test, y_test, theta, epoch, learn_rate)

        fig, ax = plt.subplots()
        # Label the two curves; without a legend they cannot be told apart.
        ax.plot(range(len(cost_history)), cost_history, label='train')
        ax.plot(range(len(cost_history1)), cost_history1, label='test')
        ax.set_title('TRAIN - TEST '+str(epoch), fontsize=10)
        ax.set_xlabel('iteration')
        ax.set_ylabel('cost')
        ax.legend()
        print("Weights of gradient_descent - training data: ", theta_gd, "\n")
        print("Weights of gradient_descent - testing data: ", theta_gd1, "\n")
        ax.grid(True)
        plt.show()
        # Release the figure; one is created per loop iteration.
        plt.close(fig)
GD_ploteo()

In [None]:
def HNE_find_parameters():
    """Fit the housing data with the normal equation and print results.

    Reads 'Housing.csv', replaces 'yes'/'no' with 1/0, standardizes the
    table and splits it 70/30 with column 0 as the target and the remaining
    columns as features. After adding the bias column, theta is fitted with
    ``normal_equation`` on the train split and, separately, on the test
    split; weights and cost on the same split are printed for each.

    NOTE(review): the test split gets its own fit instead of being scored
    with the training weights. Confirm that this is the intended experiment.
    """
    fdata = pd.read_csv('Housing.csv')
    fdata = fdata.replace(to_replace='yes',value=1,regex=True)
    fdata = fdata.replace(to_replace='no',value=0,regex=True)

    data = normalization(fdata).values

    # Row index where the training portion ends (70% of the rows, truncated).
    cut = int((70*len(data))/100)
    X_train, y_train = data[:cut, 1:], data[:cut, 0]
    X_test, y_test = data[cut:, 1:], data[cut:, 0]

    X_train, X_test = add_ones(X_train, y_train, X_test, y_test)

    splits = (
        ("Weight Normal Equation Train: ", X_train, y_train),
        ("Weight Normal Equation Test: ", X_test, y_test),
    )
    for label, X_split, y_split in splits:
        theta = normal_equation(X_split, y_split)
        print(label, theta)
        cost = calculate_cost(X_split, y_split, theta)
        print("Cost: ", cost)

HNE_find_parameters()

In [None]:
def HGD_find_parameters():
    """Grid-search gradient descent over epochs and learning rates (housing data).

    Reads 'Housing.csv', replaces 'yes'/'no' with 1/0, standardizes the
    table and splits it 70/30 with column 0 as the target and the remaining
    columns as features. For every (epoch count, learning rate) pair it runs
    ``gradient_descent`` from a zero theta on the train split, prints the
    train cost, and records the train and test cost.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_costs, test_costs)``; rows are indexed by learning rate and
        columns by epoch count.
    """
    fdata = pd.read_csv('Housing.csv')
    fdata = fdata.replace(to_replace='yes',value=1,regex=True)
    fdata = fdata.replace(to_replace='no',value=0,regex=True)

    data = normalization(fdata)
    data = data.values

    X_train = data[:int((70*len(data))/100), 1:]
    y_train = data[:int((70*len(data))/100), 0]
    X_test = data[int((70*len(data))/100):, 1:]
    y_test = data[int((70*len(data))/100):, 0]

    X_train, X_test = add_ones(X_train, y_train, X_test, y_test)

    n_features = X_train.shape[1]
    theta = np.zeros(n_features)

    epochs = [2,10,50,100]
    learn_rates = [0.0005, 0.1,  0.5 , 1]

    result_train = np.empty([len(learn_rates),len(epochs)])
    result_test = np.empty([len(learn_rates),len(epochs)])
    for col, epoch in enumerate(epochs):
        for row, learn_rate in enumerate(learn_rates):
            theta_gd, _ = gradient_descent(X_train, y_train, theta, epoch, learn_rate)
            error_train = calculate_cost(X_train, y_train, theta_gd)
            error_test = calculate_cost(X_test, y_test, theta_gd)

            print("result_train e ", epoch , "lr ", learn_rate,":", error_train)

            # Record both costs; the tables were previously allocated but
            # left uninitialized.
            result_train[row, col] = error_train
            result_test[row, col] = error_test

        print("\n")

    train_costs = pd.DataFrame(result_train, index=learn_rates, columns=epochs)
    test_costs = pd.DataFrame(result_test, index=learn_rates, columns=epochs)
    return train_costs, test_costs

# Bind the tables so the cell output stays limited to the printed lines.
hgd_train_costs, hgd_test_costs = HGD_find_parameters()


In [None]:
def HGD_ploteo():
    """Plot gradient-descent cost curves for several epoch counts (housing data).

    Reads 'Housing.csv', replaces 'yes'/'no' with 1/0, standardizes the
    table and splits it 70/30 with column 0 as the target and the remaining
    columns as features. For each epoch count, gradient descent is run once
    on the train split and once on the test split from a zero theta; both
    cost histories are drawn on one labeled figure and the final weights are
    printed.
    """
    fdata = pd.read_csv('Housing.csv')
    fdata = fdata.replace(to_replace='yes',value=1,regex=True)
    fdata = fdata.replace(to_replace='no',value=0,regex=True)

    data = normalization(fdata)
    data = data.values

    X_train = data[:int((70*len(data))/100), 1:]
    y_train = data[:int((70*len(data))/100), 0]
    X_test = data[int((70*len(data))/100):, 1:]
    y_test = data[int((70*len(data))/100):, 0]
    X_train, X_test = add_ones(X_train, y_train, X_test, y_test)

    n_features = X_train.shape[1]
    theta = np.zeros(n_features)

    # NOTE(review): values such as 11100 and 114000 break the otherwise
    # regular progression and look like typos; confirm the intended list.
    epochs = [1500,1800,11100,114000,11700,12000,12300,12600,12900,13200]
    learn_rate = 0.0005

    for epoch in epochs:
        theta_gd, cost_history = gradient_descent(X_train, y_train, theta, epoch, learn_rate)
        theta_gd1, cost_history1 = gradient_descent(X_test, y_test, theta, epoch, learn_rate)

        fig, ax = plt.subplots()
        # Label the two curves; without a legend they cannot be told apart.
        ax.plot(range(len(cost_history)), cost_history, label='train')
        ax.plot(range(len(cost_history1)), cost_history1, label='test')
        ax.set_title('TRAIN - TEST '+str(epoch), fontsize=10)
        ax.set_xlabel('iteration')
        ax.set_ylabel('cost')
        ax.legend()
        print("Weights of gradient_descent - training data: ", theta_gd, "\n")
        print("Weights of gradient_descent - testing data: ", theta_gd1, "\n")
        ax.grid(True)
        plt.show()
        # Release the figure; one is created per loop iteration.
        plt.close(fig)
HGD_ploteo()