In [1]:
# Render plots inline
%matplotlib inline

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random as rand
import json
from mlflow import log_metric, log_param, log_artifact
# adjust dimensions of plot area to make it look better
plt.rcParams['figure.figsize'] = (15, 5)

In [3]:
def make_df(csv_path):
    """
    Generates a pandas DataFrame for the
    data given in the csv file specified
    by the 'csv_path'.

    Argument:
        csv_path - specifies path to desired csv file
    Returns:
        pandas DataFrame with data from csv file specified in csv_path

    """
    return pd.read_csv(csv_path, header=None, names=['x', 'y'], index_col=False)

In [4]:
def make_data_matrix(x_values, degree):
    """
    Generates a N x (degree+1) dimension
    matrix from the input values (x values)
    of the given dataset. The ith feature (column)
    in the matrix (i=0 to degree+1 (exclusive))
    is - the N input points raised to the power i.

    Arguments:
        x_values - list containing the input data points
        degree - desired degree of the polynomial
    Returns:
        An Nx(degree+1) matrix where ith feature/column
        is the N data points raised to the power i
    """
    X_data = []
    for x_val in x_values:
        col = []
        for power in range(degree+1):
            col.append(x_val ** power)
        X_data.append(col)
    X_matrix = np.array(X_data)
    return X_matrix

In [5]:
def find_predictions(data_matrix, params_vector):
    """
    Gives a Nx1 vector where each entry is the predicted
    value for y, which is calculated using the parameters vector
    obtained from the find_params_vector() method and a row/example
    from the data matrix.

    Arguments:
        data_matrix - matrix generated by the make_data_matrix()
                        method using the input data points - An 
                        Nx(degree+1) matrix where ith feature/column
                        is the N data points raised to the power i
        params_vector - vector containing the values for the parameters that
                        minimize the error function (i.e. solution obtained for w*)
                        from the find_params_vector() method
    """
    return np.array([row.dot(params_vector) for row in data_matrix])

In [6]:
def find_params_over_epochs(step_size, data_x, data_y, num_examples, proportion):
    w0 = 0
    w1 = 0
    stop = False
    count = 0
    params_over_epochs = {}
    while not stop:
        params_over_epochs[count] = [w0, w1]
        for i in range(num_examples):
            idx = rand.randint(0, num_examples-1)
            prediction = w0 + w1*data_x[idx]
            error = prediction - data_y[idx]

            new_w0 = w0 - step_size*error
            new_w1 = w1 - step_size*error*data_x[idx]

            diff_w0 = abs(new_w0 - w0)
            diff_w1 = abs(new_w1 - w1)
            if diff_w0 < proportion*w0 and diff_w1 < proportion*w1:
                stop = True
                break
            w0 = new_w0
            w1 = new_w1

        count += 1

    return params_over_epochs

In [7]:
def find_mse(data_matrix, params_vector, y_values):
    """
    Steps for finding the Mean Square Error - 
    1. Get the prediction values for the input
        data points based on the model found.
    2. Calculate the error vector by differencing
        the vector containing target values and 
        the vector containing the predicted values.
    3. Take the square of the errors found and sum them 
        by taking dot product of the error vector with itself.
    4. Calculate the MSE by dividing the sum of the
        squares of errors with the number of input data points.

    Arguments:
        data_matrix - matrix generated by the make_data_matrix()
                        method using the input data points - An 
                        Nx(degree+1) matrix where ith feature/column
                        is the N data points raised to the power i
        params_vector - vector containing the values for the parameters that
                        minimize the error function (i.e. solution obtained for w*)
                        from the find_params_vector() method
        y_values - list of output data points from given dataset
    Returns:
        Mean Square Error of model for given data set
    """
    y_predictions = find_predictions(data_matrix, params_vector)
    error = y_values - y_predictions
    error_square = error.dot(error)
    mse = error_square/len(y_values)
    return mse

In [8]:
def find_list_mses(data_matrix, params_over_epochs, y_values):
    params_keys = sorted(params_over_epochs.keys())
    dataset_list_mses = []
    for key in params_keys:
        params = params_over_epochs[key]
        dataset_mse = find_mse(data_matrix, params, y_values)
        dataset_list_mses.append(dataset_mse)

    return dataset_list_mses

In [9]:
######## PART 3.1 ########

In [10]:
###### 3.1.(a) ######

In [11]:
init_step_size = 10 ** (-6)

In [12]:
train_df = make_df('./Datasets/Dataset_2_train.csv')

In [13]:
train_x = train_df['x']
train_y = train_df['y']
num_examples = len(train_x)

In [14]:
train_matrix = make_data_matrix(train_x, 1)

In [None]:
params_over_epochs = find_params_over_epochs(
    init_step_size, train_x, train_y, num_examples, 0.0000001)

In [None]:
# for key, p in params_over_epochs.items():
#     print(key, ':', p)

In [15]:
valid_df = make_df('./Datasets/Dataset_2_valid.csv')

In [16]:
valid_x = valid_df['x']
valid_y = valid_df['y']

In [17]:
valid_matrix = make_data_matrix(valid_x, 1)

In [None]:
list_valid_mses = np.array(find_list_mses(
    valid_matrix, params_over_epochs, valid_y))

In [None]:
# for idx, valid_mse in enumerate(list_valid_mses):
#     print(idx, ':', valid_mse)

In [None]:
###### 3.1.(b) ######

In [None]:
list_train_mses = np.array(find_list_mses(
    train_matrix, params_over_epochs, train_y))

In [None]:
plt.plot(sorted(params_over_epochs.keys()),
         list_valid_mses, 'r', label='Valid MSEs')
plt.plot(sorted(params_over_epochs.keys()),
         list_train_mses, 'b', label='Train MSEs')
plt.xlabel('epoch')
plt.ylabel('MSE')
plt.legend()
plt.show()

In [None]:
# print(sorted(params_over_epochs.keys())[-1])
# print(max(params_over_epochs.keys()))

In [None]:
print(list_train_mses.shape)
# print(list_train_mses[-10:])

In [None]:
# print(list_valid_mses[-10:])

In [18]:
######## PART 3.2 ########

In [19]:
###### 3.2.(a) ######

In [20]:
list_step_sizes = [1, 0.5, 10**(-1), 10**(-1)/2, 10**(-2), 10**(-2)/2, 10**(-3), 10**(-3)/2, 10**(-4), 10**(-4)/2, 10**(-5), 10**(-6)]
valid_mse_step_size = {'step_size': list_step_sizes, 'valid_mse': [], 'params': []}

In [21]:
for step_size in list_step_sizes:
    params_over_epochs = find_params_over_epochs(step_size, train_x, train_y, num_examples, 10**(-7))
    final_key = max(params_over_epochs.keys())
    final_params = params_over_epochs[final_key]
    valid_mse_step_size['params'].append(final_params)
    valid_mse = find_mse(valid_matrix, final_params, valid_y)
    valid_mse_step_size['valid_mse'].append(valid_mse)
    print("Done:", step_size)

Done: 1
Done: 0.5
Done: 0.1
Done: 0.05
Done: 0.01
Done: 0.005
Done: 0.001
Done: 0.0005
Done: 0.0001
Done: 5e-05
Done: 1e-05
Done: 1e-06


In [22]:
valid_mse_step_size_df = pd.DataFrame(data=valid_mse_step_size)

In [23]:
valid_mse_step_size_df

Unnamed: 0,params,step_size,valid_mse
0,"[4.310775447675662, 4.36888123126003]",1.0,0.651328
1,"[3.292143322152323, 4.3840270191332875]",0.5,0.133412
2,"[3.628344431429847, 4.280749494780331]",0.1,0.075875
3,"[3.6426671231482097, 4.415878511174687]",0.05,0.088735
4,"[3.5663352004641182, 4.312389217423121]",0.01,0.074937
5,"[3.600326914431573, 4.3175002455709794]",0.005,0.074219
6,"[3.965176857989813, 3.8687084212569856]",0.001,0.134706
7,"[3.5565673662590345, 3.1562208826240354]",0.0005,1.202529
8,"[3.389656864864175, 3.035003500654583]",0.0001,1.795989
9,"[3.1005359022660097, 2.7575664542086926]",5e-05,3.36014


In [24]:
log_param("Step size - Valid MSE - Parameters", valid_mse_step_size_df)

In [33]:
# log_metric("3.2.(a) Table", valid_mse_step_size_df)

0      [4.310775447675662, 4.36888123126003]   1.000000   0.651328
1    [3.292143322152323, 4.3840270191332875]   0.500000   0.133412
2     [3.628344431429847, 4.280749494780331]   0.100000   0.075875
3    [3.6426671231482097, 4.415878511174687]   0.050000   0.088735
4    [3.5663352004641182, 4.312389217423121]   0.010000   0.074937
5    [3.600326914431573, 4.3175002455709794]   0.005000   0.074219
6    [3.965176857989813, 3.8687084212569856]   0.001000   0.134706
7   [3.5565673662590345, 3.1562208826240354]   0.000500   1.202529
8     [3.389656864864175, 3.035003500654583]   0.000100   1.795989
9   [3.1005359022660097, 2.7575664542086926]   0.000050   3.360140
10  [3.0593725149934956, 2.7236930948403915]   0.000010   3.607051
11   [2.8509653236544223, 2.528674709568887]   0.000001   5.090258 was not logged because the value is not a number.


In [None]:
# plt.plot(valid_mse_step_size_df['step_size'], valid_mse_step_size_df['valid_mse'], 'b', label='Valid MSEs')
# plt.xlabel('Step Size')
# plt.ylabel('Valid MSE')
# plt.legend()
# plt.show()

In [None]:
###### 3.2.(b) ######

In [25]:
valid_mses = valid_mse_step_size_df['valid_mse']
print(type(valid_mses))
idx_min_valid_mse = valid_mses.idxmin
best_step_size = valid_mse_step_size_df['step_size'][idx_min_valid_mse]

<class 'pandas.core.series.Series'>


In [26]:
best_step_size

0.005

In [27]:
test_df = make_df('./Datasets/Dataset_2_test.csv')
test_x = test_df['x']
test_y = test_df['y']

In [28]:
test_matrix = make_data_matrix(test_x, 1)

In [29]:
test_params = valid_mse_step_size_df['params'][idx_min_valid_mse]

In [30]:
test_mse = find_mse(test_matrix, test_params, test_y)

In [31]:
test_mse

0.06929559444048429

In [38]:
log_param("3.2.b Test MSE", test_mse)

Exception: Unexpected data for param 'Step size - Valid MSE - Parameters'. Param recorded more than once

In [32]:
######## PART 3.3 ########