In [252]:
import numpy as np
import pandas as pd
import math
import random
# set the random seed to get the same random values when initializing thetas
# useful during testing
random.seed = 1

set outputs window size if needed 

In [253]:
# pd.set_option('display.height', 1000)
# pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)

In [254]:
def load_normalize_data(data_path):
    # initialize column names
    col_names = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7',
                 'x8', 'x9', 'x10', 'x11', 'x12', 'x13',
                 'x14', 'x15', 'x16', 'y']
    # read_csv file
    data_train = pd.read_csv(data_path, delim_whitespace=True, names=col_names)
    # normalize the data 
    std_dev = data_train.std()
    mean = data_train.mean()
    std_all = std_dev.std()
    mean_all = mean.mean()
    normal_train = (data_train - mean_all)/std_all
    # divide the data into data and label
    x_train = normal_train.iloc[:,:16]
    y = normal_train['y']
    
    #return final data
    return x_train, y

In [255]:
# intialize Thetas
# takes the dataset as the input and returns a list of random theta values that
# corresponds to n_features * degree + 1 
def thetas(dataset, degree):
    theta = np.array([])
    one_row = dataset.iloc[0]
    for l in range((len(one_row)*degree)+1):
        theta = np.append(theta, [random.uniform(-1,1)])
#         theta = np.append(theta, [1])
    return theta

In [256]:
# calculate loss
def loss_function(th, x, y, degree):
    total_loss = 0
    m = len(x)
    for i in range(m):
        # add x0 = 1
        row = np.array([1])
        row = np.append(row, x.iloc[i])
        # calculate y_hat
        y_hat = hypothesis(row, th, degree)
        
        y_true = y[i]
        loss = y_hat - y_true
        total_loss += loss*loss
    return (1/(2*m))*total_loss

In [272]:
def hypothesis(x, w, degree):
    h = np.array([float(x) for x in range(len(w))])
    h[0] = x[0]*w[0]
    i = 1
    # loop 1 through features
    for feat in range(1, len(x)):
        # loop 2 degree
        for d in range(1, degree+1):
            h[i] = (x[feat]**d) * w[i]
            i+= 1
    # find sum(h) as the final value of h(x)
    return sum(h)


In [328]:
# update thetas
def update(theta, alpha, x_train, y):
    # calculate some constant values 
    m = len(x_train)
    constant = alpha/m
#     print('constant is ', constant)
    
    # create an empty temp array to hold theta values 
    new_theta = np.array([])
    
    # iterate through theta and add the updated values to temp 
    
    # loop 1: cycle through theta values
    for t in range(len(theta)):
#         print('   updating theta',t)
        # grab the column associated with x(i)
        # if theta_0 return a column of ones
        if t == 0:
            x_i = np.ones(m)
        else:
            x_i = x_train.iloc[:,t-degree-1]
        # loop 2 sum(find y_hat - h) * x from i = 0 to m
        value = 0
        for i in range(m):
            # for each row in x_train add x(0)=1
                row = np.array([1])
                row = np.append(row, x_train.iloc[i])
                # find y_hat
                y_hat = hypothesis(row, theta, degree)
                # sum the value of ((y_hat - y_true) * x(i) ) for all the rows
                value += (y_hat - y[i]) * x_i[i]
        # update theta and save it into temp value
        temp_th = theta[t] - (constant*value)
        # update the temp thetas
        new_theta = np.append(new_theta, temp_th)
#         print('   theta',t, ' was ', theta[t], ' updated to ', temp_th, ' using ', value)
        
    # return the updated value of thetas
    return new_theta


## Start here

#### Training Stage

In [332]:
training_path = 'UCI_Dataset/pendigits_training.txt'

# select degree and Learning rate
degree = 1
alpha = 0.01
#load data
x_train, y = load_normalize_data(training_path)

# initialize thetas to random values between -1 and 1
theta = thetas(x_train, degree)

if the model took a long time to converge change the threshold to 1 as indicated below and uncomment the cell below

In [333]:
x_train = x_train.loc[0:100,]

start implementation here

In [334]:
# calculate loss 
initial_loss = loss_function(theta, x_train, y, degree)
print('initial loss is: ', initial_loss, ' searching for solution...')

# do initial update
new_theta = update(theta, alpha, x_train, y)
updated_loss = loss_function(new_theta, x_train, y, degree)

# check if the initial update yielded a better value for loss
if updated_loss < initial_loss:
    print('loss improved from ', initial_loss, ' to ', updated_loss)
    print('--- assigning new theta values:')
    theta = new_theta
    current_loss = updated_loss
    loss = current_loss
    # keep updating thetas until convergence ---------------- change to >= 1 if needed
    while (True and loss >= 1):
        temp_theta = update(theta, alpha, x_train, y)
        loss = loss_function(temp_theta, x_train, y, degree)
        if loss < current_loss:
            print()
            print('loss improved from ', current_loss, ' to ', loss)
            print('--- assigning new theta values:')
            current_loss = loss
            theta = temp_theta
        else:
            break
    final_theta = theta
    print()
    print('---------------------------------------------------------------')
    print('best solution is reached: ')
    print('THETA VALUES ARE: ')
    for i in range(len(final_theta)):
        print('theta %d = %f' % (i, final_theta[i]))
    print('finish')
else: 
    final_theta = theta
    print()
    print('---------------------------------------------------------------')
    print('best solution is reached: ')
    print('THETA VALUES ARE: ')
    for i in range(len(final_theta)):
        print('theta %d = %f' % (i, final_theta[i]))



initial loss is:  78.81463355366672  searching for solution...
loss improved from  78.81463355366672  to  31.743392734671406
--- assigning new theta values:

loss improved from  31.743392734671406  to  16.38904690767751
--- assigning new theta values:

loss improved from  16.38904690767751  to  10.440119043530963
--- assigning new theta values:

loss improved from  10.440119043530963  to  7.735540548729208
--- assigning new theta values:

loss improved from  7.735540548729208  to  6.3026858056146695
--- assigning new theta values:

loss improved from  6.3026858056146695  to  5.42843568412138
--- assigning new theta values:

loss improved from  5.42843568412138  to  4.828983857872668
--- assigning new theta values:

loss improved from  4.828983857872668  to  4.381347872619832
--- assigning new theta values:

loss improved from  4.381347872619832  to  4.027409356005741
--- assigning new theta values:

loss improved from  4.027409356005741  to  3.737022480273085
--- assigning new theta va

KeyboardInterrupt: 

#### Testing Stage

In [289]:
# load testing data
testing_path = 'UCI_Dataset/pendigits_test.txt'

# load and normalize data
x_test, y_test = load_normalize_data(testing_path)

change the number of rows below to limit the dataset size if needed

In [290]:
# x_test = x_train.loc[0:100,]
# x_test.head()

In [291]:
# y_test.head()

In [292]:
# run the model on the testing data
m = len(x_test)
id_ = 1
print('id     output      target value      squared error')
# for loop to output data 
for example in range(m):
    row = np.array([1])
    row = np.append(row, x_test.iloc[example])
    y_hat_test = hypothesis(row, final_theta, degree)
    sq_error = (y_hat_test - y_test[example]) ** 2
    print('%d     %f     %f    %f' % (id_, y_hat_test, y_test[example], sq_error))
    id_ += 1

id     output      target value      squared error
1     -10.416384     -4.171658    38.996592
2     -3.635039     -4.171658    0.287960
3     3.513955     -4.171658    59.068652
4     3.556713     -4.067386    58.126879
5     11.342005     -4.067386    237.449314
6     0.101914     -4.901568    25.034835
7     -1.177673     -4.588750    11.635444
8     -1.473028     -4.275931    7.856266
9     6.929799     -4.067386    120.938059
10     8.930411     -4.067386    168.942715
11     -3.957484     -4.067386    0.012078
12     1.064209     -5.005841    36.845507
13     0.383592     -4.797295    26.841590
14     1.448054     -5.005841    41.652766
15     5.589822     -4.484477    101.491498
16     4.470644     -4.067386    72.897947
17     10.967478     -4.693023    245.251282
18     -0.763107     -4.275931    12.339931
19     -0.981477     -5.005841    16.195508
20     4.300215     -4.588750    79.013694
21     8.342683     -4.380204    161.871865
22     4.641929     -4.171658    77.679328

End