In [1]:
import time
import numpy as np
import matplotlib.pyplot as plt

from tabulate import tabulate
from copy import deepcopy

from features import PreprocessData
from linear_regression import linear_closed_form, linear_gradient_descent

# Shared preprocessing helper: later cells read ppd.data and call its
# preprocess_data, compute_most_common_words and compute_features methods.
ppd = PreprocessData()

In [2]:
# Split dataset into the train / validation / test objects used by later cells
train, validation, test = ppd.preprocess_data(ppd.data)

# Compute most common words from the training split only
ppd.compute_most_common_words(train)

## Experiments
### 1 - Compare runtime, stability and performance of closed-form and gradient descent

In [3]:
# Compute features on training set.
# Durations are measured with time.perf_counter() rather than time.time():
# it is monotonic and uses the highest-resolution clock available, which
# matters for runs that only take a few milliseconds.
start = time.perf_counter()
X_train, y_train = ppd.compute_features(train, simple=True)
feat_train_runtime = time.perf_counter() - start
print(X_train.shape)

# Compute features on validation set
start = time.perf_counter()
X_valid, y_valid = ppd.compute_features(validation, simple=True)
feat_valid_runtime = time.perf_counter() - start
print(X_valid.shape)

print(f'Training features runtime: {feat_train_runtime}')
print(f'Validation features runtime: {feat_valid_runtime}')

(10000, 4)
(1000, 4)
Training features runtime: 0.015549421310424805
Validation features runtime: 0.0011286735534667969


In [4]:
def test_closed_vs_gradient(X_train, y_train, X_valid, y_valid, hyperparams, rand_init=False):
    # Train using closed form method
    start = time.time()
    w_closed = linear_closed_form(X_train, y_train)
    w_closed_runtime = time.time() - start
    
    # Train using gradient descent
    # Hyperparameters
    w_init = np.random.rand(X_train.shape[1]) if rand_init else np.zeros(X_train.shape[1])
    decay_speed = hyperparams['decay_speed']
    learn_rate = hyperparams['learn_rate']
    min_err = hyperparams['min_err']
    max_iter = hyperparams['max_iter']

    start = time.time()
    w_grad = linear_gradient_descent(X_train, y_train, w_init, decay_speed, learn_rate, min_err, max_iter, verbose=True)
    w_grad_runtime = time.time() - start
    
    # Compute MSE on training set
    y_closed_train = np.matmul(X_train, w_closed)
    mse_closed_train = np.sum((y_closed_train - y_train)**2)/len(y_train)

    y_grad_train = np.matmul(X_train, w_grad)
    mse_grad_train = np.sum((y_grad_train - y_train)**2)/len(y_train)
    
    # Compute MSE on validation set
    y_closed_valid = np.matmul(X_valid, w_closed)
    mse_closed_valid = np.sum((y_closed_valid - y_valid)**2)/len(y_valid)

    y_grad_valid = np.matmul(X_valid, w_grad)
    mse_grad_valid = np.sum((y_grad_valid - y_valid)**2)/len(y_valid)
    
    return {'train': {'closed': mse_closed_train, 'grad': mse_grad_train}, 'validation': {'closed': mse_closed_valid, 'grad': mse_grad_valid}, 'runtime': {'closed': w_closed_runtime, 'grad': w_grad_runtime}}

#### 1.1 - Testing random w0 vs. zero w0

In [5]:
hyperparams = {'decay_speed': 10**(-12), 'learn_rate': 10**(-6), 'min_err': 10**(-7), 'max_iter': 1000000}
N_REPEATS = 20  # number of timed repetitions per initialisation strategy

# rand_init=True draws the initial weights with np.random.rand, i.e. from
# NumPy's global random state. Seed it so that the random-init runs (and
# therefore their iteration counts and final MSE) are reproducible on a
# fresh Restart & Run All.
np.random.seed(0)

rand_runtime = []
zero_runtime = []
closed_runtime = []
for _ in range(N_REPEATS):
    perf_rand = test_closed_vs_gradient(X_train, y_train, X_valid, y_valid, hyperparams, rand_init=True)
    perf_zero = test_closed_vs_gradient(X_train, y_train, X_valid, y_valid, hyperparams, rand_init=False)
    rand_runtime.append(perf_rand['runtime']['grad'])
    zero_runtime.append(perf_zero['runtime']['grad'])
    # Closed-form timing is sampled from the random-init call only
    closed_runtime.append(perf_rand['runtime']['closed'])

Error: 4.313354397498971e-05 | Learning rate: 9.999999900000001e-07
Error: 4.291390985825258e-06 | Learning rate: 9.999999800000003e-07
Error: 4.26953950985009e-07 | Learning rate: 9.99999970000001e-07
Finished after 36289 iterations
Error: 2.5246314222317527e-05 | Learning rate: 9.999999900000001e-07
Error: 2.511776109700652e-06 | Learning rate: 9.999999800000003e-07
Error: 2.4989863144651346e-07 | Learning rate: 9.99999970000001e-07
Finished after 33968 iterations
Error: 4.683347237278741e-05 | Learning rate: 9.999999900000001e-07
Error: 4.659499838318843e-06 | Learning rate: 9.999999800000003e-07
Error: 4.635773976343201e-07 | Learning rate: 9.99999970000001e-07
Finished after 36646 iterations
Error: 2.5246314222317527e-05 | Learning rate: 9.999999900000001e-07
Error: 2.511776109700652e-06 | Learning rate: 9.999999800000003e-07
Error: 2.4989863144651346e-07 | Learning rate: 9.99999970000001e-07
Finished after 33968 iterations
Error: 3.530200326741286e-05 | Learning rate: 9.999999900

Error: 2.511776109700652e-06 | Learning rate: 9.999999800000003e-07
Error: 2.4989863144651346e-07 | Learning rate: 9.99999970000001e-07
Finished after 33968 iterations
Error: 3.967769596894791e-05 | Learning rate: 9.999999900000001e-07
Error: 3.9475658878727435e-06 | Learning rate: 9.999999800000003e-07
Error: 3.9274651461060384e-07 | Learning rate: 9.99999970000001e-07
Finished after 35927 iterations
Error: 2.5246314222317527e-05 | Learning rate: 9.999999900000001e-07
Error: 2.511776109700652e-06 | Learning rate: 9.999999800000003e-07
Error: 2.4989863144651346e-07 | Learning rate: 9.99999970000001e-07
Finished after 33968 iterations
Error: 4.762664693023846e-05 | Learning rate: 9.999999900000001e-07
Error: 4.7384134129349196e-06 | Learning rate: 9.999999800000003e-07
Error: 4.714285727259933e-07 | Learning rate: 9.99999970000001e-07
Finished after 36719 iterations
Error: 2.5246314222317527e-05 | Learning rate: 9.999999900000001e-07
Error: 2.511776109700652e-06 | Learning rate: 9.99999

In [21]:
# Mean runtime (seconds) of each training strategy over the repeated runs
for strategy, runtimes in (
    ('random init', rand_runtime),
    ('zero init', zero_runtime),
    ('closed form', closed_runtime),
):
    print(f'Average runtime with {strategy}: {sum(runtimes)/len(runtimes)}')

# MSE of both solvers as stored in perf_rand (the last random-init result)
for split_title, split_key in (('Training', 'train'), ('Validation', 'validation')):
    split_perf = perf_rand[split_key]
    print(f'Closed-form {split_title} MSE: {split_perf["closed"]!s}')
    print(f'Gradient descent {split_title} MSE: {split_perf["grad"]!s}')

Average runtime with random init: 0.34093446731567384
Average runtime with zero init: 0.29845372438430784
Average runtime with closed form: 0.003976774215698242
Closed-form Training MSE: 1.0846830709157251
Gradient descent Training MSE: 1.0846830730809067
Closed-form Validation MSE: 1.0203266848431447
Gradient descent Validation MSE: 1.0203284850171814


#### 1.2 - Testing different hyperparameters

In [7]:
# Both hyperparameters are swept over 1e-5 ... 1e-10
exponents = range(5, 11)
learn_rates = [10**(-exponent) for exponent in exponents]
decay_speeds = [10**(-exponent) for exponent in exponents]
min_error = 10**(-7)

# Each result grid has one row per learning rate and one column per decay speed
train_grad_errors, valid_grad_errors, grad_runtimes = [], [], []
for learn_rate in learn_rates:
    row_results = []
    for decay_speed in decay_speeds:
        print(f'Testing with learn_rate: {learn_rate} and decay_speed: {decay_speed}')
        hyperparams = {
            'decay_speed': decay_speed,
            'learn_rate': learn_rate,
            'min_err': min_error,
            'max_iter': 10000000,
        }
        row_results.append(test_closed_vs_gradient(X_train, y_train, X_valid, y_valid, hyperparams))
    # Only the gradient-descent figures are kept from each result
    train_grad_errors.append([result['train']['grad'] for result in row_results])
    valid_grad_errors.append([result['validation']['grad'] for result in row_results])
    grad_runtimes.append([result['runtime']['grad'] for result in row_results])

Testing with learn_rate: 1e-05 and decay_speed: 1e-05
Finished after 4469 iterations
Testing with learn_rate: 1e-05 and decay_speed: 1e-06
Finished after 4398 iterations
Testing with learn_rate: 1e-05 and decay_speed: 1e-07
Finished after 4391 iterations
Testing with learn_rate: 1e-05 and decay_speed: 1e-08
Finished after 4391 iterations
Testing with learn_rate: 1e-05 and decay_speed: 1e-09
Finished after 4391 iterations
Testing with learn_rate: 1e-05 and decay_speed: 1e-10
Finished after 4391 iterations
Testing with learn_rate: 1e-06 and decay_speed: 1e-05
Finished after 38483 iterations
Testing with learn_rate: 1e-06 and decay_speed: 1e-06
Finished after 34400 iterations
Testing with learn_rate: 1e-06 and decay_speed: 1e-07
Finished after 34011 iterations
Testing with learn_rate: 1e-06 and decay_speed: 1e-08
Finished after 33973 iterations
Testing with learn_rate: 1e-06 and decay_speed: 1e-09
Finished after 33969 iterations
Testing with learn_rate: 1e-06 and decay_speed: 1e-10
Finish

Error: 2.524758906264195e-07 | Learning rate: 9.999000099990002e-09
Error: 2.0045451010285492e-07 | Learning rate: 9.99890012098669e-09
Error: 1.5915223905127836e-07 | Learning rate: 9.998800143982723e-09
Error: 1.2636030796567468e-07 | Learning rate: 9.998700168978033e-09
Error: 1.0032509991536043e-07 | Learning rate: 9.998600195972564e-09
Finished after 1401406 iterations
Testing with learn_rate: 1e-09 and decay_speed: 1e-05
Error: 1.4391442970057642e-06 | Learning rate: 5e-10
Error: 5.629353336638011e-07 | Learning rate: 3.3333333333333337e-10
Error: 3.458567641434922e-07 | Learning rate: 2.5e-10
Error: 2.466562115481699e-07 | Learning rate: 2e-10
Error: 1.8948677412170268e-07 | Learning rate: 1.6666666666666669e-10
Error: 1.5237092487928034e-07 | Learning rate: 1.4285714285714285e-10
Error: 1.2644047796086387e-07 | Learning rate: 1.25e-10
Error: 1.0738752718252595e-07 | Learning rate: 1.1111111111111112e-10
Finished after 847658 iterations
Testing with learn_rate: 1e-09 and decay_s

Error: 1.596736830581615e-07 | Learning rate: 9.98003992015968e-10
Error: 1.5602213031813664e-07 | Learning rate: 9.979044007584074e-10
Error: 1.52455038086326e-07 | Learning rate: 9.978048293753742e-10
Error: 1.4897012409381437e-07 | Learning rate: 9.9770527786092e-10
Error: 1.455653375690791e-07 | Learning rate: 9.976057462090982e-10
Error: 1.4223875850293564e-07 | Learning rate: 9.975062344139652e-10
Error: 1.389885503151954e-07 | Learning rate: 9.974067424695793e-10
Error: 1.3581293701133852e-07 | Learning rate: 9.973072703700012e-10
Error: 1.327101923931037e-07 | Learning rate: 9.972078181092942e-10
Error: 1.2967863391755982e-07 | Learning rate: 9.971083856815236e-10
Error: 1.2671662006277788e-07 | Learning rate: 9.97008973080758e-10
Error: 1.2382254803536753e-07 | Learning rate: 9.969095803010667e-10
Error: 1.2099485235919385e-07 | Learning rate: 9.968102073365232e-10
Error: 1.1823200432187118e-07 | Learning rate: 9.96710854181202e-10
Error: 1.1553251040744397e-07 | Learning rate

Error: 1.226636004592984e-07 | Learning rate: 9.84251968503937e-11
Error: 1.1709338270674882e-07 | Learning rate: 9.832841691248772e-11
Error: 1.1205266158129184e-07 | Learning rate: 9.823182711198429e-11
Error: 1.0742663727983763e-07 | Learning rate: 9.813542688910698e-11
Error: 1.0313427081888957e-07 | Learning rate: 9.803921568627452e-11
Finished after 2077494 iterations
Testing with learn_rate: 1e-10 and decay_speed: 1e-09
Error: 1.8355996579992087e-06 | Learning rate: 9.999000099990002e-11
Error: 1.2444584761883698e-06 | Learning rate: 9.998000399920017e-11
Error: 8.618600923574366e-07 | Learning rate: 9.997000899730082e-11
Error: 6.146792920264514e-07 | Learning rate: 9.996001599360257e-11
Error: 4.550781597871198e-07 | Learning rate: 9.995002498750626e-11
Error: 3.5167878161498706e-07 | Learning rate: 9.994003597841297e-11
Error: 2.8395921438385576e-07 | Learning rate: 9.993004896572401e-11
Error: 2.3867490527940913e-07 | Learning rate: 9.992006394884093e-11
Error: 2.07463183247

In [8]:
# One table row per learning rate: the rate itself followed by the training
# MSE for every decay speed. New lists are built, so train_grad_errors is
# left unmodified.
data = [[rate, *errors] for errors, rate in zip(train_grad_errors, learn_rates)]
column_titles = ['Learn rate/decay speed', *decay_speeds]
table = tabulate(data, headers=column_titles)
print('MSE for gradient descent on training set:\n')
print(table)

MSE for gradient descent on training set:

  Learn rate/decay speed    1e-05    1e-06    1e-07    1e-08    1e-09    1e-10
------------------------  -------  -------  -------  -------  -------  -------
                   1e-05  1.08468  1.08468  1.08468  1.08468  1.08468  1.08468
                   1e-06  1.08468  1.08468  1.08468  1.08468  1.08468  1.08468
                   1e-07  1.08469  1.08468  1.08468  1.08468  1.08468  1.08468
                   1e-08  1.08883  1.08484  1.08471  1.08471  1.0847   1.0847
                   1e-09  1.10789  1.09433  1.08849  1.08702  1.08687  1.08685
                   1e-10  1.31064  1.13619  1.1141   1.10975  1.10927  1.10922


In [9]:
# One table row per learning rate: the rate itself followed by the validation
# MSE for every decay speed. New lists are built, so valid_grad_errors is
# left unmodified.
data = [[rate, *errors] for errors, rate in zip(valid_grad_errors, learn_rates)]
column_titles = ['Learn rate/decay speed', *decay_speeds]
table = tabulate(data, headers=column_titles)
print('MSE for gradient descent on validation set:\n')
print(table)

MSE for gradient descent on validation set:

  Learn rate/decay speed    1e-05    1e-06    1e-07    1e-08    1e-09    1e-10
------------------------  -------  -------  -------  -------  -------  -------
                   1e-05  1.02033  1.02033  1.02033  1.02033  1.02033  1.02033
                   1e-06  1.02033  1.02033  1.02033  1.02033  1.02033  1.02033
                   1e-07  1.02043  1.02035  1.02035  1.02034  1.02034  1.02034
                   1e-08  1.02567  1.02091  1.02055  1.02052  1.02052  1.02052
                   1e-09  1.04722  1.03106  1.02533  1.02381  1.02364  1.02362
                   1e-10  1.25706  1.07942  1.0544   1.04938  1.04882  1.04876


In [10]:
# One table row per learning rate: the rate itself followed by the
# gradient-descent runtime for every decay speed. New lists are built, so
# grad_runtimes is left unmodified.
data = [[rate, *runtimes] for runtimes, rate in zip(grad_runtimes, learn_rates)]
column_titles = ['Learn rate/decay speed', *decay_speeds]
table = tabulate(data, headers=column_titles)
print('Runtime (s) for gradient descent:\n')
print(table)

Runtime (s) for gradient descent:

  Learn rate/decay speed      1e-05       1e-06       1e-07       1e-08       1e-09       1e-10
------------------------  ---------  ----------  ----------  ----------  ----------  ----------
                   1e-05  0.0489182   0.0330427   0.0382464   0.0456505   0.0340171   0.0448442
                   1e-06  0.350077    0.273121    0.30565     0.313023    0.254367    0.254817
                   1e-07  3.12573     1.91857     1.72181     2.00232     1.65269     1.8404
                   1e-08  9.08036    12.3187     11.5575     10.9516     10.3267     10.2265
                   1e-09  6.65009     9.04114    23.757      27.4603     29.5138     29.0245
                   1e-10  6.76558     9.11673    13.4172     14.5175     15.9303     14.4766


### 2 - Compare performance using different-sized text features

### 3 - Demonstrate performance of new features

In [11]:
# Compute most common words from the training split.
# NOTE(review): the same call with the same argument already runs in an earlier
# cell; presumably this repeat is redundant — confirm before removing.
ppd.compute_most_common_words(train)

In [12]:
# Baseline - no extra features
# NOTE: this rebinds X_train / y_train / X_valid / y_valid, which an earlier
# cell already bound to the simple=True features.
# Compute features on training set.
# time.perf_counter() is used for the timings because it is monotonic and
# high-resolution, unlike time.time().
start = time.perf_counter()
X_train, y_train = ppd.compute_features(train, extra_features=False, num_word_features=60)
feat_train_runtime = time.perf_counter() - start
print(X_train.shape)

# Compute features on validation set
start = time.perf_counter()
X_valid, y_valid = ppd.compute_features(validation, extra_features=False, num_word_features=60)
feat_valid_runtime = time.perf_counter() - start
print(X_valid.shape)

print(f'Training features runtime: {feat_train_runtime}')
print(f'Validation features runtime: {feat_valid_runtime}')

(10000, 64)
(1000, 64)
Training features runtime: 0.1219944953918457
Validation features runtime: 0.014257192611694336


In [13]:
# Fit the baseline weights.
# NOTE: despite the "grad" in the variable names, the active line uses the
# closed-form solver; the gradient-descent call is kept only as a comment.
w_init = np.zeros(X_train.shape[1])  # referenced only by the commented-out call below
# w_grad = linear_gradient_descent(X_train, y_train, w_init, decay_speed=10**(-10), learn_rate=10**(-8), min_err=10**(-7), max_iter=10000000, verbose=True)
w_grad = linear_closed_form(X_train, y_train)

# Mean squared error of the fitted weights on the validation set
y_grad_valid = X_valid @ w_grad
squared_errors_valid = (y_grad_valid - y_valid)**2
mse_grad_valid = np.sum(squared_errors_valid)/len(y_valid)
print(mse_grad_valid)

0.9839397297217662


In [14]:
# Extra features
# Compute features on training set.
# time.perf_counter() is used for the timings because it is monotonic and
# high-resolution, unlike time.time().
start = time.perf_counter()
X_train_extra, y_train_extra = ppd.compute_features(train, extra_features=True, num_word_features=60)
feat_train_runtime = time.perf_counter() - start
print(X_train_extra.shape)

# Compute features on validation set
start = time.perf_counter()
X_valid_extra, y_valid_extra = ppd.compute_features(validation, extra_features=True, num_word_features=60)
feat_valid_runtime = time.perf_counter() - start
print(X_valid_extra.shape)

print(f'Training features runtime: {feat_train_runtime}')
print(f'Validation features runtime: {feat_valid_runtime}')

(10000, 69)
(1000, 69)
Training features runtime: 0.44202446937561035
Validation features runtime: 0.09993243217468262


In [15]:
# Fit the weights on the extra-feature matrix.
# NOTE: despite the "grad" in the variable names, the active line uses the
# closed-form solver; the gradient-descent call is kept only as a comment.
w_init_extra = np.zeros(X_train_extra.shape[1])  # referenced only by the commented-out call below
# w_grad_extra = linear_gradient_descent(X_train_extra, y_train_extra, w_init_extra, decay_speed=10**(-10), learn_rate=10**(-8), min_err=10**(-7), max_iter=10000000, verbose=True)
w_grad_extra = linear_closed_form(X_train_extra, y_train_extra)

# Mean squared error of the fitted weights on the validation set
y_grad_valid_extra = X_valid_extra @ w_grad_extra
squared_errors_valid_extra = (y_grad_valid_extra - y_valid_extra)**2
mse_grad_valid_extra = np.sum(squared_errors_valid_extra)/len(y_valid_extra)
print(mse_grad_valid_extra)

0.9818548715476901


In [16]:
# Baseline validation MSE minus extra-features validation MSE:
# a positive value means the extra features lowered the validation error.
print(mse_grad_valid - mse_grad_valid_extra)

0.0020848581740761096
