In [1]:
import time
import numpy as np
import matplotlib.pyplot as plt

from features import PreprocessData
from linear_regression import linear_closed_form, linear_gradient_descent

# Single PreprocessData instance (imported from the `features` module);
# the cells below reuse it for word statistics and feature extraction.
ppd = PreprocessData()

# Split dataset: preprocess_data() is handed the instance's own `data`
# attribute and its result is unpacked into three partitions.
# NOTE(review): split sizes and ordering are decided inside preprocess_data — confirm there.
train, validation, test = ppd.preprocess_data(ppd.data)

In [2]:
# Compute most common words from the training split only (validation and
# test are not passed in here).
# The return value is discarded, so the result is presumably stored on `ppd`
# for the later compute_features() calls — confirm in the features module.
ppd.compute_most_common_words(train)

## Experiments
### 1 - Compare runtime, stability and performance of closed-form and gradient descent

In [18]:
# Feature extraction is timed with time.perf_counter(): it is a monotonic,
# high-resolution clock meant for measuring intervals, whereas differences of
# time.time() can be distorted by system clock adjustments.
# Both runtimes are elapsed seconds.

# Compute features on training set
start = time.perf_counter()
X_train, y_train = ppd.compute_features(train, simple=True)
feat_train_runtime = time.perf_counter() - start

# Compute features on validation set
start = time.perf_counter()
X_valid, y_valid = ppd.compute_features(validation, simple=True)
feat_valid_runtime = time.perf_counter() - start

def test_closed_vs_gradient(X_train, y_train, X_valid, y_valid, hyperparams, rand_init=True):
    # Train using closed form method
    start = time.time()
    w_closed = linear_closed_form(X_train, y_train)
    w_closed_runtime = time.time() - start
    
    # Train using gradient descent
    # Hyperparameters
    w_init = np.random.rand(X_train.shape[1]) if rand_init else np.zeros(X_train.shape[1])
    decay_speed = hyperparams['decay_speed']
    learn_rate = hyperparams['learn_rate']
    min_err = hyperparams['min_err']
    max_iter = hyperparams['max_iter']

    start = time.time()
    w_grad = linear_gradient_descent(X_train, y_train, w_init, decay_speed, learn_rate, min_err, max_iter)
    w_grad_runtime = time.time() - start
    
    # Compute MSE on training set
    y_closed_train = np.matmul(X_train, w_closed)
    mse_closed_train = np.sum((y_closed_train - y_train)**2)/len(y_train)

    y_grad_train = np.matmul(X_train, w_grad)
    mse_grad_train = np.sum((y_grad_train - y_train)**2)/len(y_train)
    
    # Compute MSE on validation set
    y_closed_valid = np.matmul(X_valid, w_closed)
    mse_closed_valid = np.sum((y_closed_valid - y_valid)**2)/len(y_valid)

    y_grad_valid = np.matmul(X_valid, w_grad)
    mse_grad_valid = np.sum((y_grad_valid - y_valid)**2)/len(y_valid)
    
    return {'train': {'closed': mse_closed_train, 'grad': mse_grad_train}, 'validation': {'closed': mse_closed_valid, 'grad': mse_grad_valid}, 'runtime': {'closed': w_closed_runtime, 'grad': w_grad_runtime}}

#### 1.1 - Testing random w0 vs. zero w0

In [24]:
# Gradient-descent settings shared by both initialisation strategies.
# NOTE(review): the output recorded for this cell shows the error turning into
# nan and the run ending in a KeyboardInterrupt, which suggests gradient
# descent diverged with these values — verify the learning rate before
# relying on the results.
hyperparams = {
    'decay_speed': 10**(-12),
    'learn_rate': 10**(-4),
    'min_err': 10**(-7),
    'max_iter': 10000000,
}

# Same data and settings twice: random initial weights first, then zeros
perf_rand = test_closed_vs_gradient(
    X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid,
    hyperparams=hyperparams, rand_init=True,
)
perf_zero = test_closed_vs_gradient(
    X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid,
    hyperparams=hyperparams, rand_init=False,
)
print(perf_rand, perf_zero, sep='\n')

  w_curr = w_prev - 2*curr_learn_rate*(np.matmul(xtx_product, w_prev) - xty_product)


Error: nan | Learning rate: 9.9999990000001e-05
Error: nan | Learning rate: 9.999998000000401e-05
Error: nan | Learning rate: 9.999997000000902e-05
Error: nan | Learning rate: 9.999996000001601e-05
Error: nan | Learning rate: 9.999995000002499e-05
Error: nan | Learning rate: 9.999994000003601e-05


KeyboardInterrupt: 

In [23]:
# Relative runtime change of the random-init run versus the zero-init run,
# in percent: (random - zero) / zero * 100. A positive value means the
# random-init run took longer. Printed for the closed-form solver first,
# then for gradient descent.
print(
    *(
        ((perf_rand['runtime'][method] - perf_zero['runtime'][method]) / perf_zero['runtime'][method]) * 100
        for method in ('closed', 'grad')
    ),
    sep='\n',
)

615.7754010695188
63.18164634722616


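Relative runtime change when the gradient-descent weight vector is initialized randomly instead of with zeros, computed as (random − zero) / zero × 100: the output above shows about +616% for the closed-form method and about +63% for gradient descent, i.e. both were slower in the random-initialization run, not faster. Note that the closed-form solver never receives the initial weight vector, so its difference reflects run-to-run timing variation rather than an effect of the initialization. As for the gradient descent method, the measured impact is much smaller.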