In [1]:
import time
import numpy as np
import matplotlib.pyplot as plt

from features import PreprocessData
from linear_regression import linear_closed_form, linear_gradient_descent

# Instantiate the project's preprocessing helper (defined in features.py).
# NOTE(review): `ppd.data` is read right below, so the constructor presumably
# loads the raw dataset — confirm in features.py.
ppd = PreprocessData()

# Split dataset into the three partitions used by the rest of the notebook.
# How the split is performed (sizes, ordering, shuffling) is decided inside
# preprocess_data and is not visible from this cell.
train, validation, test = ppd.preprocess_data(ppd.data)

In [None]:
# Compute most common words from the training split only (the return value,
# if any, is discarded).
# NOTE(review): presumably this stores vocabulary state on `ppd` that
# compute_features relies on — confirm, and make sure this cell is executed
# before the feature-computation cells on a fresh kernel.
ppd.compute_most_common_words(train)

In [2]:
# Compute features on training set
# Elapsed time is taken from time.perf_counter(): it is monotonic and
# high-resolution, so the measured interval cannot be skewed by system clock
# adjustments the way a difference of time.time() readings can.
start = time.perf_counter()
X_train, y_train = ppd.compute_features(train)
print(f'Training features runtime: {time.perf_counter() - start}')
# Sanity-check the design matrix and target dimensions.
print(X_train.shape)
print(y_train.shape)

Training features runtime: 54.67300200462341
(10000, 169)
(10000,)


In [3]:
# Compute features on validation set
# Elapsed time is taken from time.perf_counter(): it is monotonic and
# high-resolution, so the measured interval cannot be skewed by system clock
# adjustments the way a difference of time.time() readings can.
start = time.perf_counter()
X_valid, y_valid = ppd.compute_features(validation)
print(f'Validation features runtime: {time.perf_counter() - start}')
# Sanity-check the design matrix and target dimensions.
print(X_valid.shape)
print(y_valid.shape)

Validation features runtime: 5.649372577667236
(1000, 169)
(1000,)


In [4]:
# Train using closed form method
# This call finishes in a few tens of milliseconds, so the timer resolution
# matters: time.perf_counter() is the high-resolution monotonic clock intended
# for short intervals, whereas time.time() can be coarse on some platforms.
start = time.perf_counter()
w_closed = linear_closed_form(X_train, y_train)
print(f'Closed form runtime: {time.perf_counter() - start}')
# One weight per feature column of X_train.
print(w_closed.shape)
print(w_closed)

Closed form runtime: 0.0246889591217041
(169,)
[ 3.72813474e-01 -1.07171749e+00 -2.39840127e-01 -5.70469727e-03
 -1.98003833e-02 -1.17976144e-02 -1.69695127e-02  1.48413874e-02
  8.09634692e-04 -5.38523613e-02  5.68031632e-02  2.91587521e-02
  2.25523375e-02  1.64385796e-02 -4.78980509e-02  5.03499956e-02
  3.17068669e-02 -5.74007125e-02 -4.74424862e-02  5.08230645e-02
  6.51800548e-02  3.96989382e-02 -4.61523244e-03  3.33011297e-02
 -3.18013480e-02 -2.81391696e-02 -2.32666644e-02  3.67059683e-02
  7.28833031e-02  2.52553794e-02 -1.04995055e-02 -3.54680688e-03
  2.02486171e-02  4.38487727e-02 -2.61883987e-02  3.79187237e-02
 -1.11101936e-02  5.46382715e-02 -8.56457285e-02  2.59442450e-03
  2.78535429e-02  7.55701091e-03 -3.09200064e-02  1.74893596e-03
  5.52236581e-02  5.03940857e-02 -2.94118240e-02 -5.84167548e-02
  8.21143376e-02 -8.77610552e-03  2.90286854e-02  3.39495642e-02
  1.73170477e-02 -8.35047328e-03  8.88709878e-03 -4.90170738e-02
 -3.26870339e-02  1.41099396e-03 -3.3150442

In [5]:
# Train using gradient descent

# Hyperparameters
# The initial weights are drawn from a seeded generator so that a
# Restart & Run All reproduces the same optimisation trajectory.
# A deterministic alternative is an all-zero start: np.zeros(X_train.shape[1]).
rng = np.random.default_rng(0)
w_init = rng.random(X_train.shape[1])  # uniform in [0, 1), one weight per feature column
# NOTE(review): the meanings below are inferred from the argument names and the
# logged "Error | Learning rate" lines — confirm in linear_regression.py.
decay_speed = 10**(-12)   # presumably controls how fast the learning rate decays
learn_rate = 4*10**(-8)   # initial learning rate
min_err = 10**(-7)        # presumably the convergence threshold on the logged error
max_iter = 10000000       # presumably the cap on the number of iterations

start = time.perf_counter()
w_grad = linear_gradient_descent(X_train, y_train, w_init, decay_speed, learn_rate, min_err, max_iter)
# Stop the clock before printing so console output of the weight vector is not
# counted as training time.
elapsed = time.perf_counter() - start
print(w_grad.shape)
print(w_grad)
print(f'Gradient descent runtime: {elapsed}')

Error: 1.7443156336966558e-05 | Learning rate: 3.9999996000000397e-08
Error: 4.252082778944286e-06 | Learning rate: 3.9999992000001606e-08
Error: 1.4317652289389837e-06 | Learning rate: 3.9999988000003604e-08
Error: 6.318267645854112e-07 | Learning rate: 3.99999840000064e-08
Error: 3.5092620684532515e-07 | Learning rate: 3.999998000001e-08
Error: 2.2777086473070577e-07 | Learning rate: 3.9999976000014406e-08
Error: 1.6050279296046633e-07 | Learning rate: 3.99999720000196e-08
Error: 1.1759602040397989e-07 | Learning rate: 3.99999680000256e-08
Finished after 855044 iterations
(169,)
[ 3.72809789e-01 -1.07102977e+00 -2.39924377e-01 -5.76377559e-03
 -1.98103660e-02 -1.18542503e-02 -1.70819038e-02  1.49976200e-02
  6.69383237e-04 -5.40080692e-02  5.72548214e-02  2.92507071e-02
  2.24754504e-02  1.60089718e-02 -4.82267272e-02  5.04044155e-02
  3.20652671e-02 -5.73852376e-02 -4.75038709e-02  5.05689071e-02
  6.52013150e-02  3.96392471e-02 -3.98711261e-03  3.37853261e-02
 -3.17506811e-02 -2.71

In [7]:
# Compute MSE on training set
y_closed_train = np.matmul(X_train, w_closed)
mse_closed_train = np.sum((y_closed_train - y_train)**2)/len(y_train)
print(mse_closed_train)

y_grad_train = np.matmul(X_train, w_grad)
mse_grad_train = np.sum((y_grad_train - y_train)**2)/len(y_train)
print(mse_grad_train)

1.0463371860983777
1.0463422161596414


In [8]:
# Compute MSE on validation set
y_closed_valid = np.matmul(X_valid, w_closed)
mse_closed_valid = np.sum((y_closed_valid - y_valid)**2)/len(y_valid)
print(mse_closed_valid)

y_grad_valid = np.matmul(X_valid, w_grad)
mse_grad_valid = np.sum((y_grad_valid - y_valid)**2)/len(y_valid)
print(mse_grad_valid)

0.9942425147893313
0.9942508493741995
