In [1]:
import time

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import regression as reg
import matplotlib.pyplot as plt

In [2]:
# Notebook-wide switches. Both are forwarded unchanged to the
# reg.RegressionOpt(...) constructor calls in the solver cells below.
bench_mode = True      # passed as the `bench_mode=` keyword
is_terminated = True   # passed as the `terminate=` keyword

In [3]:
def load_data(path='../data/input/kc_house_data_cleaned.csv', n_features=15,
              target='price', y_scale=1e6, test_size=0.2, random_state=42):
    """Load the housing CSV and return a train/test split for regression.

    The design matrix is a leading column of ones followed by the first
    ``n_features`` columns of the file (selected by position). The target
    column is divided by ``y_scale`` and returned as a column vector.

    Parameters
    ----------
    path : str
        CSV file to read.
    n_features : int
        Number of leading columns used as features.
    target : str
        Name of the column used as the regression target.
    y_scale : float
        Divisor applied to the target values.
    test_size, random_state
        Forwarded to ``train_test_split``.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Whatever ``train_test_split(X, y, ...)`` returns, in that order.

    Raises
    ------
    ValueError
        If the file has fewer than ``n_features`` columns.
    """
    df = pd.read_csv(path)

    # Positional slicing would silently return fewer columns, so check first.
    if df.shape[1] < n_features:
        raise ValueError(
            f'expected at least {n_features} columns, found {df.shape[1]}'
        )

    # NOTE(review): features are picked by position; this assumes `target`
    # is not among the first `n_features` columns -- confirm against the CSV.
    features = df.iloc[:, :n_features].to_numpy()

    # Prepend a constant column of ones as the first column of X.
    ones = np.ones((features.shape[0], 1))
    X = np.concatenate((ones, features), axis=1)

    y = (df[target].to_numpy() / y_scale).reshape(-1, 1)

    return train_test_split(X, y, test_size=test_size, random_state=random_state)

In [4]:
# Build the split once; the solver cells below all reuse these four arrays.
X_train, X_test, y_train, y_test = load_data()
print(f'X_train: {X_train.shape}, y_train: {y_train.shape}')
# The second value printed here is y_test's shape, so it is labelled y_test
# (it was previously mislabelled as y_train).
print(f'X_test: {X_test.shape}, y_test: {y_test.shape}')

X_train: (17277, 16), y_train: (17277, 1)
X_test: (4320, 16), y_train: (4320, 1)


In [5]:
import matplotlib.pyplot as plt

def plot_data(lm: reg.RegressionOpt):
    """Plot the histories recorded on ``lm`` as two separate figures.

    The first figure shows ``lm.loss_func_list`` and the second shows
    ``lm.grad_norm_list``; each series is drawn against its 0-based position
    in the list.
    """
    panels = (
        ('Loss function', 'loss_func_list'),
        ('Gradient norm', 'grad_norm_list'),
    )
    for heading, attr_name in panels:
        # Look the list up only when its figure is about to be drawn.
        series = getattr(lm, attr_name)
        plt.plot(range(len(series)), series)
        plt.title(heading, fontsize=16)
        plt.ylabel('Value', fontsize=16)
        plt.xlabel('Count', fontsize=16)
        plt.show()

In [6]:
def save_data(lm: reg.RegressionOpt):
    """Write the histories recorded on ``lm`` to a CSV file.

    The file has two columns, ``loss_func_list`` and ``grad_norm_list``, and
    no index column. It is written under ``../data/output/`` with a name
    built from ``lm.solver`` and ``lm.step_size``.
    """
    history = pd.DataFrame(data={
        'loss_func_list': lm.loss_func_list,
        'grad_norm_list': lm.grad_norm_list,
    })

    # `!s` applies str() to each value, matching explicit str(...) concatenation.
    file_name = f'lf_n_grn_{lm.solver!s}_{lm.step_size!s}.csv'
    history.to_csv('../data/output/' + file_name, index=False)

In [7]:
# Zero starting point for the weights: one row per column of X_train.
# np.zeros yields a float64 column vector; the previous np.repeat(0, ...)
# construction produced an integer array for what are real-valued weights.
# NOTE(review): the same array object is handed to every solver cell below;
# confirm reg.RegressionOpt does not update `w` in place.
w_init = np.zeros((X_train.shape[1], 1))

GD method

In [8]:
# Gradient descent with a fixed step size.
# Timing uses time.perf_counter(): a monotonic, high-resolution clock meant
# for measuring short durations (time.time() follows the system clock and
# can be coarse or jump).
start = time.perf_counter()

lm = reg.RegressionOpt(
    solver='gd',
    max_iter=10000,
    w=w_init,
    step_size=1,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    terminate=is_terminated,
    bench_mode=bench_mode
)

w = lm.fit_gd()

end = time.perf_counter()

# plot_data(lm)

# save_data(lm)
print(f'time: {end - start}')
# NOTE(review): in the saved outputs, every cell run with bench_mode=True
# reports the same gradient norm and loss; presumably the history lists are
# not extended in bench mode, so [-1] would be the initial value -- confirm
# in regression.py before comparing these numbers.
print(f'count: {lm.count}, gradient norm: {lm.grad_norm_list[-1]}, loss func: {lm.loss_func_list[-1]}')
# print(lm.w)

time: 0.2704741954803467
count: 4358, gradient norm: 0.7832325359150619, loss func: 0.21468643959334224


Newton method

In [18]:
# Newton's method.
# Timing uses time.perf_counter(), the monotonic high-resolution clock meant
# for measuring elapsed time, instead of the wall-clock time.time().
start = time.perf_counter()

lm = reg.RegressionOpt(
    solver='newton',
    max_iter=10000,
    step_size=1,
    w=w_init,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    terminate=is_terminated,
    bench_mode=bench_mode
)

lm.fit_newton()

end = time.perf_counter()

# plot_data(lm)
# save_data(lm)

print(f'time: {end - start}')
# NOTE(review): with bench_mode=True the history lists may not be updated;
# verify that [-1] really is the final iterate before trusting these values.
print(f'count: {lm.count}, gradient norm: {lm.grad_norm_list[-1]}, loss func: {lm.loss_func_list[-1]}')

time: 134.07388353347778
count: 2, gradient norm: 0.7832325359150619, loss func: 0.21468643959334224


Accelerated GD method

In [10]:
# Accelerated gradient descent with a fixed step size.
# Timing uses time.perf_counter(): this run takes only a few milliseconds,
# so a monotonic high-resolution clock is needed for a meaningful reading.
start = time.perf_counter()

lm = reg.RegressionOpt(
    solver='agd',
    max_iter=10000,
    step_size=0.6,
    w=w_init,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    terminate=is_terminated,
    bench_mode=bench_mode
)

lm.fit_acc_gd()

end = time.perf_counter()

# plot_data(lm)
# save_data(lm)

print(f'time: {end - start}')
# NOTE(review): with bench_mode=True the history lists may not be updated;
# verify that [-1] really is the final iterate before trusting these values.
print(f'count: {lm.count}, gradient norm: {lm.grad_norm_list[-1]}, loss func: {lm.loss_func_list[-1]}')
# print(lm.w)

time: 0.013960599899291992
count: 278, gradient norm: 0.7832325359150619, loss func: 0.21468643959334224


In [11]:
# delta = 1e-14
# t0 = 2e-12
# t = []
# for i in range(100):
#     t.append(t0 + i * delta)
#
# for step in t:
#     lm = reg.RegressionOpt(
#     solver='gd',
#     max_iter=500,
#     step_size=step,
#     w=w_init,
#     X_train=X_train,
#     y_train=y_train,
#     X_test=X_test,
#     y_test=y_test,
#     terminate=True
#     )
#
#     lm.fit_gd()
#
#     print(f'step size: {step}, count: {lm.count}, gradient norm: {lm.grad_norm_list[-1]}, loss function: {lm.loss_func_list[-1]}')


GD with backtracking

In [12]:
# Gradient descent with backtracking line search (alpha/beta parameters).
# Timing uses time.perf_counter(), the monotonic high-resolution clock meant
# for measuring elapsed time, instead of the wall-clock time.time().
# NOTE(review): unlike the other solver cells, `bench_mode` is not passed
# here, so this run uses the constructor's default; its timing is presumably
# not directly comparable with the bench-mode runs -- confirm intent.
start = time.perf_counter()

lm = reg.RegressionOpt(
    solver='bgd',
    backtracking=True,
    max_iter=10000,
    step_size=128,
    alpha=0.5,
    beta=0.5,
    w=w_init,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    terminate=is_terminated
)

lm.fit_gd()

end = time.perf_counter()

# plot_data(lm)
# save_data(lm)

print(f'time: {end - start}')
print(f'count: {lm.count}, inner_count: {lm.inner_count}, gradient norm: {lm.grad_norm_list[-1]}, loss func: {lm.loss_func_list[-1]}')
# print(lm.w)

time: 1.631587266921997
count: 286, inner_count: 1562, gradient norm: 9.868312630891296e-05, loss func: 0.023221719759706813


Accelerated GD with backtracking

In [13]:
# import regression as reg
#
# t = [4e-13]
# for i in range(25):
#     t.append(1.01 * t[i])
#
# # print(t)
#
# for step in t:
#     lm = reg.RegressionOpt(
#         backtracking=True,
#         max_iter=500,
#         step_size=step,
#         alpha=0.5,
#         beta=0.5,
#         w=w_init,
#         X_train=X_train,
#         y_train=y_train,
#         X_test=X_test,
#         y_test=y_test,
#         terminate=True
#     )
#
#     lm.fit_acc_gd()
#     # plot_data(lm)
#
#     print(f'step size: {step}, count: {lm.count}, gradient norm: {lm.grad_norm_list[-1]}, loss function: {lm.loss_func_list[-1]}')

In [14]:
# Accelerated gradient descent with backtracking line search.
# Timing uses time.perf_counter(), the monotonic high-resolution clock meant
# for measuring elapsed time, instead of the wall-clock time.time().
start = time.perf_counter()

lm = reg.RegressionOpt(
    solver='agd_bt',
    backtracking=True,
    max_iter=10000,
    step_size=3,
    alpha=0.5,
    beta=0.5,
    w=w_init,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    terminate=is_terminated,
    bench_mode=bench_mode
)

lm.fit_acc_gd()

end = time.perf_counter()

# plot_data(lm)
# save_data(lm)

print(f'time: {end - start}')
# NOTE(review): with bench_mode=True the history lists may not be updated;
# verify that [-1] really is the final iterate before trusting these values.
print(f'step size: {lm.step_size}, count: {lm.count}, inner_count: {lm.inner_count}, gradient norm: {lm.grad_norm_list[-1]}, loss function: {lm.loss_func_list[-1]}')

time: 0.39382386207580566
step size: 3, count: 161, inner_count: 314, gradient norm: 0.7832325359150619, loss function: 0.21468643959334224


In [15]:
# Closed-form least-squares weights, as a reference for the iterative solvers.
# np.linalg.lstsq minimises ||X w - y|| directly from X instead of forming and
# solving the normal equations X^T X w = X^T y. It stays well-defined when the
# columns of X are (nearly) linearly dependent: it then returns the solution
# with the smallest 2-norm instead of failing or returning inflated weights.
# rcond=None selects NumPy's machine-precision-based cutoff for small
# singular values.
w = np.linalg.lstsq(X_train, y_train, rcond=None)[0]
print(w)

[[-5.56459985e-01]
 [-1.39628217e+00]
 [ 3.79679695e-01]
 [-2.86891388e+02]
 [-4.33284961e-02]
 [ 7.55194870e-02]
 [ 5.78605578e-01]
 [ 1.70338808e-01]
 [ 7.75740426e-02]
 [ 1.21555499e+00]
 [ 1.98364007e+02]
 [ 1.05815445e+02]
 [ 1.63877272e-01]
 [-5.22163597e-01]
 [ 4.35033258e-01]
 [-7.88868835e-02]]


In [16]:
from sklearn.linear_model import LinearRegression

# scikit-learn baseline, timed with the monotonic high-resolution
# time.perf_counter() clock rather than the wall-clock time.time().
# NOTE(review): X_train already carries a leading column of ones while
# LinearRegression() is created with its default settings; confirm whether
# fitting an extra intercept on top of that column is intended.
start = time.perf_counter()
lm = LinearRegression()
lm.fit(X_train, y_train)
end = time.perf_counter()
# lm.coef_
print(f'time: {end - start}')
print(f'train score: {lm.score(X_train, y_train)}')
print(f'test score: {lm.score(X_test, y_test)}')

time: 0.049143075942993164
train score: 0.658982195856788
test score: 0.6352020058657151
