In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import regression as reg
import matplotlib.pyplot as plt

In [2]:
is_terminated = True
is_bench_mode = True

root = dict()

In [3]:
def load_data():
    df = pd.read_csv('../data/input/kc_house_data_cleaned.csv')

    X = df[df.columns[range(15)]].to_numpy()
    one = np.ones((X.shape[0], 1))
    X = np.concatenate((one, X), axis=1)
    y = df['price'].to_numpy()
    # y = (y - np.min(y)) / (np.max(y) - np.min(y))
    y = y/1e6
    y = y.reshape((X.shape[0], 1))

    return train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
X_train, X_test, y_train, y_test = load_data()
print(f'X_train: {X_train.shape}, y_train: {y_train.shape}')
print(f'X_test: {X_test.shape}, y_train: {y_test.shape}')

X_train: (17277, 16), y_train: (17277, 1)
X_test: (4320, 16), y_train: (4320, 1)


In [5]:
print(X_train)

[[1.         0.09375    0.2        ... 0.00393889 0.37068966 0.01234568]
 [1.         0.0625     0.16666667 ... 0.01060136 0.31034483 0.01234568]
 [1.         0.09375    0.3        ... 0.0049957  0.04310345 0.01234568]
 ...
 [1.         0.09375    0.13333333 ... 0.00901615 0.3362069  0.01234568]
 [1.         0.         0.03333333 ... 0.0164827  0.44827586 0.01234568]
 [1.         0.09375    0.26666667 ... 0.01399117 0.12931034 0.01234568]]


In [6]:
import matplotlib.pyplot as plt

def plot_data(lm: reg.RegressionOpt):
    plt.plot(range(len(lm.loss_func_list)), lm.loss_func_list)
    plt.title(f'Loss function', fontsize=16)
    plt.ylabel('Value', fontsize=16)
    plt.xlabel('Count', fontsize=16)
    plt.show()

    plt.plot(range(len(lm.grad_norm_list)), lm.grad_norm_list)
    plt.title(f'Gradient norm', fontsize=16)
    plt.ylabel('Value', fontsize=16)
    plt.xlabel('Count', fontsize=16)
    plt.show()

In [7]:
def save_data(lm: reg.RegressionOpt):
    d = {
        'loss_func_list': lm.loss_func_list,
        'grad_norm_list': lm.grad_norm_list
    }

    df = pd.DataFrame(data=d)

    file_name = 'lf_n_grn_' + str(lm.solver) + '_' + str(lm.step_size) + '.csv'
    df.to_csv('../data/output/' + file_name, index=False)

In [8]:
w_init = np.repeat(0, X_train.shape[1]).reshape((X_train.shape[1], 1))

GD method

In [27]:
import time

start = time.time()

lm = reg.RegressionOpt(
    solver='gd',
    max_iter=10000,
    w=w_init,
    step_size=1,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    terminate=is_terminated,
    bench_mode=is_bench_mode
)

w = lm.fit_gd()

end = time.time()

print(f'gd time: {end - start}')
# plot_data(lm)

# save_data(lm)

print(f'count: {lm.count}, gradient norm: {lm.grad_norm_list[-1]}, loss func: {lm.loss_func_list[-1]}')
# print(lm.w)

root['gd'] = np.array(lm.w)


gd time: 2.642338514328003
count: 4358, gradient norm: 0.7832325359150619, loss func: 0.21468643959334224


In [10]:
print(f'root: {root}')

root: {'gd': array([[-0.56814636],
       [-1.18578792],
       [ 0.36570304],
       [ 0.7777802 ],
       [-0.11424952],
       [ 0.07692482],
       [ 0.58113749],
       [ 0.17237658],
       [ 0.07647384],
       [ 1.23115306],
       [ 0.86055422],
       [ 0.511194  ],
       [ 0.16453468],
       [-0.44460732],
       [ 0.43438789],
       [-0.07629955]])}


Newton method

In [29]:
import time

start = time.time()

lm = reg.RegressionOpt(
    solver='newton',
    max_iter=20,
    step_size=1,
    w=w_init,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    terminate=is_terminated
)

lm.fit_newton()

end = time.time()

print(f'time: {end - start}')

# plot_data(lm)
# save_data(lm)

print(f'count: {lm.count}, gradient norm: {lm.grad_norm_list[-1]}, loss func: {lm.loss_func_list[-1]}')
print(f'w: {lm.w}')

root['newton'] = np.array(lm.w)

time: 0.0048389434814453125
count: 2, gradient norm: 4.517056224608494e-15, loss func: 0.023211385950436696
w: [[-5.56459985e-01]
 [-1.39628217e+00]
 [ 3.79679695e-01]
 [ 1.26542408e+02]
 [-4.33284961e-02]
 [ 7.55194870e-02]
 [ 5.78605578e-01]
 [ 1.70338808e-01]
 [ 7.75740426e-02]
 [ 1.21555499e+00]
 [-8.54204659e+01]
 [-4.54944179e+01]
 [ 1.63877272e-01]
 [-5.22163597e-01]
 [ 4.35033258e-01]
 [-7.88868835e-02]]


Accelerated GD method

In [33]:
import regression as reg

import time
start = time.time()

lm = reg.RegressionOpt(
    solver='agd',
    max_iter=10000,
    step_size=0.6,
    w=w_init,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    terminate=is_terminated,
    bench_mode=is_bench_mode
)

lm.fit_acc_gd()

end = time.time()

print(f'time: {end - start}')

# plot_data(lm)
# save_data(lm)

print(f'count: {lm.count}, gradient norm: {lm.grad_norm_list[-1]}, loss func: {lm.loss_func_list[-1]}')
print(lm.w)

root['agd'] = np.array(lm.w)

time: 0.17490839958190918
count: 278, gradient norm: 0.7832325359150619, loss func: 0.21468643959334224
[[-0.55187398]
 [-1.55811182]
 [ 0.39201815]
 [ 0.8169481 ]
 [-0.08525583]
 [ 0.07323636]
 [ 0.57983243]
 [ 0.168161  ]
 [ 0.07806533]
 [ 1.21997316]
 [ 0.90635672]
 [ 0.53231155]
 [ 0.15382112]
 [-0.49920104]
 [ 0.43691492]
 [-0.08442615]]


In [13]:
# delta = 1e-14
# t0 = 2e-12
# t = []
# for i in range(100):
#     t.append(t0 + i * delta)
#
# for step in t:
#     lm = reg.RegressionOpt(
#     solver='gd',
#     max_iter=500,
#     step_size=step,
#     w=w_init,
#     X_train=X_train,
#     y_train=y_train,
#     X_test=X_test,
#     y_test=y_test,
#     terminate=True
#     )
#
#     lm.fit_gd()
#
#     print(f'step size: {step}, count: {lm.count}, gradient norm: {lm.grad_norm_list[-1]}, loss function: {lm.loss_func_list[-1]}')


GD with backtracking

In [36]:
import regression as reg

import time

start = time.time()

lm = reg.RegressionOpt(
    solver='bgd',
    backtracking=True,
    max_iter=10000,
    step_size=128,
    alpha=0.5,
    beta=0.5,
    w=w_init,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    terminate=is_terminated,
    bench_mode=is_bench_mode
)

lm.fit_gd()

end = time.time()

print(f'gd time: {end - start}')

# plot_data(lm)
# save_data(lm)

print(f'count: {lm.count}, inner_count: {lm.inner_count}, gradient norm: {lm.grad_norm_list[-1]}, loss func: {lm.loss_func_list[-1]}')
# print(lm.w)

root['bgd'] = np.array(lm.w)

gd time: 1.0022284984588623
count: 286, inner_count: 1562, gradient norm: 0.7832325359150619, loss func: 0.21468643959334224


Accelerated GD with backtracking

In [15]:
# import regression as reg
#
# t = [4e-13]
# for i in range(25):
#     t.append(1.01 * t[i])
#
# # print(t)
#
# for step in t:
#     lm = reg.RegressionOpt(
#         backtracking=True,
#         max_iter=500,
#         step_size=step,
#         alpha=0.5,
#         beta=0.5,
#         w=w_init,
#         X_train=X_train,
#         y_train=y_train,
#         X_test=X_test,
#         y_test=y_test,
#         terminate=True
#     )
#
#     lm.fit_acc_gd()
#     # plot_data(lm)
#
#     print(f'step size: {step}, count: {lm.count}, gradient norm: {lm.grad_norm_list[-1]}, loss function: {lm.loss_func_list[-1]}')

In [41]:
import time

start = time.time()

lm = reg.RegressionOpt(
    solver='agd_bt',
    backtracking=True,
    max_iter=10000,
    step_size=1,
    alpha=0.5,
    beta=0.5,
    w=w_init,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    terminate=is_terminated
)

lm.fit_acc_gd()

end = time.time()

print(f'gd time: {end - start}')

# plot_data(lm)
# save_data(lm)

print(f'step size: {lm.step_size}, count: {lm.count}, inner_count: {lm.inner_count}, gradient norm: {lm.grad_norm_list[-1]}, loss function: {lm.loss_func_list[-1]}')

root['agd_bt'] = np.array(lm.w)

gd time: 0.5875453948974609
step size: 1, count: 307, inner_count: 138, gradient norm: 9.836391599532262e-05, loss function: 0.02321760553991592


In [17]:
# delta = 1e-11
# t0 = 6.251678e-4
# t = []
# for i in range(50):
#     t.append(t0 + i * delta)
# loop = 0
# for step in t:
#     loop += 1
#     lm = reg.RegressionOpt(
#     solver='agd_bt',
#     backtracking=True,
#     max_iter=50,
#     step_size=step,
#     alpha=0.5,
#     beta=0.5,
#     w=w_init,
#     X_train=X_train,
#     y_train=y_train,
#     X_test=X_test,
#     y_test=y_test,
#     terminate=True
#     )
#
#     lm.fit_acc_gd()
#     # plot_data(lm)
#     # save_data(lm)
#
#     print(f'loop: {loop}, step size: {lm.step_size}, count: {lm.count}, inner_count: {lm.inner_count}, gradient norm: {lm.grad_norm_list[-1]}, loss function: {lm.loss_func_list[-1]}')

In [18]:
# w = np.linalg.solve(np.dot(X_train.T, X_train), np.dot(X_train.T, y_train))
# print(w)

In [43]:
from sklearn.linear_model import LinearRegression

import time

start = time.time()
lm = LinearRegression()
lm.fit(X_train, y_train)
end = time.time()

print(f'time: {end - start}')
# lm.coef_
print(f'train score: {lm.score(X_train, y_train)}')
print(f'test score: {lm.score(X_test, y_test)}')

time: 0.02147078514099121
train score: 0.658982195856788
test score: 0.6352020058657151


In [20]:
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
lm.fit(X_train, y_train)
# lm.coef_
print(f'train score: {lm.score(X_train, y_train)}')
print(f'test score: {lm.score(X_test, y_test)}')

train score: 0.658982195856788
test score: 0.6352020058657151


In [21]:
# norm = np.linalg.norm(root['newton'])

# diff = dict()

# for key in root:
#     d = root[key] - root['newton']
#     diff[key] = np.linalg.norm(d) / 15

# print(f'diff: {diff}')

In [22]:
print(root)

{'gd': array([[-0.56814636],
       [-1.18578792],
       [ 0.36570304],
       [ 0.7777802 ],
       [-0.11424952],
       [ 0.07692482],
       [ 0.58113749],
       [ 0.17237658],
       [ 0.07647384],
       [ 1.23115306],
       [ 0.86055422],
       [ 0.511194  ],
       [ 0.16453468],
       [-0.44460732],
       [ 0.43438789],
       [-0.07629955]]), 'newton': array([[-5.56459985e-01],
       [-1.39628217e+00],
       [ 3.79679695e-01],
       [ 1.26542408e+02],
       [-4.33284961e-02],
       [ 7.55194870e-02],
       [ 5.78605578e-01],
       [ 1.70338808e-01],
       [ 7.75740426e-02],
       [ 1.21555499e+00],
       [-8.54204659e+01],
       [-4.54944179e+01],
       [ 1.63877272e-01],
       [-5.22163597e-01],
       [ 4.35033258e-01],
       [-7.88868835e-02]]), 'agd': array([[-0.55187398],
       [-1.55811182],
       [ 0.39201815],
       [ 0.8169481 ],
       [-0.08525583],
       [ 0.07323636],
       [ 0.57983243],
       [ 0.168161  ],
       [ 0.07806533],
      

In [23]:
from sklearn.metrics import r2_score

r2 = dict()
for key in root:
    r2[key] = r2_score(y_test, np.dot(X_test, root[key]))

print(f'r2: {r2}')

r2: {'gd': 0.6359931596812467, 'newton': 0.6351979911955383, 'agd': 0.6342705837804722, 'bgd': 0.6358671282890946, 'agd_bt': 0.6343423740136478}


In [24]:
np.linalg.solve()

TypeError: _solve_dispatcher() missing 2 required positional arguments: 'a' and 'b'