In [3]:
import numpy as np
from sklearn.datasets import load_boston

In [4]:
#type - Bunch
#Bunch - dictionary of the data
boston = load_boston()

In [5]:
boston

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 3

In [6]:
X = boston.data
y = boston.target

In [7]:
#==========STEP 1 : Get X,y in right shape

m, n = X.shape

In [8]:
#num of rows in X as same as num of rows in y
assert m == y.shape[0]

In [9]:
#standardize data
#mean is 0, varience = 1
#why
#cause allows us to reach convergence faster
#why -> because the values are within smaller range
#Thus, the gradients are also within limited range, and NOT go crazy

from sklearn.preprocessing import StandardScaler

#======STEP 2 : Almost always, feature scale your data

#create object 
#StandardScaler = class
scaler = StandardScaler()

# print(X[:,5])
X = scaler.fit_transform(X)
# print(X[:,5])
# print(X.mean(axis=0))

In [10]:
#=======STEP 3 : Train test split your data

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)

##### Closed from

$(X^TX)^{-1} X^TY$

In [11]:
#How to get closed form
#(XTX)^-1 X^TY
#Simple; Set the d(cost function) = 0
#And find the \theta that satisfy the equation
#When we can do such a thing in which we set the d(cost function) = 0
#--->When its strictly convave, or strictly convex
#---->They have only one local maximum (concave), minimum (convex)
#=====STEP 4: Do your computations

from numpy.linalg import inv

def closed_form(X, y):
    return inv(X.T @ X) @ X.T @ y

In [12]:
#What is the shape of X they want
#(num of sample, num of features) -----> correct shape
#(num of sample, num of sample)
#for the closed form formula
#How about the intercept

intercept = np.ones((X_train.shape[0], 1))
X_train = np.concatenate((intercept, X_train), axis=1)
print(type(X_train))
intercept = np.ones((X_test.shape[0], 1))
X_test = np.concatenate((intercept, X_test), axis=1)

<class 'numpy.ndarray'>


In [13]:
#=====STEP 5: Find the theta/weights/beta

w = closed_form(X_train, y_train)

In [14]:
w.shape

(14,)

In [15]:
#==== STEP 6 : compute the accuracy/loss

#6.1 predict ---> \theta^T X
yhat = X_test @ w

#if i want to compare yhat and y,i need to make sure thay are the same shape
assert y_test.shape == yhat.shape

In [16]:
#6.2 get the errors
errors = ((y_test - yhat)**2).sum()

In [17]:
print(errors)

2883.3450334241084


In [18]:
X_train.shape

(354, 14)

In [19]:
y_train.shape

(354,)

In [20]:
#prepare the data

#X_train,X_test have intercepts

assert X_train.shape[0] == y_train.shape[0]

w = np.zeros(X_train.shape[1])
# print(w.shape)

alpha = 0.00001
max_iter = 2000  #==> typical to call its epochs
loss_old = 10000
tol = 0.01
iter_stop = 0

#define your for loop
for i in range(max_iter):
#     1.yhat  = X@w
#     prediction
#     yhat () = (m,n)@(n, )
    yhat = X_train@w
    
#     2.error = yhat-y_train
#     error for use to calculate gradients
#     error () = (m, ) - (m, )
    error = yhat - y_train

#     2.1.early stopping
#     so we dont go through all max_iter iterations
#     (yi_hat - yi) ^2
#     loss_new(scalar) = (((m, ) - (m, ))**2/m).sum()
    loss_new = ((yhat - y_train)**2/yhat.shape[0]).sum() #<<<<mean square error
    if np.abs (loss_new-loss_old)<tol: #np.allclose
        iter_stop = i
        break
    loss_old = loss_new
    
#     3. grad = X.T@error
#     grad(n, ) = (n,m) @ (m,)
#     grad for each feature 
    grad = X_train.T@error
    
#     4. w = w-alpha*grad   
#     update w
#     #w (n, ) = (n, ) -scalar*(n, )
    w = w-alpha*grad

# we got our lovely w
# now its time to check our accuracy
#1. make prediction
yhat=X_test@w 
#2. calculate our mean square error
mse = ((yhat - y_test)**2/yhat.shape[0]).sum()

print('MSE: ', mse)
print("Stop at iteration: ", iter_stop)

MSE:  20.63701197734828
Stop at iteration:  866


In [21]:
# Stochastic  # better with decaying learning rate -> decaying alpha, so that the loss doesnt jump 
#prepare the data

#X_train,X_test have intercepts

assert X_train.shape[0] == y_train.shape[0]

w = np.zeros(X_train.shape[1])
# print(w.shape)

alpha = 0.00001
max_iter = 2000  #==> typical to call its epochs
loss_old = 10000
tol = 0.000001
iter_stop = 0
# idx = np.arange(0,X_train.shape[0],1)
# X_train.shape
# np.random.shuffle(idx)
loss = 0

#define your for loop
for i in range(max_iter):
    idx = np.arange(0,X_train.shape[0],1)
    np.random.shuffle(idx)
    for j in idx:
        yhat = X_train[j,:].reshape(1,-1)@w
        error = yhat - y_train[j]
        loss = ((yhat - y_train[j])**2)#.sum()
        loss += loss
        loss_new = loss/y_train.shape[0]
        if np.abs (loss_new-loss_old)<tol: #np.allclose
            iter_stop = i
            break
        loss_old = loss_new
        grad = X_train[j,:]*error
        w = w-alpha*grad 


yhat=X_test@w 
mse = ((yhat - y_test)**2/yhat.shape[0]).sum()
# mse = (yhat - y_test)**2

print('MSE: ', mse)
print("Stop at iteration: ", iter_stop)

MSE:  19.59057165757282
Stop at iteration:  1964


In [22]:
# def get_yhat(X,w):
#     return X@w

# def get_mse(yhat,y):
#     return ((yhat - y_train)**2/yhat.shape[0]).sum()

# def delta_loss(new,old,tol):
#     return np.abs (loss_new-loss_old)<tol
    
# def gradient(X,error):
#     return X.T@error

# max_epochs = 1000

# def h_theta_x (X,w):
    
#     ''' 
#     Xshape(m,n)
#     wshape(n, )
#     return(m, )
#     '''
#     return X@w

# for epoch in range (max_epochs): # epochs max_iters
#     idx = np.arange(0,X_train.shape[0],1)
#     np.random.shuffle(idx)
    
#     for i in idx:
# #     1. yhat = X_i @w
#         X_i = X_train[i,:]
# #     prediction
# #     yhat () = (1,n) @ (n, )
#         yhat = getyhat(X_i,w)
    
# #     2.error = yhat - y_i
# #     y_i (1, )
#         y_i = y_train[i]
# #     error = (1, )
#         error = yhat - y_i
    
    