In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import numpy.linalg
import itertools

In [2]:
link = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00477/Real%20estate%20valuation%20data%20set.xlsx'
names = ['house age', 'distance', 'n_convenience stores', 'latitude', 'longitude', 'house price']
df = pd.read_excel(link,usecols = [2,3,4,5,6,7], names=names)

In [3]:
df

Unnamed: 0,house age,distance,n_convenience stores,latitude,longitude,house price
0,32.0,84.87882,10,24.98298,121.54024,37.9
1,19.5,306.59470,9,24.98034,121.53951,42.2
2,13.3,561.98450,5,24.98746,121.54391,47.3
3,13.3,561.98450,5,24.98746,121.54391,54.8
4,5.0,390.56840,5,24.97937,121.54245,43.1
...,...,...,...,...,...,...
409,13.7,4082.01500,0,24.94155,121.50381,15.4
410,5.6,90.45606,9,24.97433,121.54310,50.0
411,18.8,390.96960,7,24.97923,121.53986,40.6
412,8.1,104.81010,5,24.96674,121.54067,52.5


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   house age             414 non-null    float64
 1   distance              414 non-null    float64
 2   n_convenience stores  414 non-null    int64  
 3   latitude              414 non-null    float64
 4   longitude             414 non-null    float64
 5   house price           414 non-null    float64
dtypes: float64(5), int64(1)
memory usage: 19.5 KB


In [5]:
df.describe()

Unnamed: 0,house age,distance,n_convenience stores,latitude,longitude,house price
count,414.0,414.0,414.0,414.0,414.0,414.0
mean,17.71256,1083.885689,4.094203,24.96903,121.533361,37.980193
std,11.392485,1262.109595,2.945562,0.01241,0.015347,13.606488
min,0.0,23.38284,0.0,24.93207,121.47353,7.6
25%,9.025,289.3248,1.0,24.963,121.528085,27.7
50%,16.1,492.2313,4.0,24.9711,121.53863,38.45
75%,28.15,1454.279,6.0,24.977455,121.543305,46.6
max,43.8,6488.021,10.0,25.01459,121.56627,117.5


In [6]:
df.head()

Unnamed: 0,house age,distance,n_convenience stores,latitude,longitude,house price
0,32.0,84.87882,10,24.98298,121.54024,37.9
1,19.5,306.5947,9,24.98034,121.53951,42.2
2,13.3,561.9845,5,24.98746,121.54391,47.3
3,13.3,561.9845,5,24.98746,121.54391,54.8
4,5.0,390.5684,5,24.97937,121.54245,43.1


In [7]:
!pip install pandas-profiling==2.7.1
from pandas_profiling import ProfileReport
profile = ProfileReport(df)



In [8]:
profile

Summarize dataset:   0%|          | 0/18 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [9]:
#profile.to_notebook_iframe()
#profile.to_file("your_report.html")

In [10]:
X = df.iloc[:,[0,1,2,3,4]]
Y = df.iloc[:,[5]]

# Spliting train and test data

In [11]:
def split_train_test(data, test_ratio):
    np.random.seed(43)
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
#data.iloc[train_indices].reset_index(drop=True, inplace=True)
train_set, test_set = split_train_test(df, 0.2)

In [12]:
X_train = train_set.copy(deep = True).iloc[:,[0,1,2,3,4]].reset_index(drop = True)
Y_train = train_set.copy(deep = True).iloc[:,[5]].reset_index(drop = True)
X_test = test_set.copy(deep = True).iloc[:,[0,1,2,3,4]].reset_index(drop = True)
Y_test = test_set.copy(deep = True).iloc[:,[5]].reset_index(drop = True)

#Gaussian Radial Basis Functions

In [13]:
def feature_scaling(train_data,test_data):
    mean = train_data.mean()
    std = train_data.std()

    train_data = (train_data - mean) / std
    test_data = (test_data - mean) / std
    
    return train_data, test_data

X_train, X_test = feature_scaling(X_train, X_test)

In [14]:
def get_centroids(X, b):
    # Find the indeces
    np.random.seed(43)
    idx = np.random.randint(np.size(X, axis=0), size= b)
    # Use indeces to grab rows
    return(X.iloc[list(idx), :]) # b by d

In [15]:
def compute_phi(X, centroids, sigmasq):
    # X is the matrix to be transformed. b is the number of centroids
    n = X.shape[0]
    b = centroids.shape[0]
    # now slowly construct the matrix by doing a double loop
    values = []
    for x in X.values:
        for c in centroids.values:
            values.append(np.exp(-np.sum((x - c)**2) / (2*sigmasq)))
    # now simply reshape it
    phi_X = np.concatenate((np.ones((n,1)),np.reshape(values, (n, b))), axis = 1) # n by b+1 matrix
    return phi_X

In [16]:
idx = np.random.randint(np.size(X_train, axis=0), size= 5)
compute_phi(X_train, X_train.iloc[list(idx), :] , 1)

array([[1.00000000e+00, 1.44418498e-01, 2.13833329e-01, 3.29981032e-01,
        2.78428662e-02, 1.44418498e-01],
       [1.00000000e+00, 4.25622523e-01, 2.41278763e-02, 1.21648715e-02,
        5.29170099e-03, 4.25622523e-01],
       [1.00000000e+00, 4.62952795e-03, 3.24323369e-03, 3.99242089e-03,
        1.01414409e-03, 4.62952795e-03],
       ...,
       [1.00000000e+00, 2.32545696e-10, 2.46545600e-04, 3.25474309e-07,
        3.78730363e-03, 2.32545696e-10],
       [1.00000000e+00, 2.20786976e-07, 4.15089879e-03, 2.15113823e-05,
        2.45677743e-02, 2.20786976e-07],
       [1.00000000e+00, 2.27089404e-02, 7.46336413e-01, 1.29062444e-01,
        4.67768734e-01, 2.27089404e-02]])

In [17]:
def pred_RBF(X, w, centroids, sigmasq):
    phi_X_test = compute_phi(X, centroids, sigmasq)
    return np.dot(phi_X_test, w)

# Linear Regression Method.
# Closed Form

In [18]:
#LR: Linear Regression
#CF: Closed Form
def train_lr_cf(x,y):
    """
		Build the linear least weight vector W
		:param x: N x D matrix containing N attributes vectors for training
		:param y: N x 1 matrix containing N class vectors for training
	"""
    # n : Number of samples
    # d : Number of features
    n, d = x.shape
    x_new = np.concatenate((np.ones((n,1)), x), axis = 1)

    xtx = np.dot(x_new.T,x_new)
    xty = np.dot(x_new.T,y)
    # Return weight vector
    w = np.dot(np.linalg.inv(xtx),xty)
    return w
def predict(x, w):
    n, d = x.shape
    
    x_new = np.concatenate((np.ones((n,1)), x.copy(deep = True)), axis = 1)	
    y_hat = np.dot(x_new, w)

    return y_hat 

def computing_rmse(y, y_hat):
    e = np.sqrt(np.square(np.subtract(y,y_hat)).mean())
    return float(e)

In [19]:
w = train_lr_cf(X_train, Y_train)
Y_pred_lr_cf_train = predict(X_train, w)
print(computing_rmse(Y_train, Y_pred_lr_cf_train))

Y_pred_lr_cf_test = predict(X_test, w)
print(computing_rmse(Y_test, Y_pred_lr_cf_test))

8.052719178137517
11.845979863291463


# Ridge Regression Method
# Closed Form

In [20]:
#RR: ridge Regression
def train_rr_cf(x,y, alpha=0.1):
    n, d = x.shape 
    # x0 = 1 
    x_new = np.concatenate((np.ones((n,1)), x), axis = 1)
    A = np.identity(x_new.shape[1]) # A is a d+1 by d+1 identity matrix
    A[0,0] = 0 # bias term should be 0
    xtx = np.dot(x_new.T,x_new)
    aA = np.multiply(alpha, A)
    xty = np.dot(x_new.T,y)
    w = np.dot(np.linalg.inv(np.add(xtx,aA)),xty)
    return w

In [21]:
rmse_rr_cf_train = []
rmse_rr_cf_test = []

for i in np.linspace(0.01, 0.3, 5):
    w = train_rr_cf(X_train, Y_train, i)
    Y_pred_rr_cf_train = predict(X_train, w)
    rmse_rr_cf_train.append(Y_pred_rr_cf_train)
    print("train rmse for alpha =", i , "is:", computing_rmse(Y_train, Y_pred_rr_cf_train))

    Y_pred_rr_cf_test = predict(X_test, w)
    rmse_rr_cf_test.append(Y_pred_rr_cf_test)
    print("test rmse for alpha =", i , "is:", computing_rmse(Y_test, Y_pred_rr_cf_test))
    print('-'*25)


train rmse for alpha = 0.01 is: 8.05271918156132
test rmse for alpha = 0.01 is: 11.845989879186828
-------------------------
train rmse for alpha = 0.08249999999999999 is: 8.052719410830695
test rmse for alpha = 0.08249999999999999 is: 11.846062305039666
-------------------------
train rmse for alpha = 0.155 is: 8.052719998315135
test rmse for alpha = 0.155 is: 11.846134399301652
-------------------------
train rmse for alpha = 0.22749999999999998 is: 8.052720942458217
test rmse for alpha = 0.22749999999999998 is: 11.846206163753141
-------------------------
train rmse for alpha = 0.3 is: 8.05272224171511
test rmse for alpha = 0.3 is: 11.846277600165461
-------------------------


# Ridge with Gradient Descent

In [22]:
def train_rr_gd(x, y, alpha=0.2, lr=0.05, epoch=150):
    """
		Use an iterative method to get best weights for ridge regression.
		:param x: N x D matrix containing N attributes vectors for training
		:param y: N x 1 matrix containing N class vectors for training
	"""
    # n : Number of samples
    # d : Number of features
    n, d = x.shape
    x_new = np.concatenate((np.ones((n,1)), x), axis = 1)
    
    w = np.random.randn(d+1,1)
    gradients = np.zeros((d,1)) 
    
    for _ in range(epoch):
        gradients = 2/n * (x_new.T.dot(x_new.dot(w) - y))
        gradients[1:] += np.multiply(2/n, np.multiply(alpha, w[1:]))
        w = w - lr * gradients
    return w

In [23]:
rmse_rr_gd_train = []
rmse_rr_gd_test = []

lrs = np.linspace(0.01, 0.3, 5)
alphas = np.linspace(0.1, 0.4, 4)
for lr in lrs:
    for alpha in alphas:
        w = train_rr_gd(X_train, Y_train, alpha = alpha, lr = lr , epoch = 150).reshape((-1,1))
        Y_pred_rr_gd_train = predict(X_train, w)
        rmse_rr_gd_train.append(Y_pred_rr_gd_train)
        print("train rmse for learning rate =", lr , "and alpha", alpha, " is:", computing_rmse(Y_train, Y_pred_rr_gd_train))

        Y_pred_rr_gd_test = predict(X_test, w)
        rmse_rr_gd_test.append(Y_pred_rr_gd_test)
        print("test rmse for learning rate =", lr , "and alpha", alpha, " is:", computing_rmse(Y_test, Y_pred_rr_gd_test))
        print('-'*90)


train rmse for learning rate = 0.01 and alpha 0.1  is: 8.269948254273638
test rmse for learning rate = 0.01 and alpha 0.1  is: 11.945538495042321
------------------------------------------------------------------------------------------
train rmse for learning rate = 0.01 and alpha 0.2  is: 8.270010849702045
test rmse for learning rate = 0.01 and alpha 0.2  is: 12.010594429079356
------------------------------------------------------------------------------------------
train rmse for learning rate = 0.01 and alpha 0.30000000000000004  is: 8.289611026924645
test rmse for learning rate = 0.01 and alpha 0.30000000000000004  is: 11.971163898857556
------------------------------------------------------------------------------------------
train rmse for learning rate = 0.01 and alpha 0.4  is: 8.2654328219058
test rmse for learning rate = 0.01 and alpha 0.4  is: 12.010104863872847
------------------------------------------------------------------------------------------
train rmse for learnin

# Lasso Regression with Gradient Descent

In [24]:
def train_lasso_gd(x, y, alpha, lr, epoch=150):
    """
        Use an iterative method to get best weights for lasso regression.

		:param x: N x D matrix containing N attributes vectors for training
		:param y: N x 1 matrix containing N class vectors for training
	"""
    # n : Number of samples
    # d : Number of features
    n, d = x.shape
    x_new = np.concatenate((np.ones((n,1)), x), axis = 1)
    
    w = np.random.randn(d+1,1)
    
    gradients = np.zeros((x.shape[1],1))
    gradients = np.zeros((d,1)) 
    
    for _ in range(epoch):
        gradients = 2/n * (x_new.T.dot(x_new.dot(w) - y))
        gradients[1:] += np.multiply(2/n, np.multiply(alpha, w[1:]))
        gradients[1:] +=  alpha * np.sign(w[1:]) / n #Regularization term
        w = w - lr * gradients        
    return w

In [25]:
rmse_lasso_gd_train = []
rmse_lasso_gd_test = []

lrs = np.linspace(0.01, 0.25, 5)
alphas = np.linspace(0.1, 0.4, 4)
for lr in lrs:
    for alpha in alphas:
        w = train_lasso_gd(X_train, Y_train, alpha = alpha, lr = lr , epoch = 500).reshape((-1,1))
        Y_pred_lasso_gd_train = predict(X_train, w)
        rmse_lasso_gd_train.append(Y_pred_lasso_gd_train)
        print("train rmse for learning rate =", lr , "and alpha", alpha, " is:", computing_rmse(Y_train, Y_pred_lasso_gd_train))

        Y_pred_lasso_gd_test = predict(X_test, w)
        rmse_lasso_gd_test.append(Y_pred_lasso_gd_test)
        print("test rmse for learning rate =", lr , "and alpha", alpha, " is:", computing_rmse(Y_test, Y_pred_lasso_gd_test))
        print('-'*90)

train rmse for learning rate = 0.01 and alpha 0.1  is: 8.054822333180368
test rmse for learning rate = 0.01 and alpha 0.1  is: 11.882988230211483
------------------------------------------------------------------------------------------
train rmse for learning rate = 0.01 and alpha 0.2  is: 8.053447758779821
test rmse for learning rate = 0.01 and alpha 0.2  is: 11.865506201392163
------------------------------------------------------------------------------------------
train rmse for learning rate = 0.01 and alpha 0.30000000000000004  is: 8.053541827504976
test rmse for learning rate = 0.01 and alpha 0.30000000000000004  is: 11.867639222856324
------------------------------------------------------------------------------------------
train rmse for learning rate = 0.01 and alpha 0.4  is: 8.057027608301153
test rmse for learning rate = 0.01 and alpha 0.4  is: 11.898544511603275
------------------------------------------------------------------------------------------
train rmse for learn

# Ridge Regression with Stochastic Gradient Descent

In [26]:
def train_rr_sgd(x, y, alpha, lr, epoch = 400):
    """
        Use stochastic gradient descent method to get best weights for ridge regression.

		:param x: N x D matrix containing N attributes vectors for training
		:param y: N x 1 matrix containing N class vectors for training
	"""
    # n : Number of samples
    # d : Number of features
    n, d = x.shape
    x_new = np.concatenate((np.ones((n,1)), x), axis = 1)
    
    w = np.random.randn(d+1,1)
    for _ in range(epoch):
        rand_i = np.random.randint(n)
        xi, yi = x_new[rand_i:rand_i+1], y[rand_i:rand_i+1]

        gradients = 2 * xi.T.dot(xi.dot(w) - yi)
        gradients[1:] += 2 * alpha * w[1:]
        w = w - lr * gradients
    return w

In [27]:
rmse_rr_sgd_train = []
rmse_rr_sgd_test = []

lrs = np.linspace(0.005, 0.03, 5)
alphas = np.linspace(0.2, 0.4, 4)
for lr in lrs:
    for alpha in alphas:
        w = train_rr_sgd(X_train, Y_train, alpha = alpha, lr = lr , epoch = 500).reshape((-1,1))
        Y_pred_rr_sgd_train = predict(X_train, w)
        rmse_rr_sgd_train.append(Y_pred_rr_sgd_train)
        print("train rmse for learning rate =", lr , "and alpha", alpha, " is:", computing_rmse(Y_train, Y_pred_rr_sgd_train))

        Y_pred_rr_sgd_test = predict(X_test, w)
        rmse_rr_sgd_test.append(Y_pred_rr_sgd_test)
        print("test rmse for learning rate =", lr , "and alpha", alpha, " is:", computing_rmse(Y_test, Y_pred_rr_sgd_test))
        print('-'*90)

train rmse for learning rate = 0.005 and alpha 0.2  is: 8.565474689755815
test rmse for learning rate = 0.005 and alpha 0.2  is: 12.056516285500658
------------------------------------------------------------------------------------------
train rmse for learning rate = 0.005 and alpha 0.26666666666666666  is: 8.133176132038745
test rmse for learning rate = 0.005 and alpha 0.26666666666666666  is: 12.07189812568872
------------------------------------------------------------------------------------------
train rmse for learning rate = 0.005 and alpha 0.33333333333333337  is: 8.206543019391809
test rmse for learning rate = 0.005 and alpha 0.33333333333333337  is: 12.117637112878608
------------------------------------------------------------------------------------------
train rmse for learning rate = 0.005 and alpha 0.4  is: 8.446244248637747
test rmse for learning rate = 0.005 and alpha 0.4  is: 12.00974316878664
-------------------------------------------------------------------------

# Lasso Regression with Stochastic Gradient Descent

In [28]:
def train_lasso_sgd(x, y, alpha, lr, epoch=400):
    """
        Use stochastic gradient descent method to get best weights for lasso regression.

		:param x: N x D matrix containing N attributes vectors for training
		:param y: N x 1 matrix containing N class vectors for training
	"""
    # n : Number of samples
    # d : Number of features
    n, d = x.shape
    x_new = np.concatenate((np.ones((n,1)), x), axis = 1)
    
    w = np.random.randn(d+1,1)

    for _ in range(epoch):
        rand_i = np.random.randint(n)
        xi, yi = x_new[rand_i:rand_i+1], y[rand_i:rand_i+1]

        gradients = 2 * xi.T.dot(xi.dot(w) - yi)
        gradients[1:] += 2 * np.sign(w[1:])
        w = w - lr * gradients
    return w

In [29]:
rmse_lasso_sgd_train = []
rmse_lasso_sgd_test = []

lrs = np.linspace(0.005, 0.03, 5)
alphas = np.linspace(0.2, 0.4, 4)
for lr in lrs:
    for alpha in alphas:
        w = train_lasso_sgd(X_train, Y_train, alpha = alpha, lr = lr , epoch = 500).reshape((-1,1))
        Y_pred_lasso_sgd_train = predict(X_train, w)
        rmse_lasso_sgd_train.append(Y_pred_lasso_sgd_train)
        print("train rmse for learning rate =", lr , "and alpha", alpha, " is:", computing_rmse(Y_train, Y_pred_lasso_sgd_train))

        Y_pred_lasso_sgd_test = predict(X_test, w)
        rmse_lasso_sgd_test.append(Y_pred_lasso_sgd_test)
        print("test rmse for learning rate =", lr , "and alpha", alpha, " is:", computing_rmse(Y_test, Y_pred_lasso_sgd_test))
        print('-'*90)

train rmse for learning rate = 0.005 and alpha 0.2  is: 8.435086743350901
test rmse for learning rate = 0.005 and alpha 0.2  is: 12.170317354109654
------------------------------------------------------------------------------------------
train rmse for learning rate = 0.005 and alpha 0.26666666666666666  is: 8.303225685567698
test rmse for learning rate = 0.005 and alpha 0.26666666666666666  is: 11.970262132727065
------------------------------------------------------------------------------------------
train rmse for learning rate = 0.005 and alpha 0.33333333333333337  is: 8.337415657295763
test rmse for learning rate = 0.005 and alpha 0.33333333333333337  is: 12.066905072357907
------------------------------------------------------------------------------------------
train rmse for learning rate = 0.005 and alpha 0.4  is: 8.172059736335164
test rmse for learning rate = 0.005 and alpha 0.4  is: 11.70285141293292
------------------------------------------------------------------------