# ANN Assignment 1: Linear Regression
# Submitted By: Sarim Aeyzaz (21i-0328)

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the regression dataset (real-estate price prediction).
# Dataset Link: https://www.kaggle.com/datasets/quantbruce/real-estate-price-prediction
# NOTE(review): the path below looks like a mounted Google Drive folder in Colab;
# confirm the drive is mounted (or adjust the path) before running this cell.

csv_path = '/content/drive/MyDrive/ANN/Assignment1/Real estate.csv'
data = pd.read_csv(filepath_or_buffer=csv_path)
data

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.917,19.5,306.59470,9,24.98034,121.53951,42.2
2,3,2013.583,13.3,561.98450,5,24.98746,121.54391,47.3
3,4,2013.500,13.3,561.98450,5,24.98746,121.54391,54.8
4,5,2012.833,5.0,390.56840,5,24.97937,121.54245,43.1
...,...,...,...,...,...,...,...,...
409,410,2013.000,13.7,4082.01500,0,24.94155,121.50381,15.4
410,411,2012.667,5.6,90.45606,9,24.97433,121.54310,50.0
411,412,2013.250,18.8,390.96960,7,24.97923,121.53986,40.6
412,413,2013.000,8.1,104.81010,5,24.96674,121.54067,52.5


In [None]:
# Column 0 ("No") is skipped; columns 1-6 are the features X1..X6 and
# column 7 is the target (Y house price of unit area).
# NOTE(review): only the first 5 rows of the dataset are selected here --
# confirm this small subset is intentional and not a leftover from debugging.
first_rows = data.iloc[:5]
x = first_rows.iloc[:, 1:7]
y = first_rows.iloc[:, 7]

In [None]:
# Preview the selected feature columns (X1..X6)
x

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude
0,2012.917,32.0,84.87882,10,24.98298,121.54024
1,2012.917,19.5,306.5947,9,24.98034,121.53951
2,2013.583,13.3,561.9845,5,24.98746,121.54391
3,2013.5,13.3,561.9845,5,24.98746,121.54391
4,2012.833,5.0,390.5684,5,24.97937,121.54245


In [None]:
# Preview the selected target values (Y house price of unit area)
y

0    37.9
1    42.2
2    47.3
3    54.8
4    43.1
Name: Y house price of unit area, dtype: float64

In [None]:
class LinearRegression:
  """Linear regression fitted with batch gradient descent on the MSE loss.

  The parameters live in ``self.theta`` with shape (features + 1, 1). The last
  row is the bias term: it multiplies the column of ones that is appended as
  the last column of every feature matrix.
  """

  def __init__(self):
    # Created by Train(); None means the model has not been trained yet.
    self.theta = None

  @staticmethod
  def _add_bias_column(X):
    """Return X as an array with a trailing column of ones (the bias input)."""
    X = np.asarray(X)
    return np.hstack((X, np.ones((X.shape[0], 1))))

  # Loss function (Mean Squared Error)
  def __cost(self, Y, y_pred):
    return np.mean(np.square(Y - y_pred))

  # Calculates the partial derivative of the MSE w.r.t. every theta value
  def __derrivative(self, X, Y, y_pred):
    X = self._add_bias_column(X)
    # Formula below is basically: 2 / (no. of samples) * [ (features + 1, samples) * (samples, 1) ]
    return 2/X.shape[0] * np.dot(X.T, (y_pred - Y)) # Returns a (features + 1, 1) shape array

  # Update theta values in place using the gradient scaled by the learning rate
  def __gradient_descent(self, lr, del_theta):
    self.theta -= lr * del_theta

  def Predict(self, X):
    """Predict targets for a (samples, features) feature matrix.

    Returns an array of shape (samples, 1).
    Raises RuntimeError if the model has not been trained yet.
    """
    if self.theta is None:
      raise RuntimeError("Model has not been trained yet; call Train() first.")
    return np.dot(self._add_bias_column(X), self.theta) # (samples, features+1) * (features+1, 1)

  def Get_Weights(self):
    """Return the parameter array theta (None before training)."""
    return self.theta

  def Train(self, X, Y, alpha = 0.0001, loss_at_iter = 50, max_iter = None):
    """Fit theta to (X, Y) with batch gradient descent.

    X            : array-like of shape (samples, features).
    Y            : array-like with one target per sample.
    alpha        : learning rate.
    loss_at_iter : print the loss every this many iterations; a value that is
                   None or <= 0 disables the periodic printing.
    max_iter     : number of iterations to run. When None, training stops once
                   the relative change of the loss drops below 0.01%, once the
                   loss is exactly 0, or once the loss is no longer finite.

    Raises ValueError for an empty or non-2D X, or when X and Y disagree on
    the number of samples.
    """
    self.lr = alpha
    X = np.array(X) # Dimensions are: (samples, features)
    Y = np.array(Y).reshape(-1, 1) # Dimensions are: (samples, 1)

    if X.ndim != 2:
      raise ValueError(f"X must be 2-dimensional (samples, features), got shape {X.shape}")
    if X.shape[0] == 0:
      raise ValueError("X must contain at least one sample")
    if X.shape[0] != Y.shape[0]:
      # Without this check a single-element Y would silently broadcast.
      raise ValueError(f"X has {X.shape[0]} samples but Y has {Y.shape[0]}")

    # Generating random weights
    self.theta = np.random.random(size=(X.shape[1] + 1, 1))  # Dimensions of theta = (features + 1, 1)

    # Terminate on changes between old and new loss values when max_iter is not defined
    convergence_check = max_iter is None
    if convergence_check:
      print("Convergence Criteria will be used")

    # A non-positive / missing interval means "do not log" instead of a modulo-by-zero crash
    log_every = loss_at_iter if (loss_at_iter is not None and loss_at_iter > 0) else 0

    old_loss = float("inf") # No previous loss yet, so the first iteration can never look converged
    counter = 0

    while True:

      predictions = self.Predict(X) # (samples, 1)

      loss = self.__cost(Y, predictions) # scalar float

      if log_every and counter % log_every == 0:
        print(f"Loss at iteration {counter} = {loss}")

      del_theta = self.__derrivative(X, Y, predictions) # (features + 1, 1)

      self.__gradient_descent(alpha, del_theta)

      # Either do convergence check or max iteration check
      if convergence_check:
        if not np.isfinite(loss):
          # inf/nan never satisfies the relative-change test, so stop here instead of looping forever
          print("Loss is no longer finite, training diverged (try a smaller alpha) \n")
          break
        # loss == 0 is a perfect fit; it is tested first so the division below never sees a zero
        if loss == 0 or abs(old_loss - loss) / loss < 0.0001: # If the loss difference is lesser than 0.01%, break
          print("Convergence Reached \n")
          break
      elif counter >= max_iter - 1: # If max iterations are reached, break
        print("Maximum Iterations Reached \n")
        break

      old_loss = loss
      counter += 1

    print(f"Final Loss at iteration {counter} = {loss}")
    print("Training Complete!")

## Testing on a 1-feature dataset

In [None]:
# Fix the seed so the synthetic data and the model's random initial weights
# are the same on every Restart & Run All.
np.random.seed(42)

# Synthetic 1-feature problem: y = w*x + b + noise
data_x = np.random.randn(1000, 1)
true_w = np.random.randn()  # slope used to generate the data
true_b = np.random.randn()  # bias used to generate the data
data_y = data_x * true_w + true_b + (np.random.randn(1000, 1) * 0.9)

model = LinearRegression()
model.Train(data_x, data_y, alpha=0.001, loss_at_iter=50, max_iter=1000)

Loss at iteration 0 = 1.7983266580673793
Loss at iteration 50 = 1.613362837038734
Loss at iteration 100 = 1.4620002100698735
Loss at iteration 150 = 1.3381342518377082
Loss at iteration 200 = 1.2367695575603566
Loss at iteration 250 = 1.1538183148775394
Loss at iteration 300 = 1.085935396124549
Loss at iteration 350 = 1.0303834161047698
Loss at iteration 400 = 0.9849223099233259
Loss at iteration 450 = 0.94771897507977
Loss at iteration 500 = 0.9172733317870275
Loss at iteration 550 = 0.892357818080075
Loss at iteration 600 = 0.871967878451304
Loss at iteration 650 = 0.8552814483878659
Loss at iteration 700 = 0.8416258001987112
Loss at iteration 750 = 0.8304504125593706
Loss at iteration 800 = 0.8213047692610873
Loss at iteration 850 = 0.8138201915386514
Loss at iteration 900 = 0.807694971095866
Loss at iteration 950 = 0.802682204117698
Maximum Iterations Reached 

Final Loss at iteration 999 = 0.7986540840462888
Training Complete!


## Testing on the 6-feature dataset (maximum iterations given)

In [None]:
# Train on the real-estate features for a fixed budget of 1000 iterations,
# printing the loss every 50 iterations.
model = LinearRegression()
model.Train(x, y, alpha=1e-7, loss_at_iter=50, max_iter=1000)

Loss at iteration 0 = 4563531.437322646
Loss at iteration 50 = 257.73133832082704
Loss at iteration 100 = 142.39606302119668
Loss at iteration 150 = 80.09646554758264
Loss at iteration 200 = 46.44453566929016
Loss at iteration 250 = 28.266884557703683
Loss at iteration 300 = 18.447795449143904
Loss at iteration 350 = 13.143654090632486
Loss at iteration 400 = 10.278299918622848
Loss at iteration 450 = 8.730277344143058
Loss at iteration 500 = 7.8938226992758
Loss at iteration 550 = 7.441727644213783
Loss at iteration 600 = 7.19724784226978
Loss at iteration 650 = 7.06491335343936
Loss at iteration 700 = 6.993155286601305
Loss at iteration 750 = 6.95411827139537
Loss at iteration 800 = 6.932755995130163
Loss at iteration 850 = 6.920941065338124
Loss at iteration 900 = 6.9142834024984925
Loss at iteration 950 = 6.910411668109529
Maximum Iterations Reached 

Final Loss at iteration 999 = 6.908082676806555
Training Complete!


In [None]:
# Two query points, six feature values each (same column order as x).
# The first one repeats the feature values of the first dataset row.
test_x = np.array([
    [2012.917, 32.0, 84.87882, 10, 24.98298, 121.54024],
    [2012, 32, 84, 10, 24, 121],
])

predicted = model.Predict(test_x)
print(f"Predicted values are: {predicted}")

Predicted values are: [[36.62582392]
 [35.54930044]]


In [None]:
# Learned parameters: one row per feature, followed by the bias as the last row
weights = model.Get_Weights()
print(f"Weights are: {weights}")

Weights are: [[-0.01847694]
 [ 0.01127132]
 [ 0.03280325]
 [ 0.34696607]
 [ 0.87930063]
 [ 0.37076809]
 [ 0.17294322]]


## Testing on the 6-feature dataset (no maximum iterations given; the convergence criterion is used instead)

In [None]:
# No max_iter is passed, so training runs until the loss-change convergence
# criterion is met; the loss is printed every 50 iterations.
model = LinearRegression()
model.Train(x, y, alpha=1e-7, loss_at_iter=50)

Convergence Criteria will be used
Loss at iteration 0 = 2878250.6126392754
Loss at iteration 50 = 41.401496229219404
Loss at iteration 100 = 25.57492767084411
Loss at iteration 150 = 17.025778655884306
Loss at iteration 200 = 12.407589210552278
Loss at iteration 250 = 9.912740836249851
Loss at iteration 300 = 8.564833198809271
Loss at iteration 350 = 7.836455446097797
Loss at iteration 400 = 7.442722170690966
Loss at iteration 450 = 7.229750123754738
Loss at iteration 500 = 7.114417974141732
Loss at iteration 550 = 7.051827035648538
Convergence Reached 

Final Loss at iteration 572 = 7.034271503130858
Training Complete!


In [None]:
# Same two query points as before, now scored by the convergence-trained model.
test_x = np.array([
    [2012.917, 32.0, 84.87882, 10, 24.98298, 121.54024],
    [2012, 32, 84, 10, 24, 121],
])

predicted = model.Predict(test_x)
print(f"Predicted values are: {predicted}")

Predicted values are: [[37.06193418]
 [36.06499782]]


In [None]:
# Learned parameters: one row per feature, followed by the bias as the last row
weights = model.Get_Weights()
print(f"Weights are: {weights}")

Weights are: [[-0.04883885]
 [ 0.54807992]
 [ 0.05334319]
 [ 0.22370839]
 [ 0.88125242]
 [ 0.73552708]
 [ 0.99603872]]
