In [122]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random

boston = pd.read_csv('Data_Set/housing.csv')
feature_names = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'medv']

In [123]:
# data cleaning, analysing and removing noise from our model using correlation matrix

boston_frame = pd.DataFrame(boston, columns = feature_names)
correlation_matirx = boston_frame.corr()

# sourceFile = open('Correlation_Matrix.txt', 'w')
# print(correlation_matirx, file = sourceFile)
# sourceFile.close()

correlation_matirx = np.array(correlation_matirx, dtype = 'float32')

print("Features with high correlation to the housing prices:")
irrelevant_features = []

for i in range(len(feature_names) - 1):
    if(abs(correlation_matirx[13][i]) > 0.3):
        print(feature_names[i])
    
    else:
        irrelevant_features.append(i)

Features with high correlation to the housing prices:
crim
zn
indus
nox
rm
age
rad
tax
ptratio
black
lstat


In [124]:
# splitting the dataset into two parts for training and testing randomly in a ratio of 80:20

boston_data = np.array(boston_frame, dtype = 'float32')
censored_data = []
boston_data = np.delete(boston_data, (irrelevant_features), 1)
for i in range(len(boston_data) - 13):
    if boston_data[i, -1] == 50:
        censored_data.append(i)

boston_data = np.delete(boston_data, (censored_data), 0)
features = boston_data[:, :-1]
prices = boston_data[:, -1]

x_train, x_test, y_train, y_test = train_test_split(features, prices, test_size = 0.2, random_state = 42)

# m = number of training examples, n = number of features
m = len(y_train)
n = 13 - len(irrelevant_features)

In [125]:
# preprocessing the data through feature scaling

def normalize(x):
    return (x - x.mean(axis = 0)) / x.std(axis = 0)

x_train = normalize(x_train)
y_train = normalize(y_train)
x_test = normalize(x_test)
y_test = normalize(y_test)

In [126]:
# cost function and gradient descent functions

def error(num, theta, x, y):
    cost = 0

    for i in range(num):
        temp = 0
        for j in range(n + 1):
            if j == 0:
                temp += theta[j]
            else:
                temp += (theta[j] * x[i][j - 1])

        cost += ((temp - y[i]) ** 2)

    return cost / (2 * num)

def gradient_descent(alpha, theta, x, y):
    temp = np.zeros(n + 1)

    for i in range(m):
        derivation = 0
        for j in range(n + 1):
            if j == 0:
                derivation += theta[j]
            else:
                derivation += (theta[j] * x[i][j - 1])

        derivation = (derivation - y[i])

        for j in range(n + 1):
            if j == 0:
                temp[j] += derivation
            else:
                temp[j] += (derivation * x[i][j - 1])

    temp = alpha * temp / m
    theta -= temp

    return theta

In [127]:
theta = []
alpha = 0.3
iterations = 75

for _ in range(n + 1):
    theta.append(random.random())

for _ in range(iterations):
    theta = gradient_descent(alpha, theta, x_train, y_train)

In [128]:
print(error(len(y_test), theta, x_test, y_test))

# for i in range(len(y_test)):
#     price = 0
#     for j in range(n + 1):
#         if j == 0:
#             price += theta[j]
#         else:
#             price += (theta[j] * x_test[i][j - 1])

#     print(price, y_test[i])


0.1316606530430089


In [137]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train, y_train)

for i in range(len(y_test)):
    price = 0
    for j in range(n + 1):
        if j == 0:
            price += theta[j]
        else:
            price += (theta[j] * x_test[i][j - 1])
    
    print(price, model.predict(x_test[i].reshape(1, -1)))

-0.6099226578420438 [-0.5938799]
0.5776160804596094 [0.576762]
-0.421707482761 [-0.41511276]
-1.598543003227313 [-1.7042704]
-1.1053001952356682 [-1.0970433]
-0.12166278543171373 [-0.11081198]
-0.9460023075043676 [-0.9531246]
1.7811738676801618 [1.7846191]
-0.4548049459029871 [-0.47842276]
1.058234375636107 [1.0739869]
0.3480510937475764 [0.36125675]
-0.25919988350462103 [-0.15617746]
-1.7338315527229187 [-1.7284436]
-0.7765143553179716 [-0.74987984]
-0.46512396028254643 [-0.45792988]
0.5842740702307765 [0.5872382]
-0.3877564129733784 [-0.37925404]
-0.33679039580257053 [-0.32916752]
-0.5318961583935556 [-0.56092006]
0.1512159170827206 [0.19684586]
0.1772563720317598 [0.16162187]
0.3974917013352614 [0.41470048]
-0.9732100827238999 [-0.9692043]
1.436140159760622 [1.4779028]
0.21500806288570112 [0.2018584]
0.22212588211692508 [0.20608027]
0.24311646446774626 [0.23288088]
1.289924918579858 [1.3159133]
1.7342860114144962 [1.7683858]
-0.7684687068940976 [-0.7526684]
0.42981202072230085 [0.39