In [61]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random

boston = pd.read_csv('Data_Set/housing.csv')
feature_names = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'medv']

In [62]:
# data cleaning, analysing and removing noise from our model using correlation matrix

boston_frame = pd.DataFrame(boston, columns = feature_names)
correlation_matirx = boston_frame.corr()

# sourceFile = open('Correlation_Matrix.txt', 'w')
# print(correlation_matirx, file = sourceFile)
# sourceFile.close()

correlation_matirx = np.array(correlation_matirx, dtype = 'float32')

print("Features with high correlation to the housing prices:")
irrelevant_features = []

for i in range(len(feature_names) - 1):
    if(abs(correlation_matirx[13][i]) > 0.3):
        print(feature_names[i])
    
    else:
        irrelevant_features.append(i)

Features with high correlation to the housing prices:
crim
zn
indus
nox
rm
age
rad
tax
ptratio
black
lstat


In [63]:
# splitting the dataset into two parts for training and testing randomly in a ratio of 80:20

boston_data = np.array(boston_frame, dtype = 'float32')

boston_data = np.delete(boston_data, (irrelevant_features), 1)
features = boston_data[:, :-1]
prices = boston_data[:, -1]

x_train, x_test, y_train, y_test = train_test_split(features, prices, test_size = 0.2, random_state = 42)

# m = number of training examples, n = number of features
m = len(y_train)
n = 13 - len(irrelevant_features)

In [64]:
# preprocessing the data through feature scaling

def normalize(x):
    return (x - x.mean(axis = 0)) / x.std(axis = 0)

xo = x_test
yo = y_test
x_train = normalize(x_train)
y_train = normalize(y_train)
x_test = normalize(x_test)
y_test = normalize(y_test)

In [65]:
# cost function and gradient descent functions

def error(num, theta, x, y):
    cost = 0

    for i in range(num):
        temp = 0
        for j in range(n + 1):
            if j == 0:
                temp += theta[j]
            else:
                temp += (theta[j] * x[i][j - 1])

        cost += ((temp - y[i]) ** 2)

    return cost / (2 * m)

def gradient_descent(alpha, theta, x, y):
    temp = np.zeros(n + 1)

    for i in range(m):
        derivation = 0
        for j in range(n + 1):
            if j == 0:
                derivation += theta[j]
            else:
                derivation += (theta[j] * x[i][j - 1])

        derivation = (derivation - y[i])

        for j in range(n + 1):
            if j == 0:
                temp[j] += derivation
            else:
                temp[j] += (derivation * x[i][j - 1])

    temp = alpha * temp / m
    theta -= temp

    return theta

In [66]:
theta = []
alpha = 0.3
iterations = 75

for _ in range(n + 1):
    theta.append(random.random())

for _ in range(iterations):
    print(error(m, theta, x_train, y_train))
    theta = gradient_descent(alpha, theta, x_train, y_train)

3.9758413446782757
1.736736240875559
0.8307602809651498
0.4587522675805239
0.3020856662400888
0.2332643203278807
0.20097369697148873
0.1843630544951446
0.1748263580313563
0.16871994806040622
0.16443816483040213
0.16123037620780273
0.1587168474101369
0.15668693256565988
0.15501251129406116
0.15360923888945974
0.15241813632065712
0.15139616162043046
0.15051096088631516
0.14973770059962796
0.14905702042808094
0.1484536387353872
0.14791536595271165
0.147432387539793
0.14699673275134664
0.14660187537631242
0.1462424302673479
0.1459139205205207
0.14561259740996646
0.14533530010887744
0.14507934566842573
0.14484244217675965
0.1446226197926038
0.1444181756456004
0.1442276295551246
0.14404968823499198
0.14388321618921468
0.14372721191063187
0.14358078830358664
0.1434431564883737
0.1433136123269333
0.1431915251495562
0.14307632827106173
0.1429675109695094
0.1428646116665511
0.14276721210032245
0.1426749323225089
0.1425874263833956
0.14250437859420578
0.14242550027632211
0.14235052692318803
0.142

In [67]:
print(error(len(y_test), theta, xo, yo))

for i in range(len(y_test)):
    price = 0
    for j in range(n + 1):
        if j == 0:
            price += theta[j]
        else:
            price += (theta[j] * xo[i][j - 1])

    print(price, yo[i])
# import matplotlib.pyplot as plt

# y = y_train
# x = x_train[:, 0]

# plt.plot(x, y, 'ro')

94.84612431348353
24.414518284557783 23.6
25.553710248054998 32.4
-18.821770542017862 13.6
18.678649703042744 22.8
-12.989093681345969 16.1
9.858631550327209 20.0
18.80175843882489 17.8
4.13623486294235 14.0
-9.874856221299826 19.6
12.683138993370275 16.8
24.189726413377038 21.5
18.590147526245982 18.9
-63.237389679284526 7.0
9.934712645598474 21.2
22.178218011694735 18.5
-13.44312147743263 29.8
30.3773806869227 18.8
-20.705958837555812 10.2
24.331910465134033 50.0
-11.720060200732828 14.1
26.490656347006524 25.2
23.919342711423784 29.1
12.64748803925596 12.7
23.795667130574575 22.4
-20.896614303657596 14.2
-17.906508660391633 13.8
11.19082441939664 20.3
-63.052280503155295 14.9
22.83225625513593 21.7
12.060142143132921 18.3
22.301135071644044 23.1
23.533603755160073 23.8
-10.323612178401593 15.0
-10.640868113358287 20.8
-21.50392037139425 19.1
-4.095602871861585 19.4
29.48613283737272 34.7
27.702459352742398 19.5
20.105783165916925 24.4
16.020702635392706 23.4
9.22956824467186 19.7
29