In [138]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random

boston = pd.read_csv('Data_Set/housing.csv')
feature_names = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'medv']

In [139]:
# data cleaning, analysing and removing noise from our model using correlation matrix

boston_frame = pd.DataFrame(boston, columns = feature_names)
correlation_matirx = boston_frame.corr()

# sourceFile = open('Correlation_Matrix.txt', 'w')
# print(correlation_matirx, file = sourceFile)
# sourceFile.close()

correlation_matirx = np.array(correlation_matirx, dtype = 'float32')

print("Features with high correlation to the housing prices:")
irrelevant_features = []

for i in range(len(feature_names) - 1):
    if(abs(correlation_matirx[13][i]) > 0.3):
        print(feature_names[i])
    
    else:
        irrelevant_features.append(i)

Features with high correlation to the housing prices:
crim
zn
indus
nox
rm
age
rad
tax
ptratio
black
lstat


In [140]:
# splitting the dataset into two parts for training and testing randomly in a ratio of 80:20

boston_data = np.array(boston_frame, dtype = 'float32')

boston_data = np.delete(boston_data, (irrelevant_features), 1)
features = boston_data[:, :-1]
prices = boston_data[:, -1]

x_train, x_test, y_train, y_test = train_test_split(features, prices, test_size = 0.2, random_state = 42)

# m = number of training examples, n = number of features
m = len(y_train)
n = 13 - len(irrelevant_features)

In [141]:
# preprocessing the data through feature scaling

def normalize(x):
    return (x - x.mean(axis = 0)) / x.std(axis = 0)

xo = x_test
yo = y_test
x_train = normalize(x_train)
y_train = normalize(y_train)
x_test = normalize(x_test)
y_test = normalize(y_test)

In [142]:
# cost function and gradient descent functions

def error(num, theta, x, y):
    cost = 0

    for i in range(num):
        temp = 0
        for j in range(n + 1):
            if j == 0:
                temp += theta[j]
            else:
                temp += (theta[j] * x[i][j - 1])

        cost += ((temp - y[i]) ** 2)

    return cost / (2 * num)

def gradient_descent(alpha, theta, x, y):
    temp = np.zeros(n + 1)

    for i in range(m):
        derivation = 0
        for j in range(n + 1):
            if j == 0:
                derivation += theta[j]
            else:
                derivation += (theta[j] * x[i][j - 1])

        derivation = (derivation - y[i])

        for j in range(n + 1):
            if j == 0:
                temp[j] += derivation
            else:
                temp[j] += (derivation * x[i][j - 1])

    temp = alpha * temp / m
    theta -= temp

    return theta

In [143]:
theta = []
alpha = 0.3
iterations = 75

for _ in range(n + 1):
    theta.append(random.random())

for _ in range(iterations):
    print(error(m, theta, x_train, y_train))
    theta = gradient_descent(alpha, theta, x_train, y_train)

5.1582560694523565
2.347295179094553
1.1736837814088805
0.6689226824347081
0.44165035613117004
0.33232404569605695
0.27501638738852885
0.24191101222916137
0.2208975534769403
0.2064591606217893
0.19592430455517806
0.18789890157098016
0.18159401965863395
0.17652721731972362
0.17238333551390714
0.16894556540296773
0.16605887120599616
0.1636091570305898
0.16151060273468426
0.15969753332207734
0.15811896928144253
0.15673485156579603
0.1555133598691322
0.15442896960958982
0.15346102144826043
0.15259265389979035
0.15180999753294444
0.15110156030857383
0.15045775428637187
0.14987052802205367
0.1493330787508418
0.14883962533737327
0.14838522788511044
0.14796564344569607
0.14757720985716022
0.1472167516469334
0.1468815033530868
0.14656904667938464
0.14627725870165253
0.14600426895260263
0.14574842367871274
0.1455082559217853
0.14528246035572306
0.145069872025304
0.14486944830285645
0.1446802535116323
0.14450144576960228
0.14433226569061247
0.1441720266461419
0.14402010634394358
0.143875939522509

In [144]:
print(error(len(y_test), theta, x_test, y_test))

for i in range(len(y_test)):
    price = 0
    for j in range(n + 1):
        if j == 0:
            price += theta[j]
        else:
            price += (theta[j] * xo[i][j - 1])

    print(price, yo[i])


0.1928442596714695
34.86258598748316 23.6
34.41561750098212 32.4
6.514571179551217 13.6
29.639951227602605 22.8
10.166117149241035 16.1
23.96689564331067 20.0
29.540763925726328 17.8
19.583034841106496 14.0
13.269686473422253 19.6
26.530364425288987 16.8
33.79441932457152 21.5
29.49019161070051 18.9
-39.995820434149294 7.0
24.108423797931895 21.2
32.30136438922608 18.5
9.770592398704917 29.8
36.74343727437669 18.8
2.476138755455173 10.2
33.08741619019535 50.0
11.427968505651938 14.1
34.41269184165973 25.2
32.792516781026265 29.1
23.495367954846877 12.7
33.51052686816699 22.4
2.2395989330755697 14.2
5.239782775746748 13.8
25.407599281585675 20.3
-39.932788035403455 14.9
32.54993561014091 21.7
25.9336016301257 18.3
32.74802170316085 23.1
33.7146098878456 23.8
12.843990565378096 15.0
12.474368196968497 20.8
1.6443895768428654 19.1
10.042760261581964 19.4
38.09551201648908 34.7
35.634516084265435 19.5
29.91562146592546 24.4
26.991630760999612 23.4
23.1134708884318 19.7
37.53065084732823 28