In [47]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random

boston = pd.read_csv('Data_Set/housing.csv')
feature_names = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'medv']

In [48]:
# data cleaning, analysing and removing noise from our model using correlation matrix

boston_frame = pd.DataFrame(boston, columns = feature_names)
correlation_matirx = boston_frame.corr()

# sourceFile = open('Correlation_Matrix.txt', 'w')
# print(correlation_matirx, file = sourceFile)
# sourceFile.close()

correlation_matirx = np.array(correlation_matirx, dtype = 'float32')

print("Features with high correlation to the housing prices:")
irrelevant_features = []

for i in range(len(feature_names) - 1):
    if(abs(correlation_matirx[13][i]) > 0.3):
        print(feature_names[i])
    
    else:
        irrelevant_features.append(i)

Features with high correlation to the housing prices:
crim
zn
indus
nox
rm
age
rad
tax
ptratio
black
lstat


In [49]:
# splitting the dataset into two parts for training and testing randomly in a ratio of 80:20

boston_data = np.array(boston_frame, dtype = 'float32')

boston_data = np.delete(boston_data, (irrelevant_features), 1)
features = boston_data[:, :-1]
prices = boston_data[:, -1]

x_train, x_test, y_train, y_test = train_test_split(features, prices, test_size = 0.2, random_state = 42)

# m = number of training examples, n = number of features
m = len(y_train)
n = 13 - len(irrelevant_features)

In [50]:
# preprocessing the data through feature scaling

def normalize(x):
    return (x - x.mean(axis = 0)) / x.std(axis = 0)

x_train = normalize(x_train)
y_train = normalize(y_train)
x_test = normalize(x_test)
y_test = normalize(y_test)

In [51]:
# cost function and gradient descent functions

def error(num, theta, x, y):
    cost = 0

    for i in range(num):
        temp = 0
        for j in range(n + 1):
            if j == 0:
                temp += theta[j]
            else:
                temp += (theta[j] * x[i][j - 1])

        cost += ((temp - y[i]) ** 2)

    return cost / (2 * m)

def gradient_descent(alpha, theta, x, y):
    temp = np.zeros(n + 1)

    for i in range(m):
        derivation = 0
        for j in range(n + 1):
            if j == 0:
                derivation += theta[j]
            else:
                derivation += (theta[j] * x[i][j - 1])

        derivation = (derivation - y[i])

        for j in range(n + 1):
            if j == 0:
                temp[j] += derivation
            else:
                temp[j] += (derivation * x[i][j - 1])

    temp = alpha * temp / m
    theta -= temp

    return theta

In [52]:
theta = []
alpha = 0.3
iterations = 75

for _ in range(n + 1):
    theta.append(random.random())

for _ in range(iterations):
    print(error(m, theta, x_train, y_train))
    theta = gradient_descent(alpha, theta, x_train, y_train)

2.2238391841090244
1.223409109771116
0.7536920581808175
0.5187626081496544
0.3921970536216206
0.3184501427504343
0.27218332236496623
0.24126711099550593
0.21954847434086605
0.20369836036930467
0.1917937067422649
0.1826537571842427
0.1755142368662919
0.16985854891240992
0.16532524137144622
0.16165441331189767
0.15865507421856243
0.1561843787817883
0.15413392391653374
0.15242042805166173
0.15097923045626974
0.14975966179073122
0.14872168876212113
0.14783344550108773
0.14706939374002298
0.14640893619099346
0.14583536121882404
0.14533503271908263
0.14489676347132954
0.14451132710155215
0.1441710756482762
0.14386963818990553
0.14360168210923796
0.143362723047183
0.14314897290743925
0.14295721774465314
0.14278471922876107
0.14262913478924888
0.14248845262051538
0.14236093855706758
0.14224509246618428
0.14213961230121552
0.1420433643446115
0.14195535847147114
0.14187472750106606
0.14180070989003932
0.14173263516801976
0.14166991163280226
0.14161201591469805
0.14155848409328342
0.1415089041085

In [53]:
print(error(len(y_test), theta, x_test, y_test))

for i in range(len(y_test)):
    price = 0
    for j in range(n + 1):
        if j == 0:
            price += theta[j]
        else:
            price += (theta[j] * x_test[i][j - 1])

    print(price, y_test[i])
# import matplotlib.pyplot as plt

# y = y_train
# x = x_train[:, 0]

# plt.plot(x, y, 'ro')

0.04752313892541561
0.7923443684077376 0.24660048
1.0738320233017475 1.274216
-0.5509910014207093 -0.92114437
0.3886060226747675 0.15318076
-0.17181867049954977 -0.62920815
0.30506139039287566 -0.1737877
-0.25909637495130733 -0.43069166
-0.8265798995892744 -0.8744346
0.22606349755391633 -0.22049746
-0.14140732647892001 -0.54746616
0.17941693021692606 0.0013740192
0.2898402052520409 -0.3022397
-3.154675816449383 -1.691856
0.22704716851707238 -0.033658236
-0.14117891866510301 -0.34894943
0.5554145553200123 0.97060215
-0.11532301536427446 -0.31391716
-1.8213770168282764 -1.3181777
2.0610652446458624 3.3294468
-0.2995983879340487 -0.8627571
0.45448547618576196 0.4334397
0.70933002018164 0.8888601
-1.0128375079224838 -1.0262414
0.18710589476535583 0.10647101
-0.3633761008167539 -0.8510797
-0.6719855164774847 -0.8977895
0.17678417308207992 -0.13875546
-0.6009355730009972 -0.7693376
-0.09497398545046837 0.024729004
-0.28378334914580156 -0.3723044
0.006251674490224068 0.18821323
0.480204176237