In [19]:
import pandas as pd
import numpy as np

boston = pd.read_csv('Data_Set/housing.csv')
feature_names = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'medv']

In [20]:
# data cleaning, analysing and removing noise from our model using correlation matrix

boston_frame = pd.DataFrame(boston, columns = feature_names)
correlation_matirx = boston_frame.corr()

# sourceFile = open('Correlation_Matrix.txt', 'w')
# print(correlation_matirx, file = sourceFile)
# sourceFile.close()

correlation_matirx = np.array(correlation_matirx, dtype = 'float32')

print("Features with high correlation to the housing prices:")
irrelevant_features = []

for i in range(len(feature_names) - 1):
    if(abs(correlation_matirx[13][i]) > 0.5):
        print(feature_names[i])
    
    else:
        irrelevant_features.append(i)

Features with high correlation to the housing prices:
rm
ptratio
lstat


In [21]:
# splitting the dataset into two parts for training and testing randomly in a ratio of 80:20

from sklearn.model_selection import train_test_split
boston_data = np.array(boston_frame, dtype = 'float32')

boston_data = np.delete(boston_data, (irrelevant_features), 1)
features = boston_data[:, :-1]
prices = boston_data[:, -1]

x_train, x_test, y_train, y_test = train_test_split(features, prices, test_size = 0.2, random_state = 42)

In [22]:
# preprocessing the data through feature scaling

def normalize(x):
    return (x - x.mean(axis = 0)) / x.std(axis = 0)

x_train = normalize(x_train)
y_train = normalize(y_train)
x_test = normalize(x_test)
y_test = normalize(y_test)

In [23]:
import random

# number of training examples
m = len(y_train)

def error(m, w0, w1, w2, w3, x, y):
    cost = 0
    for i in range(m):
        cost += ((w0 + w1*x[i][0] + w2*x[i][1] + w3*x[i][2] - y[i]) ** 2)

    return cost / (0.5 * m)

def gradient_descent(alpha, w0, w1, w2, w3, x, y):
    temp0, temp1, temp2, temp3 = 0, 0, 0, 0
    for i in range(m):
        temp = (w0 + w1*x[i][0] + w2*x[i][1] + w3*x[i][2] - y[i])
        temp0 += temp
        temp1 += (temp * x[i][0])
        temp2 += (temp * x[i][1])
        temp3 += (temp * x[i][2])
    
    w0 -= (alpha * temp0 / m)
    w1 -= (alpha * temp1 / m)
    w2 -= (alpha * temp2 / m)
    w3 -= (alpha * temp3 / m)

    return w0, w1, w2, w3

In [24]:
w0, w1, w2, w3 = random.random(), random.random(), random.random(), random.random()
# print(error(w0, w1, w2, w3, x_train, y_train))
print(w0, w1, w2, w3)

for _ in range(100):
    print(error(m, w0, w1, w2, w3, x_train, y_train))
    w0, w1, w2, w3 = gradient_descent(0.03, w0, w1, w2, w3, x_train, y_train)

m = len(y_test)
print("Accuracy:", error(m, w0, w1, w2, w3, x_test, y_test))

0.5728862894636689 0.20699271117883422 0.930205511570624 0.36207233288176477
7.078332257541675
6.457036629899338
5.9000475060750475
5.40048393352039
4.952212602442443
4.549766025957399
4.188269705241553
3.863377291333574
3.5712128640512066
3.308319545310475
3.0716137502963603
2.858344456606075
2.666056939713794
2.4925604838202537
2.3358996311789024
2.194328581069199
2.0662883923722517
1.9503866817778286
1.8453795435331724
1.750155446795535
1.6637208934818501
1.5851876433870535
1.513761334591774
1.4487313460904305
1.3894617663996456
1.33538334688341
1.2859863318599065
1.2408140694163583
1.1994573174147902
1.1615491685663555
1.1267605268131384
1.0947960746977663
1.065390678024027
1.0383061800061253
1.0133285423503495
0.9902652953822918
0.9689432634885818
0.9492065358412204
0.930914655664923
0.9139410042385427
0.898171358430145
0.8835026028872446
0.8698415800706281
0.8571040631600417
0.8452138384978464
0.8341018856948051
0.8237056448201661
0.8139683612537097
0.8048384998062931
0.796269220