In [138]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random

boston = pd.read_csv('Data_Set/housing.csv')
feature_names = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'medv']

In [139]:
# data cleaning, analysing and removing noise from our model using correlation matrix

boston_frame = pd.DataFrame(boston, columns = feature_names)
correlation_matirx = boston_frame.corr()

# sourceFile = open('Correlation_Matrix.txt', 'w')
# print(correlation_matirx, file = sourceFile)
# sourceFile.close()

correlation_matirx = np.array(correlation_matirx, dtype = 'float32')

print("Features with high correlation to the housing prices:")
irrelevant_features = []

for i in range(len(feature_names) - 1):
    if(abs(correlation_matirx[13][i]) > 0.5):
        print(feature_names[i])
    
    else:
        irrelevant_features.append(i)

Features with high correlation to the housing prices:
rm
ptratio
lstat


In [140]:
# splitting the dataset into two parts for training and testing randomly in a ratio of 80:20

boston_data = np.array(boston_frame, dtype = 'float32')
censored_data = []
boston_data = np.delete(boston_data, (irrelevant_features), 1)
for i in range(len(boston_data) - 13):
    if boston_data[i, -1] == 50:
        censored_data.append(i)

boston_data = np.delete(boston_data, (censored_data), 0)
features = boston_data[:, :-1]
prices = boston_data[:, -1]

x_train, x_test, y_train, y_test = train_test_split(features, prices, test_size = 0.2, random_state = 42)

# m = number of training examples, n = number of features
m = len(y_train)
n = 13 - len(irrelevant_features)

In [141]:
# preprocessing the data through feature scaling

def normalize(x):
    return (x - x.mean(axis = 0)) / x.std(axis = 0)

x_train = normalize(x_train)
y_train = normalize(y_train)
x_test = normalize(x_test)
y_test = normalize(y_test)

In [142]:
# cost function and gradient descent functions

def error(num, theta, x, y):
    cost = 0

    for i in range(num):
        temp = 0
        for j in range(n + 1):
            if j == 0:
                temp += theta[j]
            else:
                temp += (theta[j] * x[i][j - 1])

        cost += ((temp - y[i]) ** 2)

    return cost / (2 * num)

def gradient_descent(alpha, theta, x, y):
    temp = np.zeros(n + 1)

    for i in range(m):
        derivation = 0
        for j in range(n + 1):
            if j == 0:
                derivation += theta[j]
            else:
                derivation += (theta[j] * x[i][j - 1])

        derivation = (derivation - y[i])

        for j in range(n + 1):
            if j == 0:
                temp[j] += derivation
            else:
                temp[j] += (derivation * x[i][j - 1])

    temp = alpha * temp / m
    theta -= temp

    return theta

In [143]:
theta = []
alpha = 0.3
iterations = 75

for _ in range(n + 1):
    theta.append(random.random())

for _ in range(iterations):
    theta = gradient_descent(alpha, theta, x_train, y_train)

In [144]:
print(error(len(y_test), theta, x_test, y_test))

# for i in range(len(y_test)):
#     price = 0
#     for j in range(n + 1):
#         if j == 0:
#             price += theta[j]
#         else:
#             price += (theta[j] * x_test[i][j - 1])

#     print(price, y_test[i])


0.15898271162932892


In [145]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train, y_train)

for i in range(len(y_test)):
    price = 0
    for j in range(n + 1):
        if j == 0:
            price += theta[j]
        else:
            price += (theta[j] * x_test[i][j - 1])
    
    print(price, model.predict(x_test[i].reshape(1, -1)))

-0.6356599879645601 [-0.63566464]
0.44447416560431957 [0.44447857]
-0.09860545354892092 [-0.09864734]
-1.3099076870999378 [-1.3099121]
-1.0496169037699374 [-1.0496416]
-0.26044003594412174 [-0.26040882]
-1.1715339320966844 [-1.171545]
1.7521336566556278 [1.7520962]
-0.5058863035144688 [-0.5058632]
0.9962559261605248 [0.9962549]
0.2012073350138418 [0.20119178]
-0.4686979646089634 [-0.4687006]
-1.6716343445527553 [-1.6717057]
-1.0997136955100826 [-1.0997211]
-0.538470203756257 [-0.5385314]
0.5500736130950168 [0.5500904]
-0.3653688407669642 [-0.36539233]
-0.3306816883988337 [-0.33069518]
0.04734486876367605 [0.04729325]
-0.05766540885769045 [-0.05767908]
0.143483222885734 [0.14348437]
0.17328403295019898 [0.1732836]
-0.9610321386177936 [-0.9610481]
1.3603754460092536 [1.3603568]
0.19581897353113925 [0.19581938]
0.22416045058013442 [0.22417471]
0.4779436780509252 [0.47792917]
1.2876126039629172 [1.2876201]
1.6405152558565685 [1.6405044]
-0.8599405035386397 [-0.8599439]
0.4596109787295855 [