In [169]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random

boston = pd.read_csv('Data_Set/housing.csv')
feature_names = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'medv']

In [170]:
# data cleaning, analysing and removing noise from our model using correlation matrix

boston_frame = pd.DataFrame(boston, columns = feature_names)
correlation_matirx = boston_frame.corr()

# sourceFile = open('Correlation_Matrix.txt', 'w')
# print(correlation_matirx, file = sourceFile)
# sourceFile.close()

correlation_matirx = np.array(correlation_matirx, dtype = 'float32')

print("Features with high correlation to the housing prices:")
irrelevant_features = []

for i in range(len(feature_names) - 1):
    if(abs(correlation_matirx[13][i]) > 0.5):
        print(feature_names[i])
    
    else:
        irrelevant_features.append(i)

Features with high correlation to the housing prices:
rm
ptratio
lstat


In [171]:
# splitting the dataset into two parts for training and testing randomly in a ratio of 80:20

boston_data = np.array(boston_frame, dtype = 'float32')

boston_data = np.delete(boston_data, (irrelevant_features), 1)
features = boston_data[:, :-1]
prices = boston_data[:, -1]

x_train, x_test, y_train, y_test = train_test_split(features, prices, test_size = 0.2, random_state = 42)

# m = number of training examples, n = number of features
m = len(y_train)
n = 13 - len(irrelevant_features)

In [172]:
# preprocessing the data through feature scaling

def normalize(x):
    return (x - x.mean(axis = 0)) / x.std(axis = 0)

x_train = normalize(x_train)
y_train = normalize(y_train)
x_test = normalize(x_test)
y_test = normalize(y_test)

In [173]:
# cost function and gradient descent functions

def error(theta, x, y):
    cost = 0

    for i in range(m):
        temp = 0
        for j in range(n + 1):
            if j == 0:
                temp += theta[j]
            else:
                temp += (theta[j] * x[i][j - 1])

        cost += ((temp - y[i]) ** 2)

    return cost / (2 * m)

def gradient_descent(alpha, theta, x, y):
    temp = np.zeros(n + 1)

    for i in range(m):
        derivation = 0
        for j in range(n + 1):
            if j == 0:
                derivation += theta[j]
            else:
                derivation += (theta[j] * x[i][j - 1])

        derivation = (derivation - y[i])

        for j in range(n + 1):
            if j == 0:
                temp[j] += derivation
            else:
                temp[j] += (derivation * x[i][j - 1])

    temp = alpha * temp / m
    theta -= temp

    return theta

In [174]:
theta = []
alpha = 0.3
iterations = 50

for _ in range(n + 1):
    theta.append(random.random())

print(theta)

for _ in range(iterations):
    print(error(theta, x_train, y_train))
    theta = gradient_descent(alpha, theta, x_train, y_train)



[0.470104217474101, 0.6663865885046723, 0.40509443826156954, 0.16133278103094495]
0.6369884554773138
0.3541147578510286
0.26263342051229566
0.22243486652474895
0.20056780548433983
0.1872265723807716
0.17855691565440973
0.17268533339859352
0.16858325912221414
0.16564601308271237
0.16350124681582054
0.16191082115831248
0.16071722887614862
0.1598131247980958
0.15912341303937627
0.15859438046083846
0.15818689679701878
0.15787202818496715
0.15762812437983004
0.15743883220341984
0.1572917079741396
0.15717722801025794
0.15708807048260082
0.15701858651824568
0.15696440598079023
0.15692214076382435
0.15688915972756534
0.15686341691366032
0.15684331977636565
0.15682762771262468
0.1568153736826821
0.1568058035194136
0.15679832884440936
0.15679249048523025
0.15678793001815564
0.15678436761108996
0.15678158475926599
0.15677941082560065
0.1567777125425353
0.15677638582086528
0.1567753493568267
0.15677453964157323
0.1567739070647426
0.15677341287184413
0.15677302678812088
0.15677272516274438
0.156772

In [175]:
# for i in range(m):
#     print(w0 + w1*x_test[i][0] + w2*x_test[i][1] + w3*x_test[i][2], y_test[i])
