# Importing the Libraries and Boston Housing Dataset

In [68]:
from sklearn import datasets
import pandas as pd
import numpy as np

In [69]:
bdata = datasets.load_boston()
print(bdata.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [70]:
X = bdata.data
Y = bdata.target

In [71]:
df = pd.DataFrame(X)
df.columns = bdata.feature_names

In [72]:
#Using StandardScaler to scale the data
from sklearn import preprocessing
standardscaler_object = preprocessing.StandardScaler()
standardscaler_object.fit(df)
df = standardscaler_object.transform(df)

In [73]:
df = pd.DataFrame(df)
df.columns = bdata.feature_names
X = df.to_numpy()

# Creating the ones array and forming the dataset

In [74]:
ones = np.ones((506,1),dtype = int)
X = np.array(X)
X = np.hstack((X,ones))
Y = np.array(Y)
Y = Y.reshape(506,1)
data = np.hstack((X,Y))

# Defining the Generic Gradient Descent Function

In [75]:
def gd(dataset,learning_rate,iterations):
    N = len(dataset[0,:])
    slope_vector = np.zeros((N-1,1)) 
    for i in range(iterations):
        slope_vector = step_grad(dataset,learning_rate,slope_vector) 
        print(i,cost(dataset,slope_vector))
    return slope_vector 

In [76]:
def step_grad(dataset,learning_rate,slope_vector):
    M = len(dataset[:,0])
    N = len(dataset[0,:])
    for i in range(M):
        m_slope = np.zeros((N-1,1))
        for j in range(N-1):
            X = dataset[i,0:N-1]
            Y = dataset[i,N-1]
            m_slope[j] = m_slope[j] + (-1)*(2/M)*(Y-np.dot(X,slope_vector))*(X[j])
        slope_vector = slope_vector - learning_rate*m_slope   
    return slope_vector        

In [77]:
def cost(dataset,slope_vector):
    M = len(dataset[:,0])
    N = len(dataset[0,:])
    total_cost = 0
    for i in range(M):
        X = data[i,0:N-1]
        Y = data[i,N-1]
        total_cost = total_cost + (1/M)*((Y-np.dot(X,slope_vector))**2)
    return total_cost   

In [78]:
final_m = gd(data,.01,500)  
final_m

0 [564.45179387]
1 [538.94455406]
2 [515.30778853]
3 [493.28597308]
4 [472.6724059]
5 [453.2989553]
6 [435.02800975]
7 [417.74615403]
8 [401.35919855]
9 [385.78826973]
10 [370.96673263]
11 [356.83776634]
12 [343.35245193]
13 [330.46826252]
14 [318.14786938]
15 [306.35819632]
16 [295.06966916]
17 [284.25561892]
18 [273.89180586]
19 [263.95603874]
20 [254.42786933]
21 [245.28834621]
22 [236.51981546]
23 [228.1057586]
24 [220.03065986]
25 [212.27989703]
26 [204.83965082]
27 [197.69682926]
28 [190.83900393]
29 [184.25435582]
30 [177.931629]
31 [171.86009047]
32 [166.02949516]
33 [160.43005517]
34 [155.05241235]
35 [149.88761386]
36 [144.92709004]
37 [140.16263432]
38 [135.58638485]
39 [131.19080756]
40 [126.9686805]
41 [122.91307927]
42 [119.01736343]
43 [115.27516373]
44 [111.68037013]
45 [108.22712049]
46 [104.90978984]
47 [101.72298033]
48 [98.66151153]
49 [95.72041135]
50 [92.89490734]
51 [90.18041836]
52 [87.57254675]
53 [85.06707072]
54 [82.65993718]
55 [80.34725485]
56 [78.12528765]

464 [22.20119339]
465 [22.20004772]
466 [22.19890761]
467 [22.19777301]
468 [22.19664389]
469 [22.19552023]
470 [22.19440198]
471 [22.19328912]
472 [22.19218162]
473 [22.19107943]
474 [22.18998253]
475 [22.18889089]
476 [22.18780448]
477 [22.18672326]
478 [22.1856472]
479 [22.18457628]
480 [22.18351046]
481 [22.18244971]
482 [22.181394]
483 [22.1803433]
484 [22.17929758]
485 [22.17825682]
486 [22.17722098]
487 [22.17619003]
488 [22.17516395]
489 [22.1741427]
490 [22.17312626]
491 [22.1721146]
492 [22.17110769]
493 [22.1701055]
494 [22.169108]
495 [22.16811518]
496 [22.16712699]
497 [22.16614342]
498 [22.16516443]
499 [22.16419001]


array([[-0.78644087],
       [ 0.80976875],
       [-0.27406482],
       [ 0.74547404],
       [-1.57889486],
       [ 2.87814981],
       [-0.10019098],
       [-2.74266461],
       [ 1.45661793],
       [-0.88102058],
       [-1.95074238],
       [ 0.87355145],
       [-3.65191229],
       [22.52274782]])

# Creating the Predictor function

In [79]:
def Y_pred(test_data,finalslopes):
    test_result = np.dot(test_data,finalslopes)
    return test_result

In [80]:
train_data = np.loadtxt('0000000000002417_training_boston_x_y_train.csv',delimiter=",")

In [81]:
train_data.shape

(379, 14)

In [82]:
train_datacopy = train_data.copy()

In [83]:
#Seperating X and Y
Y_train = train_data[:,13]
X_train = train_data[:,0:13]

In [84]:
#Using StandardScaler to fit the data
standardscaler_object.fit(X_train)
X_train = standardscaler_object.transform(X_train)

In [85]:
ones = np.ones((379,1))
X_train = np.hstack((X_train,ones))

In [86]:
Y_train = np.array(Y_train)
X_train = np.array(X_train)

In [87]:
Y_train = Y_train.reshape((379,1))

In [88]:
trainingdata = np.hstack((X_train,Y_train))

In [89]:
test_data = np.loadtxt('0000000000002417_test_boston_x_test.csv',delimiter = ",")
#Transforming the test data to the above fitted training data
test_data = standardscaler_object.transform(test_data)
ones = np.ones((127,1),dtype = int)
test_data = np.hstack((test_data,ones))

In [90]:
final_m = gd(trainingdata,.1,200)

0 [426.76932086]
1 [283.07053364]
2 [195.75422573]
3 [139.35149676]
4 [102.05660241]
5 [77.14512031]
6 [60.41154801]
7 [49.12321242]
8 [41.47674425]
9 [36.2738952]
10 [32.71556422]
11 [30.26745732]
12 [28.57151184]
13 [27.38720451]
14 [26.55255411]
15 [25.95816016]
16 [25.52989162]
17 [25.2173256]
18 [24.9860139]
19 [24.81230154]
20 [24.67984862]
21 [24.57729103]
22 [24.49666349]
23 [24.43233366]
24 [24.38027963]
25 [24.33759834]
26 [24.30216995]
27 [24.27242735]
28 [24.24719728]
29 [24.22558993]
30 [24.20692176]
31 [24.19066112]
32 [24.17638955]
33 [24.16377398]
34 [24.15254658]
35 [24.14248993]
36 [24.13342602]
37 [24.12520803]
38 [24.11771401]
39 [24.11084202]
40 [24.10450645]
41 [24.09863501]
42 [24.09316637]
43 [24.08804835]
44 [24.08323635]
45 [24.07869214]
46 [24.07438284]
47 [24.07028011]
48 [24.06635946]
49 [24.06259965]
50 [24.05898226]
51 [24.05549126]
52 [24.05211268]
53 [24.04883431]
54 [24.04564551]
55 [24.04253695]
56 [24.03950045]
57 [24.03652886]
58 [24.03361589]
59 [2

In [91]:
Y_answer = Y_pred(test_data,final_m)

In [92]:
Y_answer

array([[12.58794949],
       [28.88706046],
       [22.35038632],
       [24.30007562],
       [20.64909961],
       [ 2.83868484],
       [30.24120637],
       [24.78581889],
       [18.6347076 ],
       [23.46116869],
       [24.00310526],
       [17.70542602],
       [17.43473701],
       [21.6514805 ],
       [42.42999869],
       [23.76964013],
       [24.36019973],
       [27.54443539],
       [20.15938711],
       [31.16189663],
       [23.8718916 ],
       [25.05419037],
       [33.99412779],
       [36.36377704],
       [31.91807241],
       [16.78157062],
       [23.48722357],
       [32.93047502],
       [25.20220828],
       [33.76061749],
       [16.87486674],
       [25.99101486],
       [23.24684833],
       [25.40921577],
       [15.13287122],
       [29.59133129],
       [26.13093403],
       [20.44274561],
       [24.2896368 ],
       [ 9.65535273],
       [ 8.49592565],
       [28.8860176 ],
       [29.47181072],
       [19.82386636],
       [20.40395863],
       [ 3

In [93]:
np.savetxt('prediction_GD.csv',Y_answer,delimiter = ",")