In [202]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [203]:
data = pd.read_csv("Boston-Housing.csv")
data

Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
3,5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
4,7,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.60,12.43,22.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328,500,0.17783,0.0,9.69,0,0.585,5.569,73.5,2.3999,6,391,19.2,395.77,15.10,17.5
329,502,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
330,503,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
331,504,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9


In [204]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 15 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ID       333 non-null    int64  
 1   crim     333 non-null    float64
 2   zn       333 non-null    float64
 3   indus    333 non-null    float64
 4   chas     333 non-null    int64  
 5   nox      333 non-null    float64
 6   rm       333 non-null    float64
 7   age      333 non-null    float64
 8   dis      333 non-null    float64
 9   rad      333 non-null    int64  
 10  tax      333 non-null    int64  
 11  ptratio  333 non-null    float64
 12  black    333 non-null    float64
 13  lstat    333 non-null    float64
 14  medv     333 non-null    float64
dtypes: float64(11), int64(4)
memory usage: 39.2 KB


In [205]:
data.drop("ID", axis=1)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
3,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
4,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.60,12.43,22.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328,0.17783,0.0,9.69,0,0.585,5.569,73.5,2.3999,6,391,19.2,395.77,15.10,17.5
329,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
330,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
331,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9


In [206]:
y = data["medv"].values.reshape(-1, 1)
x = data.drop("medv", axis=1)

In [207]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

In [208]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((249, 14), (84, 14), (249, 1), (84, 1))

In [209]:
scaler = MinMaxScaler()
x_train_sc = scaler.fit_transform(x_train)
x_test_sc = scaler.transform(x_test)


In [210]:
x_train_b = np.concatenate((np.ones((x_train_sc.shape[0], 1)), x_train_sc), axis=1)
x_test_b = np.concatenate((np.ones((x_test_sc.shape[0], 1)), x_test_sc), axis=1)

In [211]:
x_train

Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
164,241,0.11329,30.0,4.93,0,0.428,6.897,54.3,6.3361,6,300,16.6,391.25,11.38
233,353,0.07244,60.0,1.69,0,0.411,5.884,18.5,10.7103,4,411,18.3,392.33,7.79
208,312,0.79041,0.0,9.90,0,0.544,6.122,52.8,2.6403,4,304,18.4,396.90,5.98
5,11,0.22489,12.5,7.87,0,0.524,6.377,94.3,6.3467,5,311,15.2,392.52,20.45
33,54,0.04981,21.0,5.64,0,0.439,5.998,21.4,6.8147,4,243,16.8,396.90,8.43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,491,0.20746,0.0,27.74,0,0.609,5.093,98.0,1.8226,4,711,20.1,318.43,29.68
192,287,0.01965,80.0,1.76,0,0.385,6.230,31.5,9.0892,1,241,18.2,341.60,12.93
117,172,2.31390,0.0,19.58,0,0.605,5.880,97.3,2.3887,5,403,14.7,348.13,12.03
47,71,0.08826,0.0,10.81,0,0.413,6.417,6.6,5.2873,4,305,19.2,383.73,6.72


In [212]:
x_test_sc

array([[1.64356436e-01, 7.61237048e-04, 2.63157895e-01, ...,
        7.44186047e-01, 9.84087443e-01, 1.74939467e-01],
       [1.58415842e-01, 9.07799303e-04, 2.63157895e-01, ...,
        7.44186047e-01, 1.00000000e+00, 1.07748184e-01],
       [8.47524752e-01, 1.91959782e-01, 0.00000000e+00, ...,
        8.83720930e-01, 2.36985257e-01, 5.98970944e-01],
       ...,
       [7.60396040e-01, 5.23623123e-01, 0.00000000e+00, ...,
        8.83720930e-01, 7.17666497e-01, 8.74697337e-01],
       [7.70297030e-01, 2.12421907e-01, 0.00000000e+00, ...,
        8.83720930e-01, 1.00000000e+00, 5.78692494e-01],
       [9.84158416e-01, 6.83418228e-03, 0.00000000e+00, ...,
        7.67441860e-01, 1.00000000e+00, 3.74394673e-01]], shape=(84, 14))

In [213]:
beta = np.linalg.inv(x_train_b.T @ x_train_b) @ x_train_b.T @ y_train

In [214]:
y_pred = x_test_b @ beta
y_pred

array([[25.89879214],
       [29.53271123],
       [13.98553687],
       [13.51256798],
       [22.18084623],
       [25.80430101],
       [25.7037509 ],
       [17.47896926],
       [25.19533721],
       [18.17784104],
       [16.62332984],
       [26.13533165],
       [31.35689482],
       [37.53357401],
       [15.90525007],
       [34.31057432],
       [20.28087954],
       [25.38448079],
       [24.64679273],
       [24.68440008],
       [20.19305131],
       [20.60658434],
       [19.31541588],
       [ 3.59565265],
       [ 2.93078499],
       [21.18101312],
       [27.72412433],
       [40.85451358],
       [23.69854859],
       [24.6369089 ],
       [13.93186568],
       [35.45500948],
       [35.46174979],
       [19.80324085],
       [24.17571485],
       [11.57077251],
       [24.3336598 ],
       [ 7.48593847],
       [19.93883545],
       [27.37498087],
       [21.20061751],
       [26.47417144],
       [27.87093194],
       [20.72300638],
       [21.575796  ],
       [30

In [215]:
y_test

array([[22.9],
       [28. ],
       [11. ],
       [15.4],
       [22.5],
       [23.4],
       [23.9],
       [14.3],
       [25. ],
       [20.3],
       [13.8],
       [18.5],
       [31.1],
       [48.3],
       [15.2],
       [34.9],
       [19.3],
       [23.2],
       [24.4],
       [19.2],
       [19.4],
       [21.5],
       [16.4],
       [ 7.4],
       [ 8.8],
       [20.7],
       [23.9],
       [48.5],
       [21.2],
       [21.4],
       [13.4],
       [43.1],
       [27. ],
       [16.2],
       [20. ],
       [15.2],
       [26.2],
       [17.8],
       [24.3],
       [22.6],
       [21.4],
       [23.9],
       [20.6],
       [21.7],
       [18.9],
       [25. ],
       [26.5],
       [30.8],
       [18.7],
       [19.4],
       [48.8],
       [24.7],
       [20.6],
       [14.9],
       [22.2],
       [22. ],
       [17.4],
       [20.2],
       [31.6],
       [23.7],
       [29.9],
       [50. ],
       [17.4],
       [12.6],
       [14.8],
       [24.4],
       [50

In [216]:
rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
rmse

np.float64(5.154689553858143)

In [217]:
# model = LinearRegression() 
# model.fit(x_train_sc, y_train)

In [218]:
# y_pred_test = model.predict(x_test_sc) 
# np.round(y_pred_test, 2)

In [219]:
# mse = mean_squared_error(y_test, y_pred_test)
# mse

In [220]:
# rmse = np.sqrt(mse)
# rmse

In [221]:
test = pd.read_csv("Boston-Housing-test.csv")
test

Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
0,3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
1,6,0.02985,0.0,2.18,0,0.458,6.430,58.7,6.0622,3,222,18.7,394.12,5.21
2,8,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.90,19.15
3,9,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93
4,10,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,496,0.17899,0.0,9.69,0,0.585,5.670,28.8,2.7986,6,391,19.2,393.29,17.60
169,497,0.28960,0.0,9.69,0,0.585,5.390,72.9,2.7986,6,391,19.2,396.90,21.14
170,499,0.23912,0.0,9.69,0,0.585,6.019,65.3,2.4091,6,391,19.2,396.90,12.92
171,501,0.22438,0.0,9.69,0,0.585,6.027,79.7,2.4982,6,391,19.2,396.90,14.33


In [222]:
test.drop("ID", axis=1)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
0,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
1,0.02985,0.0,2.18,0,0.458,6.430,58.7,6.0622,3,222,18.7,394.12,5.21
2,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.90,19.15
3,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93
4,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,0.17899,0.0,9.69,0,0.585,5.670,28.8,2.7986,6,391,19.2,393.29,17.60
169,0.28960,0.0,9.69,0,0.585,5.390,72.9,2.7986,6,391,19.2,396.90,21.14
170,0.23912,0.0,9.69,0,0.585,6.019,65.3,2.4091,6,391,19.2,396.90,12.92
171,0.22438,0.0,9.69,0,0.585,6.027,79.7,2.4982,6,391,19.2,396.90,14.33


In [223]:
test_sc = scaler.transform(test)

In [224]:
test_b = np.concatenate((np.ones((test.shape[0], 1)), test), axis=1)

In [225]:
y_pred = test_b @ beta
y_pred

array([[   85.65097635],
       [  122.65972567],
       [ -574.06472336],
       [ -871.26449198],
       [ -583.47305802],
       [ -593.64366644],
       [ -502.75664796],
       [ -628.95303447],
       [ -988.62043077],
       [ -673.10573399],
       [ -577.61322975],
       [ -584.91833658],
       [-1520.2714704 ],
       [ -836.00370507],
       [ -313.32808451],
       [ -429.49398724],
       [ -303.00091013],
       [  -31.11822783],
       [ -641.09108032],
       [  -20.50945912],
       [ -340.1714513 ],
       [ -292.93469603],
       [ -752.19810538],
       [ -656.81625726],
       [-1177.39402846],
       [-1066.37136377],
       [ -337.24731291],
       [ -430.22088657],
       [ -283.340526  ],
       [ -576.4395635 ],
       [ -356.97899365],
       [ -353.89369905],
       [ -416.25888349],
       [-1157.38896717],
       [-1250.88419582],
       [-1184.20787031],
       [-1488.58626275],
       [-1504.67913109],
       [-1678.99574755],
       [ -224.54994389],


In [227]:
submission_example = pd.read_csv("Boston-Housing-submission_example.csv")

In [229]:
submission_example["medv"] = y_pred

In [230]:
submission_example

Unnamed: 0,ID,medv
0,3,85.650976
1,6,122.659726
2,8,-574.064723
3,9,-871.264492
4,10,-583.473058
...,...,...
168,496,-2498.892413
169,497,-2583.690861
170,499,-2386.109697
171,501,-2428.325445


In [231]:
submission_example.to_csv("Boston-Housing-Submition.csv", index=False)