# Multiple Linear Regression For Boston Housing Dataset

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

In [2]:
# Load the Boston Housing CSV directly from the course repository on GitHub.
DATA_URL = "https://raw.githubusercontent.com/Yasmeenmad/data_science_bootcamp/main/Week11/Linear_Regression/BostonHousing.csv"
df = pd.read_csv(DATA_URL)

In [3]:
# Display the loaded frame to check the column names and a sample of rows.
df

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [4]:
# Pairwise Pearson correlation between every pair of columns, rendered as a
# colour-graded table so strong positive/negative relationships stand out.
corr = df.corr(method="pearson")
corr.style.background_gradient(cmap="coolwarm")

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
crim,1.0,-0.200469,0.406583,-0.055892,0.420972,-0.219247,0.352734,-0.37967,0.625505,0.582764,0.289946,-0.385064,0.455621,-0.388305
zn,-0.200469,1.0,-0.533828,-0.042697,-0.516604,0.311991,-0.569537,0.664408,-0.311948,-0.314563,-0.391679,0.17552,-0.412995,0.360445
indus,0.406583,-0.533828,1.0,0.062938,0.763651,-0.391676,0.644779,-0.708027,0.595129,0.72076,0.383248,-0.356977,0.6038,-0.483725
chas,-0.055892,-0.042697,0.062938,1.0,0.091203,0.091251,0.086518,-0.099176,-0.007368,-0.035587,-0.121515,0.048788,-0.053929,0.17526
nox,0.420972,-0.516604,0.763651,0.091203,1.0,-0.302188,0.73147,-0.76923,0.611441,0.668023,0.188933,-0.380051,0.590879,-0.427321
rm,-0.219247,0.311991,-0.391676,0.091251,-0.302188,1.0,-0.240265,0.205246,-0.209847,-0.292048,-0.355501,0.128069,-0.613808,0.69536
age,0.352734,-0.569537,0.644779,0.086518,0.73147,-0.240265,1.0,-0.747881,0.456022,0.506456,0.261515,-0.273534,0.602339,-0.376955
dis,-0.37967,0.664408,-0.708027,-0.099176,-0.76923,0.205246,-0.747881,1.0,-0.494588,-0.534432,-0.232471,0.291512,-0.496996,0.249929
rad,0.625505,-0.311948,0.595129,-0.007368,0.611441,-0.209847,0.456022,-0.494588,1.0,0.910228,0.464741,-0.444413,0.488676,-0.381626
tax,0.582764,-0.314563,0.72076,-0.035587,0.668023,-0.292048,0.506456,-0.534432,0.910228,1.0,0.460853,-0.441808,0.543993,-0.468536


Train / test split the data...

In [5]:
# Predictor columns used for this model. Named `feature_cols` rather than
# `vars` so the Python builtin `vars()` is not shadowed for the rest of the
# kernel session.
feature_cols = ['rm', 'lstat']

# Column the model is trained to predict.
target = 'medv'

X = df[feature_cols]
y = df[target]

# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=2)

In [6]:
# Inspect the training features produced by the split.
X_train

Unnamed: 0,rm,lstat
312,6.023,11.72
328,5.868,9.97
251,6.438,3.59
205,5.891,10.87
231,7.412,5.25
...,...,...
22,6.142,18.72
72,6.065,5.52
493,5.707,12.01
15,5.834,8.47


In [7]:
# Inspect the held-out target values that the predictions will be scored against.
y_test

463    20.2
152    15.3
291    37.3
183    32.5
384     8.8
       ... 
160    27.0
438     8.4
282    46.0
294    21.7
7      27.1
Name: medv, Length: 127, dtype: float64

In [8]:
# Linear regression estimator, created with scikit-learn's default settings.
multi_lr = LinearRegression()

In [9]:
# Fit on the training split only; the test split is kept aside for evaluation.
multi_lr.fit(X_train, y_train)

LinearRegression()

Find out a little bit about it...

In [10]:
# Fitted intercept term of the regression.
multi_lr.intercept_

1.0355165535077724

In [11]:
# Fitted coefficients, one per feature, in the column order of X_train.
multi_lr.coef_

array([ 4.74338803, -0.66459738])

Make some predictions...

In [12]:
# Predict the target for the held-out test rows.
multi_predictions = multi_lr.predict(X_test)

Look at some preds and actuals side-by-side...

In [13]:
# Last five predictions, for comparison with the last five actual values of y_test.
multi_predictions[-5:]

array([27.02640614,  6.57792172, 35.29827991, 22.62672248, 17.58466768])

In [14]:
# Last five actual target values from the test split (same rows as the predictions above).
y_test.tail()

160    27.0
438     8.4
282    46.0
294    21.7
7      27.1
Name: medv, dtype: float64

How'd we score?

In [15]:
# Coefficient of determination (R^2) on the test split: higher is better, 1.0 is a perfect fit.
r2_score(y_true=y_test, y_pred=multi_predictions)

0.687388881816742

In [16]:
# Mean absolute error on the test split, in the same units as the target: lower is better.
mean_absolute_error(y_true=y_test, y_pred=multi_predictions)

3.6905227495488204

In [17]:
# Mean of the actual test targets, to put the size of the error in context.
y_test.mean()

22.93070866141732

In [18]:
# Standard deviation of the actual test targets, as a reference spread to compare the error against.
y_test.std()

9.45810739659681

I chose the `rm` and `lstat` columns because they have the strongest correlation (in absolute value) with the `medv` column: `rm` is positively correlated with `medv`, while `lstat` is negatively correlated.

The first r2_score was $0.7188657509609544$; after changing the columns it went down to $0.687388881816742$. The mean_absolute_error was $4.363991882706358$ and became $3.6905227495488204$.

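We notice that both values have decreased, but this is only partly a good thing: a lower mean_absolute_error means the predictions are closer to the actual values on average, whereas a lower r2_score means the model explains less of the variance in `medv` (for r2_score, higher is better).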