# Linear Regression Model on Boston Housing Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Load the housing table; the path is relative to the notebook's working directory.
DATA_PATH = 'housing.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Show the loaded frame; pandas elides the middle rows of a frame this long.
df

Unnamed: 0,RM,LSTAT,PTRATIO,MEDV
0,6.575,4.98,15.3,504000.0
1,6.421,9.14,17.8,453600.0
2,7.185,4.03,17.8,728700.0
3,6.998,2.94,18.7,701400.0
4,7.147,5.33,18.7,760200.0
...,...,...,...,...
484,6.593,9.67,21.0,470400.0
485,6.120,9.08,21.0,432600.0
486,6.976,5.64,21.0,501900.0
487,6.794,6.48,21.0,462000.0


In [4]:
# (number of rows, number of columns)
df.shape

(489, 4)

In [5]:
# Column dtypes and non-null counts, to confirm every column is numeric and complete.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 489 entries, 0 to 488
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   RM       489 non-null    float64
 1   LSTAT    489 non-null    float64
 2   PTRATIO  489 non-null    float64
 3   MEDV     489 non-null    float64
dtypes: float64(4)
memory usage: 15.4 KB


In [6]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

RM         0
LSTAT      0
PTRATIO    0
MEDV       0
dtype: int64

## Splitting the data

In [7]:
# Same frame as loaded at the top of the notebook, shown again before choosing columns.
df

Unnamed: 0,RM,LSTAT,PTRATIO,MEDV
0,6.575,4.98,15.3,504000.0
1,6.421,9.14,17.8,453600.0
2,7.185,4.03,17.8,728700.0
3,6.998,2.94,18.7,701400.0
4,7.147,5.33,18.7,760200.0
...,...,...,...,...
484,6.593,9.67,21.0,470400.0
485,6.120,9.08,21.0,432600.0
486,6.976,5.64,21.0,501900.0
487,6.794,6.48,21.0,462000.0


In [8]:
# List the available column names before picking features and target.
df.columns

Index(['RM', 'LSTAT', 'PTRATIO', 'MEDV'], dtype='object')

In [9]:
# Predictor columns fed to the model; MEDV is left out because it is the target.
feature_columns = ['RM', 'LSTAT', 'PTRATIO']
X = df[feature_columns]

In [10]:
# Inspect the feature matrix (three predictor columns).
X

Unnamed: 0,RM,LSTAT,PTRATIO
0,6.575,4.98,15.3
1,6.421,9.14,17.8
2,7.185,4.03,17.8
3,6.998,2.94,18.7
4,7.147,5.33,18.7
...,...,...,...
484,6.593,9.67,21.0
485,6.120,9.08,21.0
486,6.976,5.64,21.0
487,6.794,6.48,21.0


In [11]:
# Target selected with a list, so y is a one-column DataFrame (2-D) rather than a Series.
target_columns = ['MEDV']
y = df[target_columns]

In [12]:
# Inspect the target column.
y

Unnamed: 0,MEDV
0,504000.0
1,453600.0
2,728700.0
3,701400.0
4,760200.0
...,...
484,470400.0
485,432600.0
486,501900.0
487,462000.0


In [13]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Report how many rows landed in each partition.
n_train_rows = len(X_train)
n_test_rows = len(X_test)
print(f"Rows in train set: {n_train_rows}\nRows in test set: {n_test_rows}")

Rows in train set: 391
Rows in test set: 98


## Training the model

In [15]:
# Ordinary least-squares regression fitted on the training partition only.
model = LinearRegression()
# fit() stays the cell's final expression so the cell output is unchanged.
model.fit(X=X_train, y=y_train)

In [16]:
# Predict MEDV for the held-out rows using the fitted model.
y_pred = model.predict(X=X_test)

In [17]:
# Show the raw predictions: a 2-D column array with one row per test sample.
y_pred

array([[342593.79029768],
       [506257.0916297 ],
       [410499.93166174],
       [237792.7411537 ],
       [327005.79653234],
       [403018.068531  ],
       [261060.38389067],
       [701308.47374597],
       [362924.70496746],
       [585818.82333754],
       [456966.23009711],
       [365587.84857713],
       [266036.4241684 ],
       [265799.92818911],
       [385359.28098829],
       [525974.87433762],
       [388922.38353646],
       [365210.2410349 ],
       [365315.35425769],
       [420439.93835104],
       [459794.49010487],
       [461685.28906052],
       [369745.76216645],
       [644034.09840583],
       [467828.26948158],
       [473745.56661447],
       [498572.57258183],
       [634774.91735229],
       [679806.33028785],
       [168957.24703839],
       [514819.05350129],
       [239552.37320321],
       [536885.46626665],
       [508876.38428348],
       [305150.22603695],
       [502246.53271674],
       [633616.8915942 ],
       [498079.88203251],
       [6640

In [18]:
# Mean squared error on the held-out rows; it is in squared MEDV units, hence the large magnitude.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean squared error on the test set: {mse:.2f}')

Mean squared error on the test set: 6789025559.27
