In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sb
import plotly.express as px
plt.style.use('default')

> 1) Read dataset

In [10]:
boston = pd.read_csv("D:\\PROGRAMMING\\Datasets\\Boston.csv")
boston.head()

Unnamed: 0.1,Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


> 2) Find dependent and independent variables

In [11]:
X = pd.DataFrame(boston.iloc[:, :-1])
y = pd.DataFrame(boston.iloc[:, -1])

> 3. Check for the significance

In [12]:
# The inclusion of a constant allows the regression line to have an intercept point with 
# the y-axis, even when all independent variables are zero.

In [13]:
##level of significance
alpha = 0.05

## Add constant to the independent variable
X = sm.add_constant(X)

sig_est = sm.OLS(y, X)## OLS( Ordinary Least Square )
result = sig_est.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                   medv   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     100.6
Date:                Tue, 11 Jul 2023   Prob (F-statistic):          3.44e-134
Time:                        19:26:22   Log-Likelihood:                -1498.0
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     491   BIC:                             3089.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         36.4614      5.101      7.148      0.0

In [14]:
##Checking for the simple linear regression
check = sm.OLS(boston['medv'] , boston['age']).fit()
check.summary()

0,1,2,3
Dep. Variable:,medv,R-squared (uncentered):,0.644
Model:,OLS,Adj. R-squared (uncentered):,0.644
Method:,Least Squares,F-statistic:,915.1
Date:,"Tue, 11 Jul 2023",Prob (F-statistic):,1.85e-115
Time:,19:26:22,Log-Likelihood:,-2071.5
No. Observations:,506,AIC:,4145.0
Df Residuals:,505,BIC:,4149.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,0.2636,0.009,30.250,0.000,0.246,0.281

0,1,2,3
Omnibus:,27.739,Durbin-Watson:,0.357
Prob(Omnibus):,0.0,Jarque-Bera (JB):,19.564
Skew:,0.369,Prob(JB):,5.65e-05
Kurtosis:,2.38,Cond. No.,1.0


> 4. Dropping the necessary columns

In [15]:
## unnamed:0,Indus and Age are statistically insignificant for we accept null hypothesis in this
## case and reject the alternate hypothsis, and we will drop those attributes from our model

X = X.drop(["Unnamed: 0", "indus", "age"], axis = 1)
X.head()

Unnamed: 0,const,crim,zn,chas,nox,rm,dis,rad,tax,ptratio,black,lstat
0,1.0,0.00632,18.0,0,0.538,6.575,4.09,1,296,15.3,396.9,4.98
1,1.0,0.02731,0.0,0,0.469,6.421,4.9671,2,242,17.8,396.9,9.14
2,1.0,0.02729,0.0,0,0.469,7.185,4.9671,2,242,17.8,392.83,4.03
3,1.0,0.03237,0.0,0,0.458,6.998,6.0622,3,222,18.7,394.63,2.94
4,1.0,0.06905,0.0,0,0.458,7.147,6.0622,3,222,18.7,396.9,5.33


> 5. Find the coefficients | p_values | Confidence Interval

In [16]:
print("\nThe Coefficiencts are : ")
result.params


The Coefficiencts are : 


const         36.461352
Unnamed: 0    -0.002526
crim          -0.108762
zn             0.048031
indus          0.019932
chas           2.705245
nox          -17.541602
rm             3.839225
age           -0.001938
dis           -1.493304
rad            0.324925
tax           -0.011598
ptratio       -0.947985
black          0.009357
lstat         -0.526184
dtype: float64

In [17]:
print("The P-Values are: ")
result.pvalues


The P-Values are: 


const         3.209691e-12
Unnamed: 0    2.250457e-01
crim          1.000250e-03
zn            5.375059e-04
indus         7.458713e-01
chas          1.785946e-03
nox           5.658365e-06
rm            1.245587e-18
age           8.848664e-01
dis           3.682773e-13
rad           2.426287e-06
tax           2.443267e-03
ptratio       1.670700e-12
black         5.364596e-04
lstat         6.050328e-23
dtype: float64

In [18]:
print("Confidence Intervals are: ")
result.conf_int()


Confidence Intervals are: 


Unnamed: 0,0,1
const,26.438882,46.483822
Unnamed: 0,-0.006612,0.00156
crim,-0.173316,-0.044209
zn,0.020946,0.075115
indus,-0.100841,0.140705
chas,1.01296,4.397531
nox,-25.051861,-10.031344
rm,3.017106,4.661344
age,-0.028227,0.02435
dis,-1.886054,-1.100554


> 6. Train Test Split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(404, 12)
(102, 12)
(404, 1)
(102, 1)


7. Training Our Model

In [20]:
mlr = LinearRegression()
mlr.fit(X_train, y_train)

8. Predicting Value

In [21]:
y_predict = mlr.predict(X_test)


final = pd.DataFrame({'Actual':y_test.values.flatten(), 'Predicted':y_predict.flatten()})
final

Unnamed: 0,Actual,Predicted
0,34.6,34.496490
1,31.5,30.868682
2,20.6,22.304769
3,14.5,18.131193
4,16.2,20.541658
...,...,...
97,50.0,36.370316
98,7.2,18.015547
99,50.0,23.490485
100,14.0,13.702219


In [22]:
px.scatter(final, 'Actual', 'Predicted', trendline = 'ols', trendline_color_override='blue')

In [23]:
px.scatter_3d(final, 'Actual', 'Predicted', final['Actual']-final['Predicted'])

9. Necessary observations

In [24]:
mae = mean_absolute_error(y_test, y_predict)
mse = mean_squared_error(y_test, y_predict)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_predict)

n = len(y_test)     ## no of samples
p = X_test.shape[1] ## no of predictors
adjusted_r2 = 1 - ( 1 - r2 )*( n - 1) / n - p - 1


print("mean_absolute_error is: ", mae)
print("mean_squared_error is : ", mse)
print("root_mean_squared_error is : ", rmse)
print("r square is  : ", r2)
print("adjusted r square is  : ", adjusted_r2)## it decreases as  features/predictors increase

mean_absolute_error is:  3.2518545636225586
mean_squared_error is :  23.425938278313655
root_mean_squared_error is :  4.840034945980623
r square is  :  0.7574812283240356
adjusted r square is  :  -12.240141136659533
