In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
%matplotlib inline

In [2]:
# 3.6.2 - simple linear regression
# Read the Boston housing CSV (path is relative to this notebook) and preview
# the first five rows.
boston = pd.read_csv(filepath_or_buffer='../data/boston.csv')
boston.head(n=5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [3]:
# fit linear regression - R: lm.fit = lm(medv ~ lstat)
# Response (medv) and predictor (lstat) are each turned into a one-column
# 2-D array before being handed to the estimator.
y = boston['medv'].to_numpy()[:, np.newaxis]
X = boston['lstat'].to_numpy()[:, np.newaxis]
model = LinearRegression()
model.fit(X=X, y=y)

LinearRegression()

In [4]:
# R: lm.fit
# Show the fitted intercept and the slope for lstat. Both print with array
# brackets (1-element and 1x1 respectively) because y was passed to fit() as a
# column vector rather than a flat array.
print(f'Intercept: {model.intercept_}, Coefficeint (lstat): {model.coef_}')

Intercept: [34.55384088], Coefficeint (lstat): [[-0.95004935]]


In [5]:
# In-sample predictions: the model is scored on the same X it was fitted on.
# No train/test split is needed to mirror R's summary(lm.fit) -- the r2 computed
# from these predictions (0.5441) equals R's Multiple R-squared.
# A hold-out variant is kept here, disabled, for reference only:
# lr = LinearRegression()
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# lr.fit(X_train, y_train)
y_pred = model.predict(X)

In [6]:
# R: summary(lm.fit)
def regression_results(y_true, y_pred, ndigits=4):
    """Print goodness-of-fit metrics for a set of regression predictions.

    Reports explained variance, R^2, MAE, MSE and RMSE, one per line, each
    rounded to ``ndigits`` decimal places. The report goes to stdout; nothing
    is returned.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    ndigits : int, default 4
        Number of decimal places each metric is rounded to.

    Notes
    -----
    * Mean squared log error is deliberately not reported; the original
      author noted a "negative error" when trying it on this data.
    * RMSE here is the square root of the plain mean of squared residuals,
      so it is slightly smaller than R's "Residual standard error", which
      divides by the residual degrees of freedom instead of n.
    """
    # MSE is computed once and reused for RMSE.
    mse = metrics.mean_squared_error(y_true, y_pred)

    # Labels keep their trailing space so the printed layout is unchanged.
    report = {
        'explained_variance: ': metrics.explained_variance_score(y_true, y_pred),
        'r2: ': metrics.r2_score(y_true, y_pred),
        'MAE: ': metrics.mean_absolute_error(y_true, y_pred),
        'MSE: ': mse,
        'RMSE: ': np.sqrt(mse),
    }
    for label, value in report.items():
        print(label, round(value, ndigits))

regression_results(y, y_pred)

explained_variance:  0.5441
r2:  0.5441
MAE:  4.5053
MSE:  38.483
RMSE:  6.2035


In [7]:
'''
To get a report closer to R's summary(lm.fit), use statsmodels.
sklearn has no R-style regression summary report. The main reason is 
that sklearn is used for predictive modelling / machine learning and the evaluation 
criteria are based on performance on previously unseen data (such as predictive r^2 
for regression).
'''
# https://datatofish.com/statsmodels-linear-regression/
# add_constant supplies the intercept term (the `const` row in the summary);
# lstat is reported as `x1` because X is an unnamed NumPy array.
# NOTE: this rebinds both X and model. From here on X carries the extra constant
# column and `model` is the fitted statsmodels OLS result, no longer the sklearn
# LinearRegression from the earlier cells -- re-run from the top before
# re-executing those cells.
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
# In-sample fitted values; not used by any cell visible in this section.
predictions = model.predict(X) 
# Last expression of the cell, so the summary tables are rendered as its output.
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.544
Model:,OLS,Adj. R-squared:,0.543
Method:,Least Squares,F-statistic:,601.6
Date:,"Sat, 05 Mar 2022",Prob (F-statistic):,5.08e-88
Time:,10:08:16,Log-Likelihood:,-1641.5
No. Observations:,506,AIC:,3287.0
Df Residuals:,504,BIC:,3295.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,34.5538,0.563,61.415,0.000,33.448,35.659
x1,-0.9500,0.039,-24.528,0.000,-1.026,-0.874

0,1,2,3
Omnibus:,137.043,Durbin-Watson:,0.892
Prob(Omnibus):,0.0,Jarque-Bera (JB):,291.373
Skew:,1.453,Prob(JB):,5.36e-64
Kurtosis:,5.319,Cond. No.,29.7


# R summary comparison
Data in both summaries:
- Coefficients match Python
- Multiple R-squared is R-squared in Python
- Adjusted R-squared is Adj. R-squared in Python
- F-statistic is F-statistic in Python
    - the "1 and 504 DF" can be found in Df Model and Df Residuals in Python
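- p-value (of the F-statistic) is Prob (F-statistic) in Python: 5.08e-88, which R reports only as < 2.2e-16; the per-coefficient Pr(>|t|) values are in the P>|t| column, where they are so small they display as 0.000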

Data missing from Python:
- Residuals (Min, 1Q, Median, 3Q, Max)
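- Residual standard error (the 504 degrees of freedom are listed as Df Residuals, but not the 6.216 itself; it can be computed as `np.sqrt(model.mse_resid)`. The sklearn RMSE of 6.2035 above is slightly smaller because it divides by n = 506 rather than by the 504 residual degrees of freedom)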
- Signif. codes

```
Residuals:
    Min      1Q  Median      3Q     Max 
-15.168  -3.990  -1.318   2.034  24.500 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 34.55384    0.56263   61.41   <2e-16 ***
lstat       -0.95005    0.03873  -24.53   <2e-16 ***
---
Signif. codes:  
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.216 on 504 degrees of freedom
Multiple R-squared:  0.5441,	Adjusted R-squared:  0.5432 
F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16
```