In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Load the sleep75 dataset from a CSV file located relative to the working directory
df_sleep = pd.read_csv('sleep75.csv')
df_sleep.shape # dataset size as (number of rows, number of columns)

## Regression model
Consider a regression of **sleep on totwrk & age**.
Its specification is 
$$sleep=\beta_0+\beta_1totwrk+\beta_2age+u $$
Here
* sleep is an endogenous/dependent variable
* totwrk & age are exogenous/explanatory variables (predictors)

## Matrices of regression design:
* y=sleep
* X=[1, totwrk, age]

In [None]:
# Dependent variable: the 'sleep' column as a pandas Series
y = df_sleep['sleep']
y.head()  # preview the first rows

In [None]:
# Design matrix: the regressors totwrk and age plus a constant column
# added by sm.add_constant, which provides the intercept term
X = sm.add_constant(df_sleep[['totwrk', 'age']])
X.head()  # preview the first rows

## Regression fitting via specification

In [None]:
# Fit OLS from a formula specification, reading the variables from df_sleep by column name
fitted_regr = smf.ols(formula='sleep~totwrk+age', data=df_sleep).fit()
# estimated coefficients: .params property
fitted_regr.params

## Regression fitting via matrices of regression design
*Remark*: missing='drop' means that we exclude observations with missing values

In [None]:
# Fit the same model from the explicit y vector and X matrix (X already holds the constant).
# NOTE: this rebinds fitted_regr, so every later cell uses this fit, not the formula-based one.
fitted_regr = sm.OLS(endog=y, exog=X, missing='drop').fit()
fitted_regr.params  # estimated coefficients

## Regression fitting via solving a system of linear equations
The estimated coefficients are the solution of the system of linear equations (the normal equations), written in matrix form as $(X^\top X)\beta=X^\top y$

Its solution $\hat{\beta}_{OLS}=(X^\top X)^{-1}(X^\top y)$ 

In [None]:
# X'X: square cross-product matrix of the design-matrix columns (constant included)
X.T@X

In [None]:
# X'y: vector of cross-products of each design-matrix column with the dependent variable
X.T@y

In [None]:
# Solve the normal equations (X'X) b = X'y directly rather than forming the
# explicit inverse: np.linalg.solve is more accurate and cheaper than inv(A) @ rhs.
# The operands are converted to plain arrays so the result type does not depend
# on pandas' array wrapping, then relabelled with the regressor names.
# NOTE: this uses every row of X and y as-is; no missing-value handling is done here.
pd.Series(
    np.linalg.solve((X.T @ X).to_numpy(), (X.T @ y).to_numpy()),
    index=X.columns,
)

## Goodness-of-Fit
TSS (Total Sum of Squares)

In [None]:
# total sum of squares of the dependent variable, centered about its mean
fitted_regr.centered_tss

ESS (Explained Sum of Squares)

In [None]:
# explained sum of squares of the fitted model
fitted_regr.ess

RSS (Residual Sum of Squares)

In [None]:
# sum of squared residuals (statsmodels exposes the RSS under the name `ssr`)
fitted_regr.ssr

$R^2$, $R^2_{adj}$

In [None]:
# R-squared and adjusted R-squared, displayed together as a tuple
fitted_regr.rsquared, fitted_regr.rsquared_adj

## Fitted values, Residuals, Dependent variable (Actual observations)
Consider indices [0, 3, 78, 197, 401, 561]
Fitted values

In [None]:
# Row labels of the observations to inspect
ind = [0, 3, 78, 197, 401, 561]
# Select by label explicitly: plain [] with integers on a Series is ambiguous
# between label-based and positional lookup, whereas .loc is always label-based.
fitted_regr.fittedvalues.loc[ind]

Residuals

In [None]:
# Residuals for the chosen row labels; .loc makes the label-based lookup explicit
# instead of relying on how plain [] interprets a list of integers.
fitted_regr.resid.loc[ind]

Observations of the dependent variable

In [None]:
# Look up by row label (.loc) rather than by position (.iloc): the fitted model's
# Series are indexed by the row labels of the observations used in the fit, so a
# label-based lookup returns the same observations even if rows with missing
# values were dropped before fitting.
df_sleep.loc[ind, 'sleep']