In [1]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [2]:
X = np.array([
    [1, 7, 33],
    [1, 4, 41],
    [1, 16, 7],
    [1, 3, 49],
    [1, 21, 5],
    [1, 8, 31]
])
y = np.array([42, 33, 75, 28, 91, 55])
XT = X.T
XT_X = np.dot(XT, X)
XT_X_inv = np.linalg.inv(XT_X)
XT_y = np.dot(XT, y)
beta_hat = np.dot(XT_X_inv, XT_y)
beta_hat


array([33.93210327,  2.7847614 , -0.26441893])

In [3]:
XT_X.shape

(3, 3)

In [4]:
H = np.dot(np.dot(X, XT_X_inv), XT)
H

array([[ 0.23143293,  0.25167585,  0.21178735,  0.14886839, -0.05475543,
         0.21099091],
       [ 0.25167585,  0.31240459,  0.09437844,  0.26627729, -0.14787283,
         0.22313666],
       [ 0.21178735,  0.09437844,  0.70442026, -0.31917435,  0.10446672,
         0.20412159],
       [ 0.14886839,  0.26627729, -0.31917435,  0.61425632,  0.14143492,
         0.14833743],
       [-0.05475543, -0.14787283,  0.10446672,  0.14143492,  0.94039955,
         0.01632707],
       [ 0.21099091,  0.22313666,  0.20412159,  0.14833743,  0.01632707,
         0.19708635]])

In [5]:
n = H.shape[0]
I = np.eye(n)
I_minus_H = I - H
e = np.dot(I_minus_H, y)
e

array([-2.69960842, -1.22997279, -1.63735316, -1.32985996, -0.08999801,
        6.98679233])

In [6]:
SSE = np.dot(e.T, e)
SSE

62.07353819605439

In [7]:
n = len(y)
p = X.shape[1]
sigma_squared = SSE / (n - p)
var_beta_hat = sigma_squared * XT_X_inv
var_beta_hat

array([[ 7.15471135e+02, -3.41589166e+01, -1.35949371e+01],
       [-3.41589166e+01,  1.66166636e+00,  6.44067375e-01],
       [-1.35949371e+01,  6.44067375e-01,  2.62467755e-01]])

In [8]:
xh = np.array([1, 10, 30]).reshape(1, -1)
yh = np.dot(xh, beta_hat)
yh

array([53.8471494])

In [9]:
xh.shape

(1, 3)

In [10]:
xht = xh.T
var_yh = sigma_squared*(xh @ XT_X_inv @ xht)
var_yh

array([[5.4246197]])

In [11]:
var_yh2 = sigma_squared * np.dot(np.dot(xh, XT_X_inv), xht)
var_yh2

array([[5.4246197]])

In [12]:
insurance = pd.read_csv('../data/insurance.csv')
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [13]:
reg = smf.ols('charges~age+bmi+children',data=insurance).fit()
reg.summary()

0,1,2,3
Dep. Variable:,charges,R-squared:,0.12
Model:,OLS,Adj. R-squared:,0.118
Method:,Least Squares,F-statistic:,60.69
Date:,"Thu, 19 Sep 2024",Prob (F-statistic):,8.8e-37
Time:,20:36:51,Log-Likelihood:,-14392.0
No. Observations:,1338,AIC:,28790.0
Df Residuals:,1334,BIC:,28810.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-6916.2433,1757.480,-3.935,0.000,-1.04e+04,-3468.518
age,239.9945,22.289,10.767,0.000,196.269,283.720
bmi,332.0834,51.310,6.472,0.000,231.425,432.741
children,542.8647,258.241,2.102,0.036,36.261,1049.468

0,1,2,3
Omnibus:,325.395,Durbin-Watson:,2.012
Prob(Omnibus):,0.0,Jarque-Bera (JB):,603.372
Skew:,1.52,Prob(JB):,9.54e-132
Kurtosis:,4.255,Cond. No.,290.0


In [14]:
sm.stats.anova_lm(reg, typ=1)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
age,1.0,17530190000.0,17530190000.0,135.546341,6.627851e-30
bmi,1.0,5446449000.0,5446449000.0,42.112843,1.211545e-10
children,1.0,571519000.0,571519000.0,4.41908,0.03572625
Residual,1334.0,172526100000.0,129329900.0,,


In [15]:
sm.stats.anova_lm(reg, typ=2)

Unnamed: 0,sum_sq,df,F,PR(>F)
age,14994260000.0,1.0,115.938067,5.533922999999999e-26
bmi,5417280000.0,1.0,41.887301,1.354882e-10
children,571519000.0,1.0,4.41908,0.03572625
Residual,172526100000.0,1334.0,,


In [16]:
column_names = ['rental_rates', 'age', 'operating_expenses', 'vacancy_rate', 'total_square_footage'] 
property = pd.read_csv('../data/property.txt', delim_whitespace=True, header=None, names=column_names)
property

  property = pd.read_csv('../data/property.txt', delim_whitespace=True, header=None, names=column_names)


Unnamed: 0,rental_rates,age,operating_expenses,vacancy_rate,total_square_footage
0,13.50,1,5.02,0.14,123000
1,12.00,14,8.19,0.27,104079
2,10.50,16,3.00,0.00,39998
3,15.00,4,10.70,0.05,57112
4,14.00,11,8.97,0.07,60000
...,...,...,...,...,...
76,14.50,3,10.67,0.00,43404
77,13.50,18,8.60,0.08,59443
78,15.00,15,11.97,0.14,254700
79,15.25,11,11.27,0.03,434746


In [17]:
model =smf.ols('rental_rates~age + operating_expenses + vacancy_rate + total_square_footage',data=property).fit()
model.summary()

0,1,2,3
Dep. Variable:,rental_rates,R-squared:,0.585
Model:,OLS,Adj. R-squared:,0.563
Method:,Least Squares,F-statistic:,26.76
Date:,"Thu, 19 Sep 2024",Prob (F-statistic):,7.27e-14
Time:,20:36:51,Log-Likelihood:,-122.75
No. Observations:,81,AIC:,255.5
Df Residuals:,76,BIC:,267.5
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,12.2006,0.578,21.110,0.000,11.049,13.352
age,-0.1420,0.021,-6.655,0.000,-0.185,-0.100
operating_expenses,0.2820,0.063,4.464,0.000,0.156,0.408
vacancy_rate,0.6193,1.087,0.570,0.570,-1.545,2.784
total_square_footage,7.924e-06,1.38e-06,5.722,0.000,5.17e-06,1.07e-05

0,1,2,3
Omnibus:,1.922,Durbin-Watson:,1.58
Prob(Omnibus):,0.383,Jarque-Bera (JB):,1.301
Skew:,0.148,Prob(JB):,0.522
Kurtosis:,3.545,Cond. No.,1740000.0


In [18]:
yhat = model.fittedvalues.head(6)
yhat

0    14.535672
1    13.513806
2    11.091053
3    15.133568
4    13.686716
5    13.687185
dtype: float64

In [19]:
res = model.resid.head(6)
res

0   -1.035672
1   -1.513806
2   -0.591053
3   -0.133568
4    0.313284
5   -3.187185
dtype: float64

In [20]:
sse = sum(res**2)
sse

13.987708188161431

In [21]:
sigma_squared = sse/(81-5)
sigma_squared

0.1840487919494925

In [22]:
new_data = pd.DataFrame({'const': [1], 
                         'age': [4], 
                         'operating_expenses': [10], 
                         'vacancy_rate': [0.1], 
                         'total_square_footage': [80000]})
pred_interval = model.get_prediction(new_data).summary_frame(alpha=0.10) 
pred_interval

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper
0,15.148495,0.190898,14.830621,15.46637,13.228907,17.068083
