# California housing dataset (focusing on multicollinearity)

1. Fit an OLS model
1. Assess collinearity
1. Drop columns
1. Combine columns
1. Ridge regression
1. LASSO

---

In [1]:
# import modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.stats.outliers_influence \
    import variance_inflation_factor as VIF
from statsmodels.stats.anova import anova_lm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS, summarize, poly)
from scipy import stats
from sklearn.datasets import fetch_california_housing


## Fit an OLS model


In [2]:
# Read in the California housing dataset via scikit-learn's loader.
# `fetch_california_housing` is imported in the notebook's import cell so that
# every dependency is declared in one place.
housing = fetch_california_housing()

In [3]:
# Response vector: median house value in $100,000s.
y = housing.target

# Predictor matrix as a labelled DataFrame, one column per feature name
# supplied by the loader.
X = pd.DataFrame(data=housing.data, columns=housing.feature_names)

In [4]:
# Ordinary least squares of y on all columns of X.
# NOTE(review): X is passed as-is, without an added constant column, so this
# fit has no intercept term (the summary accordingly reports an *uncentered*
# R-squared). If an intercept is intended, pass sm.add_constant(X) instead.
model_OLS = sm.OLS(endog=y, exog=X)
results_OLS = model_OLS.fit()

# Compact coefficient table (coef, std err, t, P>|t|) from ISLP.
summarize(results_OLS)

Unnamed: 0,coef,std err,t,P>|t|
MedInc,0.5135,0.004,120.594,0.0
HouseAge,0.0157,0.0,33.727,0.0
AveRooms,-0.1825,0.006,-29.673,0.0
AveBedrms,0.8651,0.03,28.927,0.0
Population,8e-06,5e-06,1.53,0.126
AveOccup,-0.0047,0.001,-8.987,0.0
Latitude,-0.0639,0.004,-17.826,0.0
Longitude,-0.0164,0.001,-14.381,0.0


In [6]:
# Full statsmodels summary for the fitted OLS model: overall fit statistics,
# the coefficient table with confidence intervals, and residual diagnostics.
results_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.892
Model:,OLS,Adj. R-squared (uncentered):,0.892
Method:,Least Squares,F-statistic:,21370.0
Date:,"Tue, 26 Aug 2025",Prob (F-statistic):,0.0
Time:,18:13:04,Log-Likelihood:,-24087.0
No. Observations:,20640,AIC:,48190.0
Df Residuals:,20632,BIC:,48250.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
MedInc,0.5135,0.004,120.594,0.000,0.505,0.522
HouseAge,0.0157,0.000,33.727,0.000,0.015,0.017
AveRooms,-0.1825,0.006,-29.673,0.000,-0.195,-0.170
AveBedrms,0.8651,0.030,28.927,0.000,0.806,0.924
Population,7.792e-06,5.09e-06,1.530,0.126,-2.19e-06,1.78e-05
AveOccup,-0.0047,0.001,-8.987,0.000,-0.006,-0.004
Latitude,-0.0639,0.004,-17.826,0.000,-0.071,-0.057
Longitude,-0.0164,0.001,-14.381,0.000,-0.019,-0.014

0,1,2,3
Omnibus:,4353.392,Durbin-Watson:,0.909
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14087.489
Skew:,1.069,Prob(JB):,0.0
Kurtosis:,6.436,Cond. No.,10300.0
