# California housing dataset (focusing on multicollinearity)

1. Fit an OLS model
1. Assess collinearity
1. Drop columns
1. Combine columns
1. Ridge regression
1. LASSO

---

In [2]:
# import modules

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.stats.outliers_influence \
    import variance_inflation_factor as VIF
from statsmodels.stats.anova import anova_lm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS, summarize, poly)
from scipy import stats


## Fit an OLS model


In [3]:
# Load the California housing dataset through scikit-learn's fetch helper.
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()

In [4]:
# Response: median house value, expressed in units of $100,000.
y = housing.target

# Predictors: wrap the raw feature matrix in a DataFrame so every column
# carries its feature name.
X = pd.DataFrame(data=housing.data, columns=housing.feature_names)

In [90]:
# Mean latitude / longitude of the observations; used further down as the
# reference point for the combined distance feature.  Computed from X rather
# than pasted in by hand, so the values cannot drift from (or be mistyped
# relative to) the data actually loaded.
lat_mean = X['Latitude'].mean()
long_mean = X['Longitude'].mean()

# Summary statistics for every predictor.  Its "mean" row contains the two
# values above.  Kept as the cell's last expression so it is displayed.
X.describe()

In [6]:
# Baseline: least-squares fit of y on all eight raw predictors.
# NOTE(review): X is passed as-is, with no constant column, so this model has
# no intercept term (the report below shows "uncentered" R-squared for that
# reason).  Confirm this is intended before interpreting the coefficients.
model_OLS = sm.OLS(endog=y, exog=X)
results_OLS = model_OLS.fit()

# Compact coefficient table (coef, std err, t, p-value).
summarize(results_OLS)

Unnamed: 0,coef,std err,t,P>|t|
MedInc,0.5135,0.004,120.594,0.0
HouseAge,0.0157,0.0,33.727,0.0
AveRooms,-0.1825,0.006,-29.673,0.0
AveBedrms,0.8651,0.03,28.927,0.0
Population,8e-06,5e-06,1.53,0.126
AveOccup,-0.0047,0.001,-8.987,0.0
Latitude,-0.0639,0.004,-17.826,0.0
Longitude,-0.0164,0.001,-14.381,0.0


In [7]:
# Full regression report for the baseline fit: fit statistics, coefficient
# table with confidence intervals, and residual diagnostics.
results_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.892
Model:,OLS,Adj. R-squared (uncentered):,0.892
Method:,Least Squares,F-statistic:,21370.0
Date:,"Fri, 29 Aug 2025",Prob (F-statistic):,0.0
Time:,16:19:24,Log-Likelihood:,-24087.0
No. Observations:,20640,AIC:,48190.0
Df Residuals:,20632,BIC:,48250.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
MedInc,0.5135,0.004,120.594,0.000,0.505,0.522
HouseAge,0.0157,0.000,33.727,0.000,0.015,0.017
AveRooms,-0.1825,0.006,-29.673,0.000,-0.195,-0.170
AveBedrms,0.8651,0.030,28.927,0.000,0.806,0.924
Population,7.792e-06,5.09e-06,1.530,0.126,-2.19e-06,1.78e-05
AveOccup,-0.0047,0.001,-8.987,0.000,-0.006,-0.004
Latitude,-0.0639,0.004,-17.826,0.000,-0.071,-0.057
Longitude,-0.0164,0.001,-14.381,0.000,-0.019,-0.014

0,1,2,3
Omnibus:,4353.392,Durbin-Watson:,0.909
Prob(Omnibus):,0.0,Jarque-Bera (JB):,14087.489
Skew:,1.069,Prob(JB):,0.0
Kurtosis:,6.436,Cond. No.,10300.0


In [19]:
# Variance inflation factor of every predictor in X.
# VIF(exog, exog_idx) takes the whole design matrix plus the *position* of the
# column to assess, so walk the columns by position.
# NOTE(review): X has no constant column here, matching the no-intercept fit
# above -- confirm that is the intended basis for these VIFs.
vals = [VIF(X, position) for position, _ in enumerate(X.columns)]

# One row per predictor, labelled with the column name.
vif = pd.DataFrame({'vif': vals}, index=X.columns)
vif

Unnamed: 0,vif
MedInc,11.51114
HouseAge,7.195917
AveRooms,45.993601
AveBedrms,43.590314
Population,2.935745
AveOccup,1.095243
Latitude,559.874071
Longitude,633.711654


## Handling collinearity

In [91]:
# Reference point for the combined location feature.
lat = lat_mean
long = long_mean

# Build the reduced design matrix in one chain:
#   1. drop AveBedrms,
#   2. collapse Latitude/Longitude into a single straight-line distance (in
#      latitude/longitude coordinate space) from the reference point,
#   3. drop the two original coordinate columns.
# NOTE(review): despite its name, 'SF_Dist' is measured from (lat_mean,
# long_mean) -- the mean coordinates of the data -- not from San Francisco.
newX = (
    X.drop(columns="AveBedrms")
     .assign(
         SF_Dist=lambda frame: (
             (frame['Latitude'] - lat) ** 2 + (frame['Longitude'] - long) ** 2
         ) ** 0.5
     )
     .drop(columns=['Latitude', 'Longitude'])
)

# Refit on the reduced predictors (again without a constant column).
model_OLS2 = sm.OLS(endog=y, exog=newX)
results_OLS2 = model_OLS2.fit()
summarize(results_OLS2)

Unnamed: 0,MedInc,HouseAge,AveRooms,Population,AveOccup,SF_Dist
0,8.3252,41.0,6.984127,322.0,2.555556,3.482938
1,8.3014,21.0,6.238137,2401.0,2.109842,3.462402
2,7.2574,52.0,8.288136,496.0,2.802260,3.471337
3,5.6431,52.0,5.817352,558.0,2.547945,3.479035
4,3.8462,52.0,6.281853,565.0,2.181467,3.479035
...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,845.0,2.560606,4.137476
20636,2.5568,18.0,6.114035,356.0,3.122807,4.192259
20637,1.7000,17.0,5.205543,1007.0,2.325635,4.141084
20638,1.8672,18.0,5.329513,741.0,2.123209,4.181942


In [93]:
# Full regression report for the reduced model (AveBedrms dropped, Latitude and
# Longitude replaced by SF_Dist).
results_OLS2.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.885
Model:,OLS,Adj. R-squared (uncentered):,0.885
Method:,Least Squares,F-statistic:,26440.0
Date:,"Fri, 29 Aug 2025",Prob (F-statistic):,0.0
Time:,17:54:05,Log-Likelihood:,-24774.0
No. Observations:,20640,AIC:,49560.0
Df Residuals:,20634,BIC:,49610.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
MedInc,0.4437,0.003,149.919,0.000,0.438,0.449
HouseAge,0.0180,0.000,49.747,0.000,0.017,0.019
AveRooms,-0.0233,0.002,-10.245,0.000,-0.028,-0.019
Population,3.033e-05,4.63e-06,6.553,0.000,2.13e-05,3.94e-05
AveOccup,-0.0047,0.001,-8.650,0.000,-0.006,-0.004
SF_Dist,-0.0252,0.005,-5.402,0.000,-0.034,-0.016

0,1,2,3
Omnibus:,4377.539,Durbin-Watson:,0.812
Prob(Omnibus):,0.0,Jarque-Bera (JB):,11374.515
Skew:,1.153,Prob(JB):,0.0
Kurtosis:,5.813,Cond. No.,1550.0


In [94]:
# Recompute the variance inflation factors on the reduced design matrix.
# VIF(exog, exog_idx) takes the whole matrix plus the *position* of the column
# to assess.
vals = [VIF(newX, position) for position, _ in enumerate(newX.columns)]

# One row per remaining predictor, labelled with the column name.
vif = pd.DataFrame({'vif': vals}, index=newX.columns)

vif

Unnamed: 0,vif
MedInc,5.203287
HouseAge,4.102601
AveRooms,5.865917
Population,2.269656
AveOccup,1.094622
SF_Dist,5.95508


In [95]:
# Coefficient standard errors of the baseline (all-predictor) fit.
results_OLS.bse

MedInc        0.004258
HouseAge      0.000464
AveRooms      0.006151
AveBedrms     0.029906
Population    0.000005
AveOccup      0.000523
Latitude      0.003587
Longitude     0.001139
dtype: float64

In [96]:
# Coefficient standard errors of the reduced fit.
results_OLS2.bse

MedInc        0.002960
HouseAge      0.000362
AveRooms      0.002271
Population    0.000005
AveOccup      0.000540
SF_Dist       0.004662
dtype: float64

In [97]:
# Estimated coefficients of the baseline (all-predictor) fit.
results_OLS.params

MedInc        0.513515
HouseAge      0.015651
AveRooms     -0.182528
AveBedrms     0.865099
Population    0.000008
AveOccup     -0.004699
Latitude     -0.063946
Longitude    -0.016383
dtype: float64

In [98]:
# Estimated coefficients of the reduced fit.
results_OLS2.params

MedInc        0.443695
HouseAge      0.018019
AveRooms     -0.023266
Population    0.000030
AveOccup     -0.004675
SF_Dist      -0.025183
dtype: float64