## Setup



In [47]:
import pandas as pd
import numpy as np 
# Modeling 
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
# data
from sklearn.datasets import fetch_california_housing
# Statsmodel
import statsmodels.api as sm

## Load Dataset & EDA

In [48]:
data = fetch_california_housing(as_frame= True)
df = data.frame.copy()
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [49]:
df.shape

(20640, 9)

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


###  Check missing value

In [51]:
df.isnull().sum()

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

In [52]:
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [53]:
df.corr(numeric_only = True)['MedHouseVal'].sort_values(ascending = False).head()

MedHouseVal    1.000000
MedInc         0.688075
AveRooms       0.151948
HouseAge       0.105623
AveOccup      -0.023737
Name: MedHouseVal, dtype: float64

## Train Test & Split

In [54]:
X  = df.drop(columns = ['MedHouseVal'])
y = df['MedHouseVal']
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Baseline OLS

In [55]:
ols = Pipeline(steps=[('scaler', StandardScaler()),
                ('lin', LinearRegression())])
# Fit on train set
ols.fit(X_train,y_train)
# Test on test set
pred = ols.predict(X_test)
# Compute score btw test outcome and its prediction
rsme = mean_squared_error(y_test, pred)
mae = mean_absolute_error(y_test,pred)
r2 = r2_score(y_test,pred)
print(f"Mean squared error: {rsme}")
print(f"Mean absolute error: {mae}")

print(f"Root squared error: {r2}")


Mean squared error: 0.5558915986952438
Mean absolute error: 0.5332001304956565
Root squared error: 0.5757877060324512


## K-Fold Cross Validation 

In [56]:
cv = KFold(n_splits = 5, shuffle = True, random_state = 42)
cv_rsme = -cross_val_score(ols,X,y,cv =cv, scoring = "neg_root_mean_squared_error")
print(f"cv_rsme mean: {cv_rsme.mean()}")
print(f"cv_rsme std: {cv_rsme.std()}")

cv_rsme mean: 0.7282509142479743
cv_rsme std: 0.014935522116016418


## Lasso with CV 

In [64]:
grid = {'alpha': np.logspace(-3,2,30)}
lasso = Pipeline([('scaler' , StandardScaler()),
                  ('model', Lasso(max_iter =1000, random_state = 42))
])
gs_lasso = GridSearchCV(lasso, {'model__alpha': grid["alpha"]},cv = cv,
                        scoring='neg_root_mean_squared_error', n_jobs=-1)

gs_lasso.fit(X_train,y_train)

pred_lasso = gs_lasso.best_estimator_.predict(X_test)
rmse_lasso = mean_squared_error(y_test, pred_lasso)
r2_lasso   = r2_score(y_test, pred_lasso)
print(f"Lasso mean squared error: {rmse_lasso}")
print(f"Lasso root squared error: {mae}")


Lasso mean squared error: 0.5544913600832686
Lasso root squared error: 0.5332001304956565


## Ridge regression

In [66]:
ridge = Pipeline([
    ('scaler', StandardScaler()),
    ('model', Ridge(random_state=42))
])
gs_ridge = GridSearchCV(ridge, {'model__alpha': grid['alpha']}, cv=cv,
                        scoring='neg_root_mean_squared_error', n_jobs=-1)
gs_ridge.fit(X_train, y_train)

pred_ridge = gs_ridge.best_estimator_.predict(X_test)
rmse_ridge = mean_squared_error(y_test, pred_ridge)
r2_ridge   = r2_score(y_test, pred_ridge)
rmse_ridge, r2_ridge
print(f"Ridge mean squared error: {rmse_ridge}")
print(f"Ridge root squared error: {r2_ridge}")

Ridge mean squared error: 0.5557890717749265
Ridge root squared error: 0.575865946430693
