# Surprise Housing Price Prediction – Ridge & Lasso Regression
# 1. Import required libraries

In [71]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
#include to stop the warning mesages
import warnings
warnings.filterwarnings('ignore')

#load the dataset
data = pd.read_csv('housing_data.csv')

# 2. Load and inspect the dataset

In [88]:
# Read the training data and drop the 'Id' column straight away
# (it is not useful for prediction).
housing = pd.read_csv('train.csv').drop(columns='Id')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
housing.info()
housing.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


# 3. Handle missing values
# --------------------------------------------------
# Numerical variables -> median
# Categorical variables -> mode

In [73]:
# Split the columns by dtype: numeric columns are filled with their median,
# text (object) columns with their most frequent value.
num_cols = housing.select_dtypes(include=['int64', 'float64']).columns
cat_cols = housing.select_dtypes(include=['object']).columns

# NOTE(review): these statistics are computed on the full frame, i.e. before
# the train/test split performed in section 5.
fill_values = {col: housing[col].median() for col in num_cols}
fill_values.update({col: housing[col].mode()[0] for col in cat_cols})

# One replacement value per column, applied in a single pass.
housing = housing.fillna(value=fill_values)

# 4. Create dummy variables

In [74]:
# One-hot encode the text columns; drop_first=True keeps k-1 indicator
# columns for a feature with k levels.
housing_dummies = pd.get_dummies(data=housing, drop_first=True)

# 5. Train-test split

In [75]:
# Separate the target column from the predictors.
y = housing_dummies['SalePrice']
X = housing_dummies.drop(columns=['SalePrice'])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 6. Feature scaling (mandatory for Ridge & Lasso)

In [76]:
# Learn the scaling parameters from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 7. Ridge Regression with hyperparameter tuning

In [86]:
# Candidate regularisation strengths explored by the grid search.
params_ridge = {'alpha': [10.0, 50, 100, 500, 1000, 2000, 3000, 4000, 5000]}
ridge = Ridge()

# 5-fold cross-validated search that ranks each alpha by negated mean squared error.
ridge_cv = GridSearchCV(
    estimator=ridge,
    param_grid=params_ridge,
    scoring='neg_mean_squared_error',
    cv=5,
)
ridge_cv.fit(X_train_scaled, y_train)

# Model fitted with the best-scoring alpha.
best_ridge = ridge_cv.best_estimator_

# Ridge predictions on both splits
y_train_pred_ridge = best_ridge.predict(X_train_scaled)
y_test_pred_ridge = best_ridge.predict(X_test_scaled)

# Errors are reported as RMSE, i.e. in the same units as SalePrice.
ridge_train_mse = mean_squared_error(y_train, y_train_pred_ridge)
ridge_test_mse = mean_squared_error(y_test, y_test_pred_ridge)

print('Ridge Optimal Alpha:', ridge_cv.best_params_['alpha'])
print('Ridge Train RMSE:', np.sqrt(ridge_train_mse))
print('Ridge Test RMSE:', np.sqrt(ridge_test_mse))
print('Ridge Test R2:', r2_score(y_test, y_test_pred_ridge))

Ridge Optimal Alpha: 500
Ridge Train RMSE: 26622.707398815364
Ridge Test RMSE: 31271.824507367804
Ridge Test R2: 0.8598575712046648


# 8. Lasso Regression with hyperparameter tuning

In [85]:

# Generous iteration cap for the Lasso solver.
lasso = Lasso(max_iter=20000)

# Same alpha grid as the Ridge search.
params_lasso = {'alpha': [10.0, 50, 100, 500, 1000, 2000, 3000, 4000, 5000]}

# Tune on negated MSE, the criterion the Ridge search uses, so that both
# models are selected on a like-for-like basis and in line with the RMSE
# figures reported below (the search previously ranked alphas by MAE).
lasso_cv = GridSearchCV(estimator=lasso,
                        param_grid=params_lasso,
                        scoring='neg_mean_squared_error',
                        cv=5,
                        return_train_score=True,
                        verbose=1)

lasso_cv.fit(X_train_scaled, y_train)

# Model fitted with the best-scoring alpha.
best_lasso = lasso_cv.best_estimator_

# Lasso predictions
y_train_pred_lasso = best_lasso.predict(X_train_scaled)
y_test_pred_lasso = best_lasso.predict(X_test_scaled)

print('Lasso Optimal Alpha:', lasso_cv.best_params_['alpha'])
print('Lasso Train RMSE:', np.sqrt(mean_squared_error(y_train, y_train_pred_lasso)))
print('Lasso Test RMSE:', np.sqrt(mean_squared_error(y_test, y_test_pred_lasso)))
print('Lasso Test R2:', r2_score(y_test, y_test_pred_lasso))

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Lasso Optimal Alpha: 500
Lasso Train RMSE: 23581.44997217293
Lasso Test RMSE: 29469.05618527235
Lasso Test R2: 0.8755497844824255


# 9. Important predictors from Lasso

In [87]:
# Label every fitted Lasso coefficient with its feature name.
lasso_coeff = pd.Series(data=best_lasso.coef_, index=X.columns)

# Keep only the coefficients that are not exactly zero, ordered by magnitude.
nonzero_coeff = lasso_coeff.loc[lasso_coeff.ne(0)]
important_features = nonzero_coeff.sort_values(key=lambda coefs: coefs.abs(), ascending=False)

important_features.head(10)

GrLivArea           20177.125316
OverallQual         14418.219954
RoofMatl_CompShg    12439.586285
BsmtQual_Gd        -10818.900742
BsmtQual_TA        -10683.331889
RoofMatl_WdShngl     8576.139753
RoofMatl_Tar&Grv     8281.410733
PoolQC_Gd           -8213.635544
KitchenQual_TA      -8190.913734
MSSubClass          -7926.159260
dtype: float64

# 10. Effect of increasing alpha (regularization strength)

In [None]:
# Refit Lasso with twice the cross-validated alpha to see how stronger
# regularisation changes the retained features. The iteration cap is copied
# from the tuned model so that alpha is the only setting that differs
# (the refit previously used a lower cap than the tuned model).
doubled_alpha = 2 * lasso_cv.best_params_['alpha']
lasso_double_alpha = Lasso(alpha=doubled_alpha, max_iter=best_lasso.max_iter)
lasso_double_alpha.fit(X_train_scaled, y_train)

# Ten largest non-zero coefficients by absolute value.
coeff_double_alpha = pd.Series(lasso_double_alpha.coef_, index=X.columns)
coeff_double_alpha[coeff_double_alpha != 0].sort_values(key=abs, ascending=False).head(10)

GrLivArea               19678.502595
OverallQual             17055.415518
MSSubClass              -8441.135494
Neighborhood_NridgHt     8208.404682
GarageCars               7906.268894
BsmtQual_Gd             -7130.585141
Neighborhood_NoRidge     6528.194246
BsmtQual_TA             -6455.951048
PoolQC_Gd               -6187.738324
Neighborhood_StoneBr     5953.616261
dtype: float64