## Housing Price Prediction using Advanced Regression

The solution is divided into the following sections:
- Import libraries and load data
- Data understanding, preparation and EDA (data exploration, data cleaning, data preparation)
- Model building and evaluation

### 1. IMPORT LIBRARIES & LOAD DATA


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

import os

# hide warnings
import warnings
warnings.filterwarnings('ignore')

# Read the training CSV into the working DataFrame `df`
DATA_PATH = 'train.csv'
df = pd.read_csv(DATA_PATH)

### 2. DATA UNDERSTANDING, PREPARATION & EDA

In [4]:
# 2.1 Data Quality Check: Duplicates
# Drop fully duplicated rows and report the frame's shape before and after.

shape_before = df.shape
df = df.drop_duplicates()
print(f"Original shape: {shape_before}")
print(f"Shape after duplicate removal: {df.shape}")

Original shape: (1460, 81)
Shape after duplicate removal: (1460, 81)


In [5]:
# 2.2 Missing Value Imputation
# In these categorical columns NA means 'feature not present' (e.g., No Pool),
# so every missing entry is replaced by the explicit level 'None'.
none_cols = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType',
]
# Fill all listed columns in a single vectorised call
df[none_cols] = df[none_cols].fillna('None')

In [6]:
# Numeric columns where NA implies 0 (e.g., 0 area)
zero_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars',
             'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
             'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea']
for col in zero_cols:
    df[col] = df[col].fillna(0)

# LotFrontage: impute with the median of the row's Neighborhood
# (better accuracy than a single global median).
# A neighborhood whose LotFrontage values are ALL missing has a NaN median, so
# those rows fall back to the global median; otherwise NaNs would survive this
# step and reach the model.
neighborhood_median = df.groupby('Neighborhood')['LotFrontage'].transform('median')
df['LotFrontage'] = (
    df['LotFrontage']
    .fillna(neighborhood_median)
    .fillna(df['LotFrontage'].median())
)

# Electrical: impute with the mode (most frequent value)
df['Electrical'] = df['Electrical'].fillna(df['Electrical'].mode()[0])

In [7]:
# 2.3 Feature Engineering (Rubric: New metrics derived)
# New columns, appended in this order:
#   HouseAge = YrSold - YearBuilt
#   RemodAge = YrSold - YearRemodAdd
#   TotalSF  = TotalBsmtSF + 1stFlrSF + 2ndFlrSF (basement, 1st and 2nd floors);
#              this often becomes a stronger predictor than individual floor areas
df = df.assign(
    HouseAge=df['YrSold'] - df['YearBuilt'],
    RemodAge=df['YrSold'] - df['YearRemodAdd'],
    TotalSF=df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF'],
)

# Remove Id and the year columns the age features were derived from,
# to reduce multicollinearity/noise
df = df.drop(columns=['Id', 'YearBuilt', 'YearRemodAdd', 'YrSold'])

In [9]:
# 2.4 Data Cleaning & Conversion
# MSSubClass is nominal data stored as numbers; cast it to string so it is
# one-hot encoded together with the other categorical columns.
df = df.astype({'MSSubClass': str})

# One-hot encode; drop_first=True drops one level per feature to prevent the
# dummy variable trap.
df_encoded = pd.get_dummies(data=df, drop_first=True)

In [10]:
# Split into X (Predictors) and y (Target)
X = df_encoded.drop('SalePrice', axis=1)
y = df_encoded['SalePrice'] # We can also log-transform y here for better normality

# Train-Test Split (70-30)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features: the scaler is fitted on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()

# Wrap the scaled arrays back into DataFrames for readability.
# The original row index is kept (index=...) so the scaled frames stay aligned
# with y_train / y_test, which still carry the shuffled original index; a
# default RangeIndex would silently misalign any pandas-level join or
# arithmetic between features and target.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X.columns, index=X_test.index
)

### 3. MODEL BUILDING & EVALUATION

In [11]:
# 3.1 Ridge Regression (L2 Regularization)
ridge = Ridge()
params_ridge = {'alpha': [0.1, 1, 5, 10, 20, 50, 100, 200, 500, 1000]}

# Tune alpha by grid search with 5-fold cross-validation, scored by R2
ridge_cv = GridSearchCV(ridge, params_ridge, scoring='r2', cv=5, verbose=1)
ridge_cv.fit(X_train_scaled, y_train)

# Collect the figures to report: chosen alpha and R2 on the held-out test split
ridge_best_alpha = ridge_cv.best_params_['alpha']
ridge_test_pred = ridge_cv.predict(X_test_scaled)
ridge_test_r2 = r2_score(y_test, ridge_test_pred)

print(f"\nBest Alpha for Ridge: {ridge_best_alpha}")
print(f"Best R2 (Train CV): {ridge_cv.best_score_:.4f}")
print(f"Ridge Test R2 Score: {ridge_test_r2:.4f}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits

Best Alpha for Ridge: 500
Best R2 (Train CV): 0.7744
Ridge Test R2 Score: 0.8595


In [12]:
# 3.2 Lasso Regression (L1 Regularization - Feature Selection)
lasso = Lasso(max_iter=10000)
# Higher alphas usually needed for raw prices
params_lasso = {'alpha': [10, 50, 100, 500, 1000, 2000]}

# Tune alpha by grid search with 5-fold cross-validation, scored by R2
lasso_cv = GridSearchCV(lasso, params_lasso, scoring='r2', cv=5, verbose=1)
lasso_cv.fit(X_train_scaled, y_train)

# Collect the figures to report: chosen alpha and R2 on the held-out test split
lasso_best_alpha = lasso_cv.best_params_['alpha']
lasso_test_pred = lasso_cv.predict(X_test_scaled)
lasso_test_r2 = r2_score(y_test, lasso_test_pred)

print(f"\nBest Alpha for Lasso: {lasso_best_alpha}")
print(f"Best R2 (Train CV): {lasso_cv.best_score_:.4f}")
print(f"Lasso Test R2 Score: {lasso_test_r2:.4f}")

Fitting 5 folds for each of 6 candidates, totalling 30 fits

Best Alpha for Lasso: 1000
Best R2 (Train CV): 0.7102
Lasso Test R2 Score: 0.8638


In [13]:
# 3.3 Model Interpretation (Business Aspect)
# Rank predictors by the magnitude of their coefficient in the tuned Lasso model.
lasso_best = lasso_cv.best_estimator_
coefs = pd.Series(data=lasso_best.coef_, index=X.columns)

# Absolute values are used, so the ranking shows strength only; the sign
# (direction of the effect) is not visible in the printed table.
coef_magnitude = coefs.abs()
top_features = coef_magnitude.sort_values(ascending=False).iloc[:10]

print("\n--- Top 10 Significant Variables (Lasso) ---")
print(top_features)


--- Top 10 Significant Variables (Lasso) ---
OverallQual             15678.178093
GrLivArea               14693.698322
PoolQC_Gd               12704.293145
TotalSF                  8844.713229
Neighborhood_NridgHt     7769.423201
GarageCars               6935.142348
PoolArea                 6721.720684
BsmtQual_Gd              6043.946682
Condition2_PosN          5898.280595
Neighborhood_NoRidge     5797.600042
dtype: float64
