In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [3]:
# Load dataset (path is relative, so housing.csv is read from the working directory)
df = pd.read_csv("housing.csv")
# sep="\n" puts the preview on its own line. Embedding the newline in the first
# argument instead makes print() insert its default space separator in front of
# the frame, which shifts the first header row out of alignment with the data.
print("Dataset Loaded:", df.head(), sep="\n")

Dataset Loaded:
    longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


In [4]:
# Drop every row that contains at least one missing value (dropna defaults).
# NOTE: this rebinds `df`, so the frame as loaded from the CSV is no longer
# reachable from this point on; re-run the load cell to get it back.
df = df.dropna()

In [5]:
# Sturges' rule for choosing a histogram bin count
def sturges_rule(n):
    """Return the bin count suggested by Sturges' rule, ceil(1 + log2(n)).

    Parameters
    ----------
    n : int
        Number of observations; must be at least 1.

    Returns
    -------
    int
        Suggested number of bins (1 when n == 1).

    Raises
    ------
    ValueError
        If n is less than 1. Without this guard log2 yields -inf or NaN and
        the int() conversion fails with an unrelated, hard-to-read error.
    """
    if n < 1:
        raise ValueError(f"sturges_rule requires n >= 1, got {n}")
    return int(np.ceil(1 + np.log2(n)))

In [6]:
# Build the stratification key: cut the target into equal-width bins, with the
# bin count taken from Sturges' rule. The split itself happens in a later cell.
house_values = df["median_house_value"]
num_bins = sturges_rule(len(house_values))
df["price_bin"] = pd.cut(house_values, bins=num_bins, labels=False)

In [7]:
# Separate the regression target from the predictors
y = df["median_house_value"]
# price_bin is derived from the target and only serves as the stratification
# key, so it is removed from the feature matrix together with the target.
X = df.drop(["median_house_value", "price_bin"], axis="columns")

In [8]:
# One-hot encode categorical feature 'ocean_proximity'.
# No `columns=` is passed, so get_dummies encodes every object/category column
# it finds; presumably 'ocean_proximity' is the only one, going by the preview
# printed after loading. drop_first=True drops one level per encoded column so
# the remaining indicator columns are not perfectly collinear.
X = pd.get_dummies(X, drop_first=True)

In [9]:
# Stratified splitting: hold out 20% of the rows for validation while keeping
# the proportion of each price_bin the same in both parts.
strata = df["price_bin"]
split_parts = train_test_split(X, y, test_size=0.2, stratify=strata, random_state=42)
X_train, X_val, y_train, y_val = split_parts
print("Training and validation sets created using stratified split.")

Training and validation sets created using stratified split.


In [10]:
# Feature scaling: the scaler is fitted on the training rows only and then
# applied to both sets, so validation data does not influence the statistics.
# NOTE: X_train and X_val are overwritten here, so re-running this cell without
# re-running the split cell would fit the scaler on already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [11]:
# Fit the Ridge and Lasso regressors on the scaled training data.
# fit() returns the estimator itself, so construction and fitting are chained.
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
print("Models trained successfully.")

Models trained successfully.


In [12]:
# Predict on the scaled validation set with each fitted model, Ridge first
y_pred_ridge, y_pred_lasso = [estimator.predict(X_val) for estimator in (ridge, lasso)]

In [13]:
# Regression metric helper
def evaluate_model(y_true, y_pred, model_name):
    """Print and return MAE, MSE and RMSE for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    model_name : str
        Label used in the heading of the printed report.

    Returns
    -------
    tuple
        ``(mae, mse, rmse)`` in that order.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above

    report_lines = (
        f"{model_name} Evaluation:",
        f"MAE: {mae:.2f}",
        f"MSE: {mse:.2f}",
        f"RMSE: {rmse:.2f}\n",
    )
    # One print call; sep="\n" emits each entry on its own line
    print(*report_lines, sep="\n")
    return mae, mse, rmse

# Score both models on the validation set; each result is (mae, mse, rmse).
# The generator is consumed in order, so Ridge is reported before Lasso.
ridge_results, lasso_results = (
    evaluate_model(y_val, predictions, label)
    for predictions, label in (
        (y_pred_ridge, "Ridge Regression"),
        (y_pred_lasso, "Lasso Regression"),
    )
)

Ridge Regression Evaluation:
MAE: 50398.52
MSE: 5000621993.92
RMSE: 70715.08

Lasso Regression Evaluation:
MAE: 50399.37
MSE: 5000821787.55
RMSE: 70716.49



In [14]:
# Persist the fitted scaler and both regressors, one file per object, into the
# current working directory.
artifacts = {
    "scaler.pkl": scaler,
    "ridge_model.pkl": ridge,
    "lasso_model.pkl": lasso,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)
print("Models and scaler saved successfully.")

Models and scaler saved successfully.
