### Experiments: Housing Price Estimation

### Import packages

In [1]:
import pandas as pd
import mlflow
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

### Data ingestion

In [2]:
# Load the raw housing dataset from the data directory next to this notebook's folder.
df = pd.read_csv(filepath_or_buffer="../data/Housing.csv")

In [3]:
# Preview the first rows of the raw frame to sanity-check the load.
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7229300521,20141013T000000,231300.0,2,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [4]:
# (rows, columns) of the raw frame.
df.shape

(21613, 21)

In [18]:
# Per-column dtype and non-null count, used to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

### Select useful features

In [67]:

# Predictor columns fed to the model. `id`, `date` and the target `price`
# are not part of this list.
selected_columns = [
    "grade",
    "zipcode",
    "lat",
    "long",
    "sqft_living",
    "sqft_living15",
    "sqft_lot",
    "sqft_lot15",
    "sqft_above",
    "sqft_basement",
    "bedrooms",
    "bathrooms",
    "floors",
    "waterfront",
    "view",
    "condition",
    "yr_built",
    "yr_renovated",
]

# Feature matrix: the raw frame restricted to the chosen predictors,
# in the order listed above.
X = df[selected_columns]

In [68]:
# Preview the feature matrix to confirm only the selected columns remain.
X.head()

Unnamed: 0,grade,zipcode,lat,long,sqft_living,sqft_living15,sqft_lot,sqft_lot15,sqft_above,sqft_basement,bedrooms,bathrooms,floors,waterfront,view,condition,yr_built,yr_renovated
0,7,98178,47.5112,-122.257,1180,1340,5650,5650,1180,0,2,1.0,1.0,0,0,3,1955,0
1,7,98125,47.721,-122.319,2570,1690,7242,7639,2170,400,3,2.25,2.0,0,0,3,1951,1991
2,6,98028,47.7379,-122.233,770,2720,10000,8062,770,0,2,1.0,1.0,0,0,3,1933,0
3,7,98136,47.5208,-122.393,1960,1360,5000,5000,1050,910,4,3.0,1.0,0,0,5,1965,0
4,8,98074,47.6168,-122.045,1680,1800,8080,7503,1680,0,3,2.0,1.0,0,0,3,1987,0


### Select target 

In [69]:
# Regression target: the `price` column of the raw frame.
Y=df["price"]

In [70]:
# Preview the first target values.
Y.head()

0    231300.0
1    538000.0
2    180000.0
3    604000.0
4    510000.0
Name: price, dtype: float64

### Train-Test split

In [71]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [72]:
# Report the (rows, columns) shape of the train and test feature partitions.
print(*(partition.shape for partition in (x_train, x_test)))

(17290, 18) (4323, 18)


### Standard Scaling

In [73]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training partition only, then apply that same fitted
# transformation to both partitions so the test data never influences the
# scaling statistics. Note that both names are rebound to the scaled arrays.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

### Linear Regression Model training and evaluation

In [74]:
import re

import mlflow
from mlflow.tracking import MlflowClient
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# ===== Set MLflow tracking URI =====
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# ===== Single experiment for all models =====
experiment_name = "Housing_Price_Experiments"
# set_experiment activates the experiment (creating it if needed) and returns
# it, so no second lookup by name is required.
experiment = mlflow.set_experiment(experiment_name)
client = MlflowClient()
experiment_id = experiment.experiment_id

# ===== Determine next run number for Linear Regression =====
model_type = "linear_regression"

# search_runs is paginated; walk every page so the numbering sees all runs
# tagged with this model type rather than only the first page.
runs = []
page_token = None
while True:
    page = client.search_runs(
        [experiment_id],
        filter_string=f"tags.model_type='{model_type}'",
        page_token=page_token,
    )
    runs.extend(page)
    page_token = page.token
    if not page_token:
        break

# Use the highest number already present in a run name instead of the bare
# run count: after a run is deleted or renamed, `len(runs) + 1` would hand
# out a number that an existing run already carries.
run_name_pattern = re.compile(rf"run(\d+)_{re.escape(model_type)}")
used_ids = [
    int(match.group(1))
    for match in (run_name_pattern.fullmatch(r.info.run_name or "") for r in runs)
    if match
]
next_id = max(used_ids + [len(runs)]) + 1

# ===== Model training =====
params = {"fit_intercept": True}
model = LinearRegression(**params)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# ===== Evaluation =====
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

metrics = {"rmse": rmse, "r2_score": r2}

print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

# ===== MLflow Logging & Register Model =====
run_name = f"run{next_id}_{model_type}"
artifact_path = f"{model_type}_{next_id}"
model_registry_name = "linear_regression_model"

with mlflow.start_run(run_name=run_name) as run:
    # The tag is what the run-number lookup above filters on.
    mlflow.set_tag("model_type", model_type)
    mlflow.log_params(params)
    mlflow.log_param("selected_columns", selected_columns)
    mlflow.log_metrics(metrics)

    # NOTE(review): x_train/x_test were standardized by `scaler` in the
    # previous cell, but only `model` is logged here. Consumers of the
    # registered model must apply the same scaling themselves — consider
    # logging a Pipeline(scaler + model) instead.
    mlflow.sklearn.log_model(
        sk_model=model,
        name=artifact_path,
        registered_model_name=model_registry_name,
        input_example=x_test[:5]
    )

print(f"✅ {run_name} completed. Run ID: {run.info.run_id}, Artifact path: {artifact_path}")
print(f"✅ Model registered as '{model_registry_name}' in MLflow Model Registry")


RMSE: 212539.4719
R² Score: 0.7012


Registered model 'linear_regression_model' already exists. Creating a new version of this model...
2025/10/05 21:33:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: linear_regression_model, version 9


🏃 View run run8_linear_regression at: http://127.0.0.1:5000/#/experiments/306186249389733772/runs/c49c6a13524741cab0b8c1d49288faf6
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/306186249389733772
✅ run8_linear_regression completed. Run ID: c49c6a13524741cab0b8c1d49288faf6, Artifact path: linear_regression_8
✅ Model registered as 'linear_regression_model' in MLflow Model Registry


Created version '9' of model 'linear_regression_model'.


## Find best model

In [77]:
from mlflow.tracking import MlflowClient

# Relies on the tracking URI configured by the training cell earlier in the
# notebook (mlflow.set_tracking_uri).
client = MlflowClient()
experiment_name = "Housing_Price_Experiments"
experiment = client.get_experiment_by_name(experiment_name)
if experiment is None:
    # get_experiment_by_name returns None for an unknown name; fail with a
    # clear message instead of an AttributeError on the next line.
    raise RuntimeError(
        f"MLflow experiment '{experiment_name}' was not found; "
        "run a training cell first or check the tracking URI."
    )
experiment_id = experiment.experiment_id

# Get all runs, best r2_score first (sorting is done by the tracking server)
runs = client.search_runs([experiment_id], order_by=["metrics.r2_score DESC"])

# Print summary
for r in runs:
    print(f"Run ID: {r.info.run_id}")
    print(f"Run Name: {r.info.run_name}")
    print(f"Model Type: {r.data.tags.get('model_type')}")
    print(f"r2_score: {r.data.metrics.get('r2_score')}")
    print(f"rmse: {r.data.metrics.get('rmse')}")
    print("------")

# Extracting the best run
# Only runs that actually logged r2_score are candidates, so a run without
# the metric can never be reported as "best".
scored_runs = [r for r in runs if r.data.metrics.get("r2_score") is not None]
if not scored_runs:
    raise RuntimeError(
        f"No runs with an 'r2_score' metric found in experiment '{experiment_name}'."
    )

best_run = scored_runs[0]
best_model_type = best_run.data.tags.get("model_type")
best_run_id = best_run.info.run_id
best_run_name = best_run.info.run_name
best_r2_score = best_run.data.metrics.get("r2_score")

print(f"Best Model: {best_model_type}")
print(f"Run ID: {best_run_id}")
print(f"Run Name: {best_run_name}")
print(f"r2_score: {best_r2_score}")


Run ID: c49c6a13524741cab0b8c1d49288faf6
Run Name: run8_linear_regression
Model Type: linear_regression
r2_score: 0.7011905706891426
rmse: 212539.47189772307
------
Run ID: d99c5220330e4877b7c5fd2a59ff5fd7
Run Name: run7_linear_regression
Model Type: linear_regression
r2_score: 0.7009344347424026
rmse: 212630.54555938562
------
Run ID: c664998ebdd64f658507e4f11aaf3254
Run Name: run5_linear_regression
Model Type: linear_regression
r2_score: 0.6954760300708114
rmse: 214562.18813505
------
Run ID: 06a80da2bc0446419d75c96dc6d4298b
Run Name: run4_linear_regression
Model Type: linear_regression
r2_score: 0.6721259411184505
rmse: 222636.30040050446
------
Run ID: fa9560f99f7548f3b1be58c81d0f267b
Run Name: run6_linear_regression
Model Type: linear_regression
r2_score: 0.6541875978737806
rmse: 228645.537920791
------
Run ID: e6dd9ef75a434537b0cd7d50d67254d1
Run Name: run3_linear_regression
Model Type: linear_regression
r2_score: 0.6240433333683575
rmse: 238402.79531167154
------
Run ID: a96f676