<a href="https://colab.research.google.com/github/alanrlive/Elvtr-AI-Capstone/blob/master/notebooks/Assignment_2_Mini_Pipeline_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# California Housing Regression with scikit‑learn & MLflow (Google Colab)

End‑to‑end ML pipeline **ready to run in Colab**:
1. Install deps in the VM
2. Mount Google Drive for persistent MLflow runs
3. Expose the MLflow UI via **ngrok**
4. Train & register a RandomForest model
5. Optional: serve the model behind a public URL


## 0  Install dependencies

In [None]:
!pip -q install --upgrade pip
!pip -q install scikit-learn==1.4.2 mlflow pyngrok pandas numpy matplotlib seaborn
# this will give errors however ok to proceed

Set up Ngrok

In [None]:
import getpass
import os

# Prompt for token securely (won't be displayed in Colab)
ngrok_token = getpass.getpass('Enter your Ngrok auth token: ')

# Set as environment variable
os.environ['NGROK_AUTH_TOKEN'] = ngrok_token

Enter your Ngrok auth token: ··········


In [None]:
print(os.environ['NGROK_AUTH_TOKEN'][:5] + "..." + os.environ['NGROK_AUTH_TOKEN'][-5:])

30Jpf...oPdEt


Install & Import Dependencies

In [None]:
# Core imports
import os
import numpy as np
import pandas as pd

# Scikit‑learn
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# MLflow
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

# Ngrok & async runner
import nest_asyncio
from pyngrok import ngrok


In [None]:
# Configure MLflow to use Google Drive for tracking
import mlflow

mlflow.set_tracking_uri("file:///content/drive/MyDrive/mlruns")
mlflow.set_experiment("DS-Method-California-Housing")

print("MLflow setup complete. Tracking URI:", mlflow.get_tracking_uri())

MLflow setup complete. Tracking URI: file:///content/drive/MyDrive/mlruns


Configure MLflow Tracking & Experiment

In [None]:
# Tell MLflow to use Drive folder
mlflow.set_tracking_uri("file:///content/drive/MyDrive/mlruns")
mlflow.set_experiment("DS-Method-California-Housing")


<Experiment: artifact_location='file:///content/drive/MyDrive/mlruns/597534941026899228', creation_time=1753458650079, experiment_id='597534941026899228', last_update_time=1753458650079, lifecycle_stage='active', name='DS-Method-California-Housing', tags={}>

Launch MLflow UI via Ngrok

In [None]:
# Launch MLflow UI via ngrok, pointing at Drive-backed store
import nest_asyncio
from pyngrok import ngrok
ngrok.set_auth_token(os.environ['NGROK_AUTH_TOKEN'])
nest_asyncio.apply()

get_ipython().system_raw(
    'mlflow ui '
    '--backend-store-uri file:///content/drive/MyDrive/mlruns '
    '--default-artifact-root file:///content/drive/MyDrive/mlruns '
    '--host 0.0.0.0 --port 5000 &'
)

public_url = ngrok.connect(5000, bind_tls=True)
print("MLflow UI is available at:", public_url)

MLflow UI is available at: NgrokTunnel: "https://7277270569a8.ngrok-free.app" -> "http://localhost:5000"


Load & Inspect the Data

In [None]:
# Load dataset
data = fetch_california_housing(as_frame=True)
df = data.frame

# Quick inspect
df.head(), df.describe()


(   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
 0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
 1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
 2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
 3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
 4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
 
    Longitude  MedHouseVal  
 0    -122.23        4.526  
 1    -122.22        3.585  
 2    -122.24        3.521  
 3    -122.25        3.413  
 4    -122.25        3.422  ,
              MedInc      HouseAge      AveRooms     AveBedrms    Population  \
 count  20640.000000  20640.000000  20640.000000  20640.000000  20640.000000   
 mean       3.870671     28.639486      5.429000      1.096675   1425.476744   
 std        1.899822     12.585558      2.474173      0.473911   1132.462122   
 min        0.499900      1.000000      0

Data Cleansing

In [None]:
# 1. Check for missing values
print("Missing values in each column:\n", df.isnull().sum())

# Optional: Drop duplicates if any
df.drop_duplicates(inplace=True)

# 2. Log transform skewed features
df['Population'] = np.log1p(df['Population'])  # log(1 + x)
df['AveOccup'] = np.log1p(df['AveOccup'])

# 3. Basic sanity check – no negative values?
if (df < 0).any().any():
    print("Warning: Negative values found")
else:
    print("All values are non-negative.")

Missing values in each column:
 MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64


In [None]:
# Show number of negative values per column
negative_counts = (df < 0).sum()
print("Negative values by column:\n", negative_counts)

# preview some rows
for col in df.columns:
    if (df[col] < 0).any():
        print(f"\nNegative values in column: {col}")
        display(df[df[col] < 0].head())

Negative values by column:
 MedInc             0
HouseAge           0
AveRooms           0
AveBedrms          0
Population         0
AveOccup           0
Latitude           0
Longitude      20640
MedHouseVal        0
dtype: int64

Negative values in column: Longitude


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,5.777652,1.268511,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,7.784057,1.134572,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,6.20859,1.335596,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,6.326149,1.266369,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,6.338594,1.157342,37.85,-122.25,3.422


In [None]:
# Add log-transformed versions
df['Log_AveRooms'] = np.log1p(df['AveRooms'])
df['Log_AveBedrms'] = np.log1p(df['AveBedrms'])
df['Log_Population'] = np.log1p(df['Population'])
df['Log_AveOccup'] = np.log1p(df['AveOccup'])

 Feature Engineering & Split

In [None]:
# Features & target
features = ['MedInc','HouseAge','Latitude','Longitude',
            'Log_AveRooms','Log_AveBedrms','Log_Population','Log_AveOccup']
target   = 'MedHouseVal'

X = df[features]
y = df[target]

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)


Evaluate & Log to MLflow
Compute RMSE/R², log params, metrics, model, and signature.

In [None]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model',  RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train
pipeline.fit(X_train, y_train)
print("Training complete.")




Training complete.


In [None]:
# Cell 9 – Evaluate & Log to MLflow
from mlflow.models.signature import infer_signature

# Predict & eval
y_pred = pipeline.predict(X_test)
rmse   = mean_squared_error(y_test, y_pred, squared=False)
r2     = r2_score(y_test, y_pred)

# Log in MLflow and capture run_id
with mlflow.start_run() as run:
    mlflow.log_param("model_type", "RandomForestRegressor")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    signature = infer_signature(X_test, y_pred)
    mlflow.sklearn.log_model(
        pipeline,
        artifact_path="model",
        signature=signature,
        input_example=X_test.iloc[:5]
    )

    run_id = run.info.run_id  # ← capture it here


print(f"Logged run → RMSE: {rmse:.4f}, R²: {r2:.4f}")
print("Loaded run_id:", run_id)



Logged run → RMSE: 0.5060, R²: 0.8046
Loaded run_id: 8eef7591ef204221b1b8a4689bfb5d49


In [None]:
from mlflow.tracking import MlflowClient

client       = MlflowClient()
exp         = client.get_experiment_by_name("DS-Method-California-Housing")
exp_id      = exp.experiment_id
runs        = client.search_runs(experiment_ids=[exp_id],
                                order_by=["attributes.start_time DESC"],
                                max_results=1)
latest_run  = runs[0]
run_id      = latest_run.info.run_id

model_uri   = f"runs:/{run_id}/model"
loaded_pipe = mlflow.sklearn.load_model(model_uri)
print("Loaded run_id:", run_id)
print("Sample preds:", loaded_pipe.predict(X_test.iloc[:3]))


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Loaded run_id: 8eef7591ef204221b1b8a4689bfb5d49
Sample preds: [0.50784  0.74033  4.904767]


Build & Train the Pipeline

(Optional) Mock Deployment Snippet

In [None]:
# Load the logged model for inference
model_uri = f"runs:/{run_id}/model"
loaded_pipeline = mlflow.sklearn.load_model(model_uri)

# Sample prediction
print("Sample prediction:", loaded_pipeline.predict(X_test.iloc[:3]))


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Sample prediction: [0.50784  0.74033  4.904767]
