# Capstone Project

In [1]:
!pip show pandas
!pip install pyarrow
!pip install mlflow

Name: pandas
Version: 1.4.2
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: The Pandas Development Team
Author-email: pandas-dev@python.org
License: BSD-3-Clause
Location: /home/azureuser/anaconda3/lib/python3.9/site-packages
Requires: python-dateutil, pytz, numpy
Required-by: xarray, statsmodels, seaborn, mlflow, hvplot, holoviews, evidently, datashader


In [2]:
from sklearn.model_selection import train_test_split


In [1]:
import pandas as pd

# Auto MPG dataset hosted by the UCI repository. Column names are supplied
# explicitly, fields are separated by runs of whitespace, and '?' marks a
# missing value.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
column_names = [
    "mpg", "cylinders", "displacement", "horsepower", "weight",
    "acceleration", "model_year", "origin", "car_name",
]

# sep=r"\s+" is the documented equivalent of delim_whitespace=True; the latter
# is deprecated in newer pandas releases, while this spelling works on both.
df = pd.read_csv(url, sep=r"\s+", names=column_names, na_values="?")


In [2]:
# Display the freshly loaded frame to eyeball columns and sample rows.
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger


# Day 2

In [5]:
# Re-display the frame at the start of the Day 2 section.
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger


In [6]:
# Per-column count of missing values before any imputation.
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

In [7]:
# Replace missing horsepower values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['horsepower']: that form mutates an intermediate
# selection and does not update df under pandas copy-on-write.
# NOTE(review): the mean is computed over the whole dataset, before the
# train/test split further down, so test rows influence the imputed value.
# Consider computing it from the training split only.
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].mean())

In [8]:
# Re-check the per-column missing-value counts after imputing horsepower.
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

In [9]:
# List the column labels available for feature/target selection.
df.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model_year', 'origin', 'car_name'],
      dtype='object')

In [10]:
import numpy as np

In [11]:
# Discard any row whose target (mpg) is missing.
df = df.dropna(subset=["mpg"])

# Target vector, and a feature matrix without the target or the car name.
y = df["mpg"]
X = df.drop(["mpg", "car_name"], axis=1)

# Partition the feature columns by dtype: numeric versus everything else.
num_cols = list(X.select_dtypes(include=np.number).columns)
cat_cols = list(X.select_dtypes(exclude=np.number).columns)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Sanity-check the (rows, columns) of the training and test feature matrices.
print(*(split.shape for split in (X_train, X_test)))

(318, 7) (80, 7)


In [13]:
# Plain-text dump of the target series (mpg) as a quick sanity check.
print(y)

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 398, dtype: float64


In [14]:
# Linear Regression
import mlflow
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from mlflow.models import infer_signature
import numpy as np

# Point MLflow at the remote tracking server and select the experiment that
# the runs in this notebook are grouped under.
mlflow.set_tracking_uri("http://4.227.222.56:5000")
mlflow.set_experiment("MpgEstimationExperiment")

with mlflow.start_run(run_name="LinearRegression"):
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Error metrics on the held-out test split.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mae = mean_absolute_error(y_test, predictions)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_param("model_type", "Linear Regression")

    # The signature records the input schema and prediction type.
    signature = infer_signature(X_train, model.predict(X_train))
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        # A few rows are enough to illustrate valid input; passing X_train
        # itself would store the entire training split with the model.
        input_example=X_train.head(5),
        registered_model_name="MpgEstimator"
    )

2025/08/11 10:29:30 INFO mlflow.tracking.fluent: Experiment with name 'MpgEstimationExperiment' does not exist. Creating a new experiment.
Successfully registered model 'MpgEstimator'.
2025/08/11 10:29:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MpgEstimator, version 1


🏃 View run LinearRegression at: http://4.227.222.56:5000/#/experiments/2/runs/936a27e8d47f430da4ba09d53bca7d6f
🧪 View experiment at: http://4.227.222.56:5000/#/experiments/2


Created version '1' of model 'MpgEstimator'.


In [15]:
# Ridge Regression
from sklearn.linear_model import Ridge

with mlflow.start_run(run_name="RidgeRegression"):
    model = Ridge(alpha=1.0)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Error metrics on the held-out test split.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mae = mean_absolute_error(y_test, predictions)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_param("model_type", "Ridge Regression")

    # The signature records the input schema and prediction type.
    signature = infer_signature(X_train, model.predict(X_train))
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        # A few rows are enough to illustrate valid input; passing X_train
        # itself would store the entire training split with the model.
        input_example=X_train.head(5),
        registered_model_name="MpgEstimator"
    )

Registered model 'MpgEstimator' already exists. Creating a new version of this model...
2025/08/11 10:31:07 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MpgEstimator, version 2


🏃 View run RidgeRegression at: http://4.227.222.56:5000/#/experiments/2/runs/76493c06fe92414b98941c07409343b8
🧪 View experiment at: http://4.227.222.56:5000/#/experiments/2


Created version '2' of model 'MpgEstimator'.


In [16]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor

with mlflow.start_run(run_name="RandomForest"):
    # Fixed random_state keeps the fitted forest reproducible.
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Error metrics on the held-out test split.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mae = mean_absolute_error(y_test, predictions)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_param("model_type", "Random Forest")

    # The signature records the input schema and prediction type.
    signature = infer_signature(X_train, model.predict(X_train))
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        # A few rows are enough to illustrate valid input; passing X_train
        # itself would store the entire training split with the model.
        input_example=X_train.head(5),
        registered_model_name="MpgEstimator"
    )

Registered model 'MpgEstimator' already exists. Creating a new version of this model...
2025/08/11 10:31:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MpgEstimator, version 3


🏃 View run RandomForest at: http://4.227.222.56:5000/#/experiments/2/runs/a0d91a3154d1477f97ce16cc9991d858
🧪 View experiment at: http://4.227.222.56:5000/#/experiments/2


Created version '3' of model 'MpgEstimator'.


In [17]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor

with mlflow.start_run(run_name="GradientBoosting"):
    # Fixed random_state keeps the fitted ensemble reproducible.
    model = GradientBoostingRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Error metrics on the held-out test split.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mae = mean_absolute_error(y_test, predictions)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_param("model_type", "Gradient Boosting")

    # The signature records the input schema and prediction type.
    signature = infer_signature(X_train, model.predict(X_train))
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        # A few rows are enough to illustrate valid input; passing X_train
        # itself would store the entire training split with the model.
        input_example=X_train.head(5),
        registered_model_name="MpgEstimator"
    )

Registered model 'MpgEstimator' already exists. Creating a new version of this model...
2025/08/11 10:31:42 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MpgEstimator, version 4


🏃 View run GradientBoosting at: http://4.227.222.56:5000/#/experiments/2/runs/d81a793cf247423284bff71f809407f7
🧪 View experiment at: http://4.227.222.56:5000/#/experiments/2


Created version '4' of model 'MpgEstimator'.


In [20]:
# XGBoost
!pip install xgboost lightgbm
from xgboost import XGBRegressor

with mlflow.start_run(run_name="XGBoost"):
    model = XGBRegressor(n_estimators=200, random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    mae = mean_absolute_error(y_test, predictions)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_param("model_type", "XGBoost")

    signature = infer_signature(X_train, model.predict(X_train))
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        signature=signature,
        input_example=X_train,
        registered_model_name="MpgEstimator"
    )

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-manylinux_2_28_x86_64.whl (223.6 MB)
[K     |████████████████████████████████| 223.6 MB 4.2 kB/s s eta 0:00:01
[?25hCollecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 68.9 MB/s eta 0:00:01
[?25hCollecting nvidia-nccl-cu12
  Downloading nvidia_nccl_cu12-2.27.7-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (322.5 MB)
[K     |███████████████████████████▊    | 279.8 MB 98.6 MB/s eta 0:00:01

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[K     |████████████████████████████████| 322.5 MB 21 kB/s 
Installing collected packages: nvidia-nccl-cu12, xgboost, lightgbm
Successfully installed lightgbm-4.6.0 nvidia-nccl-cu12-2.27.7 xgboost-2.1.4


Registered model 'MpgEstimator' already exists. Creating a new version of this model...
2025/08/11 10:36:00 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MpgEstimator, version 5


🏃 View run XGBoost at: http://4.227.222.56:5000/#/experiments/2/runs/e7b5919039124025b95c98065e13a43a
🧪 View experiment at: http://4.227.222.56:5000/#/experiments/2


Created version '5' of model 'MpgEstimator'.


In [1]:
import mlflow
import pickle

# Registry entry to export, addressed as models:/<name>/<version>.
reg_model_name = "MpgEstimator"
model_version = 3
model_uri = f"models:/{reg_model_name}/{model_version}"

# The model registry is served by the remote MLflow tracking server.
mlflow.set_tracking_uri("http://4.227.222.56:5000")

# Fetch the registered model and materialise it as an estimator object.
loaded_model = mlflow.sklearn.load_model(model_uri)
print(f"Loaded model: {reg_model_name} v{model_version}")

# Serialise the estimator to a local pickle file.
# NOTE(review): version 3 of MpgEstimator was created by the RandomForest run
# earlier in this notebook, yet the file is named lr_model.bin. Confirm which
# of the version number and the file name is the intended one.
with open("lr_model.bin", "wb") as bin_file:
    pickle.dump(loaded_model, bin_file)

print("Model saved locally as lr_model.bin")


Loaded model: MpgEstimator v3
Model saved locally as lr_model.bin
