In [1]:
# !pip install mlflow

In [2]:
import mlflow
import mlflow.sklearn
import os
import shutil

# Local directory used as the MLflow file store (relative to the notebook's working directory).
mlflow_folder = '../mlflowData'

# Start every notebook run from an empty store: if the folder is already there,
# delete it together with everything stored underneath it.
stale_store_present = os.path.exists(mlflow_folder)
if stale_store_present:
    shutil.rmtree(mlflow_folder)

# (Re)create the folder; exist_ok keeps this line safe to run on its own.
os.makedirs(mlflow_folder, exist_ok=True)

# Report the resolved absolute location so it is clear where the data lives.
print(f"Clean mlflowData folder ready at: {os.path.abspath(mlflow_folder)}")


Clean mlflowData folder ready at: c:\Users\arkha\jupyter-workspace\medical_insurance_cost_prediction_project\mlflowData


In [3]:
# Build the tracking URI with pathlib instead of gluing 'file:///' onto the path by hand.
# Manual concatenation produces four slashes on POSIX (an absolute path already starts
# with '/') and leaves spaces/special characters unescaped; as_uri() emits a well-formed
# file URI on Windows and POSIX alike. abspath() is kept so '..' is still normalised.
from pathlib import Path

trackingURI = Path(os.path.abspath(mlflow_folder)).as_uri()

# Point MLflow at the local file store created for this notebook.
mlflow.set_tracking_uri(trackingURI)

# Select the experiment that all runs in this notebook are logged under. Kept as the
# cell's last expression so the returned Experiment object is displayed.
mlflow.set_experiment('Medical_Insurance_Cost_Prediction')

2025/08/16 16:25:16 INFO mlflow.tracking.fluent: Experiment with name 'Medical_Insurance_Cost_Prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/arkha/jupyter-workspace/medical_insurance_cost_prediction_project/mlflowData/610449148954122992', creation_time=1755341716070, experiment_id='610449148954122992', last_update_time=1755341716070, lifecycle_stage='active', name='Medical_Insurance_Cost_Prediction', tags={}>

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
import xgboost as xgb

In [5]:
def mlflow_train_and_log(model,modelName, xTrain,xTest,yTrain,yTest, params=None):
    """Fit a model, score it on the test split and record the result as one MLflow run.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    modelName : str
        Used both as the MLflow run name and as the artifact path of the logged model.
    xTrain, xTest : array-like
        Feature matrices for fitting and for evaluation.
    yTrain, yTest : array-like
        Targets matching ``xTrain`` and ``xTest``.
    params : dict, optional
        Hyperparameters to record with the run; skipped when ``None`` or empty.

    Returns
    -------
    None
        Metrics are logged to MLflow and printed, not returned.
    """
    with mlflow.start_run(run_name=modelName):
        # Train, then predict on the held-out features.
        model.fit(xTrain, yTrain)
        predictions = model.predict(xTest)

        # Evaluation metrics on the test split (RMSE is the square root of the MSE).
        rmse = np.sqrt(mean_squared_error(yTest, predictions))
        mae = mean_absolute_error(yTest, predictions)
        r2 = r2_score(yTest, predictions)

        # Hyperparameters are only recorded when the caller supplied some.
        if params:
            mlflow.log_params(params)

        # Log each metric under the key later used to rank the runs.
        for metric_key, metric_value in (('RMSE', rmse), ('MAE', mae), ('R2', r2)):
            mlflow.log_metric(metric_key, metric_value)

        # Store the fitted estimator as a run artifact named after the model.
        mlflow.sklearn.log_model(model, modelName)

        # Echo a short summary into the notebook output.
        print(f"Model: {modelName}")
        print(f"RMSE: {rmse:.2f}")
        print(f"MAE: {mae:.2f}")
        print(f"R²: {r2:.2f}\n")
        

In [6]:
# Load the dataset from the project's data folder (path is relative to the notebook's
# working directory).
df = pd.read_csv(filepath_or_buffer='../data/cleaned_data.csv')

# Preview the first five rows; shown because it is the cell's last expression.
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest,bmi_category_normal,bmi_category_overweight,bmi_category_underweight,smoker_bmi
0,19,0,27.9,0,1,16884.924,0,0,1,0,1,0,27.9
1,18,1,33.77,1,0,1725.5523,0,1,0,0,0,0,0.0
2,28,1,33.0,3,0,4449.462,0,1,0,0,0,0,0.0
3,33,1,22.705,0,0,21984.47061,1,0,0,1,0,0,0.0
4,32,1,28.88,0,0,3866.8552,1,0,0,0,1,0,0.0


In [7]:
# Separate the regression target ('charges') from the predictor columns.
y = df['charges']
x = df.drop(columns='charges')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.2, random_state=42)

# Display the shapes of the four pieces as the cell output.
xTrain.shape, xTest.shape, yTrain.shape, yTest.shape

((1069, 12), (268, 12), (1069,), (268,))

In [8]:
# Standardise the features. The scaler is fitted on the training split only and the
# same fitted statistics are then applied to both splits, so nothing from the test
# rows influences the scaling.
scaler = StandardScaler().fit(xTrain)

xTrainScaled = scaler.transform(xTrain)
xTestScaled = scaler.transform(xTest)

In [9]:

# Degree-2 polynomial expansion of the scaled features, fitted on the training split
# and applied to both splits.
# NOTE(review): xTrainPoly / xTestPoly are not referenced by the other cells visible in
# this notebook (the polynomial model is trained through its own Pipeline) -- confirm
# whether they are still needed.
poly = PolynomialFeatures(degree=2).fit(xTrainScaled)

xTrainPoly = poly.transform(xTrainScaled)
xTestPoly = poly.transform(xTestScaled)

In [10]:
# import joblib
# joblib.dump(scaler, "../models/scaler.pkl")
# joblib.dump(poly, "../models/poly_features.pkl")

In [11]:
# Train every candidate model and log each one as its own MLflow run.
#
# Each hyperparameter dict is written once and used both to construct the estimator and
# as the `params` logged to MLflow, so the recorded values cannot drift away from what
# was actually trained.
import joblib

# All models except the polynomial pipeline consume the pre-scaled feature matrices.
scaled_split = (xTrainScaled, xTestScaled, yTrain, yTest)

mlflow_train_and_log(LinearRegression(), "Linear Regression", *scaled_split)

ridge_params = {"alpha": 1.0}
mlflow_train_and_log(Ridge(**ridge_params), "Ridge Regression",
                     *scaled_split, params=ridge_params)

lasso_params = {"alpha": 0.1}
mlflow_train_and_log(Lasso(**lasso_params), "Lasso Regression",
                     *scaled_split, params=lasso_params)

# Polynomial Regression
# The pipeline carries its own scaler, so it is trained on the *unscaled* split; the
# saved artifact can then be applied directly to raw feature rows.
poly_params = {"degree": 2}
poly_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(**poly_params)),
    ('model', LinearRegression())
])

mlflow_train_and_log(poly_pipeline, "Polynomial Regression (deg=2)",
                     xTrain, xTest, yTrain, yTest,
                     params=poly_params)

# Persist the fitted pipeline next to the project for the Streamlit app. Create the
# target folder first: joblib.dump does not create missing directories and would raise
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.join("..", "models"), exist_ok=True)
joblib.dump(poly_pipeline, "../models/PolynomialRegressionPipeline.pkl")
print("Pipeline saved locally for Streamlit!")

knn_params = {"n_neighbors": 5}
mlflow_train_and_log(KNeighborsRegressor(**knn_params), "KNN Regressor",
                     *scaled_split, params=knn_params)

# random_state fixes the seed for reproducibility; it is deliberately kept out of the
# logged params so the recorded keys stay the same as before.
rf_params = {"n_estimators": 100}
mlflow_train_and_log(RandomForestRegressor(**rf_params, random_state=42),
                     "Random Forest", *scaled_split, params=rf_params)

xgb_params = {"n_estimators": 100, "learning_rate": 0.1, "max_depth": 3}
mlflow_train_and_log(xgb.XGBRegressor(**xgb_params, random_state=42),
                     "XGBoost", *scaled_split, params=xgb_params)

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Model: Linear Regression
RMSE: 4560.55
MAE: 2813.07
R²: 0.89





Model: Ridge Regression
RMSE: 4553.15
MAE: 2810.58
R²: 0.89





Model: Lasso Regression
RMSE: 4560.45
MAE: 2813.02
R²: 0.89





Model: Polynomial Regression (deg=2)
RMSE: 4193.11
MAE: 2406.38
R²: 0.90

Pipeline saved locally for Streamlit!




Model: KNN Regressor
RMSE: 5126.23
MAE: 3170.60
R²: 0.86





Model: Random Forest
RMSE: 4729.86
MAE: 2678.06
R²: 0.88





Model: XGBoost
RMSE: 4278.53
MAE: 2470.45
R²: 0.90



In [12]:
from mlflow.tracking import MlflowClient
import shutil

# Pick the best run of the experiment, register its model and export a local copy.

# Metric used to rank the runs. The search below sorts DESCENDING, i.e. it assumes
# "higher is better" (true for R2); switch the direction for error metrics such as RMSE.
metricToOptimize = 'R2'

experiment = mlflow.get_experiment_by_name('Medical_Insurance_Cost_Prediction')
if experiment is None:
    # get_experiment_by_name returns None instead of raising when the name is unknown.
    raise RuntimeError(
        "Experiment 'Medical_Insurance_Cost_Prediction' was not found; "
        "run the tracking setup and training cells first."
    )
experimentId = experiment.experiment_id

client = MlflowClient()
runs = client.search_runs(
    experiment_ids=[experimentId],
    order_by=[f"metrics.{metricToOptimize} DESC"]
)
if not runs:
    # Without this guard runs[0] fails with an uninformative IndexError.
    raise RuntimeError(
        f"No runs found in experiment {experimentId}; train and log the models first."
    )

bestRun = runs[0]
# mlflow_train_and_log uses the run name as the model's artifact path as well, so the
# run-name tag doubles as the artifact location below.
bestModelName = bestRun.data.tags.get("mlflow.runName")
bestRunId = bestRun.info.run_id
if bestModelName is None:
    raise RuntimeError(
        f"Run {bestRunId} has no 'mlflow.runName' tag, so its model artifact cannot be located."
    )
if metricToOptimize not in bestRun.data.metrics:
    raise RuntimeError(
        f"Run {bestRunId} did not log the metric '{metricToOptimize}'."
    )
bestScore = bestRun.data.metrics[metricToOptimize]

print(f"Best model: {bestModelName}")
print(f"{metricToOptimize}: {bestScore:.4f}")
print(f"Run ID: {bestRunId}")

# Register the winning model under a fixed registry name; a new version is created.
registry_model_name = "MedicalInsuranceCostModel"

modelURI = f"runs:/{bestRunId}/{bestModelName}"
result = mlflow.register_model(model_uri=modelURI, name = registry_model_name)

print(f"Registered as '{registry_model_name}', version: {result.version}")


# Download the logged model directory and copy it into ../models/<model name> so it can
# be used without going through MLflow. copytree creates missing parent folders and
# dirs_exist_ok lets a re-run overwrite a previous export.
model_local_path = client.download_artifacts(run_id=bestRunId, path=bestModelName)
model_dst_path = os.path.join("..", "models", bestModelName)

shutil.copytree(model_local_path, model_dst_path, dirs_exist_ok=True)
print(f"Best model also copied to: {model_dst_path}")

Successfully registered model 'MedicalInsuranceCostModel'.


Best model: Polynomial Regression (deg=2)
R2: 0.9043
Run ID: 19b94e72f6b343258acf235108768628
Registered as 'MedicalInsuranceCostModel', version: 1


Created version '1' of model 'MedicalInsuranceCostModel'.


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Best model also copied to: ..\models\Polynomial Regression (deg=2)
