In [1]:
!python -V

Python 3.12.1


In [2]:
# Move up one directory so the relative paths used below (data/..., models/...)
# resolve from the directory printed in this cell's output.
%cd ..

/workspaces/mlops-student-performance


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [3]:
import os

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

from joblib import dump

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope

import mlflow

In [4]:
# Seed passed to train() for the train/test split and logged as a parameter
# on every MLflow run created in this notebook.
random_state = 42


# Configuring MLflow

In [5]:
# Host of the MLflow tracking server; port 5000 is hard-coded in the URI below.
TRACKING_SERVER_HOST = "localhost"
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")
# Last expression of the cell, so the notebook displays the returned Experiment.
mlflow.set_experiment("student_performance")


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1727577653830, experiment_id='1', last_update_time=1727577653830, lifecycle_stage='active', name='student_performance', tags={'project': 'Student Performance', 'task': 'Regression'}>

In [6]:
# Attach descriptive tags to the experiment selected in the previous cell.
mlflow.set_experiment_tags(
    {
    "project" : "Student Performance",
    "task": "Regression"
    }
)
# Echo the tracking URI so it is visible which server the runs are sent to.
print(f"tracking URI: {mlflow.get_tracking_uri()}")

tracking URI: http://localhost:5000


# Train Random Forest Regressor

In [7]:
def train(data_path, max_depth, max_features, n_estimators, random_state, split_test_size=0.33):
    """Train a random forest on a hold-out split and record the run in MLflow.

    The CSV at ``data_path`` is loaded, ``Exam_Score`` is used as the target and
    every other column as a feature. The fitted model is evaluated on the test
    split, parameters and metrics are logged to the active MLflow experiment,
    and the model is serialized to ``models/random_regresor.joblib``.

    Args:
        data_path: Path to the CSV file containing features and ``Exam_Score``.
        max_depth: ``max_depth`` forwarded to ``RandomForestRegressor``.
        max_features: ``max_features`` forwarded to ``RandomForestRegressor``.
        n_estimators: ``n_estimators`` forwarded to ``RandomForestRegressor``.
        random_state: Seed used for both the train/test split and the forest,
            so repeated calls with the same arguments give the same model.
        split_test_size: Fraction of rows held out for evaluation.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    # Using the run as a context manager guarantees it is ended even when a
    # step below raises; otherwise the run would stay active and the next
    # mlflow.start_run() call in the notebook would fail.
    with mlflow.start_run() as run:
        mlflow.log_param("train_data_path", data_path)

        students_df = pd.read_csv(data_path)

        X = students_df.drop(columns=['Exam_Score'])
        y = students_df.loc[:, 'Exam_Score']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=split_test_size, shuffle=True, random_state=random_state
        )

        # Seed the forest as well: the seed is logged below, so it must
        # actually drive the model for the logged run to be reproducible.
        rf_regressor = RandomForestRegressor(
            max_depth=max_depth,
            max_features=max_features,
            n_estimators=n_estimators,
            random_state=random_state,
        )
        rf_regressor.fit(X_train, y_train)
        y_pred = rf_regressor.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)

        data_params = {
            'split_test_size': split_test_size,
            'random_state': random_state,
            'x_train_shape': X_train.shape,
            'x_test_shape': X_test.shape,
            'y_train_shape': y_train.shape,
            'y_test_shape': y_test.shape
        }
        mlflow.log_params(data_params)

        # Prefix estimator parameters so they cannot collide with the data parameters.
        ml_params = {
            f"rfr_{param}": value for param, value in rf_regressor.get_params().items()
        }
        mlflow.log_params(ml_params)

        ml_metrics = {'mse': mse, 'mae': mae}
        mlflow.log_metrics(ml_metrics)
        print(f'mse:{mse}, mae:{mae}')

        # Create the output directory on first use instead of failing in open().
        os.makedirs("models", exist_ok=True)
        with open("models/random_regresor.joblib", "wb") as f:
            dump(rf_regressor, f, protocol=5)

        mlflow.set_tag('model', 'random-forest-regressor')

        print(f"Experiment ID: {run.info.experiment_id}")
        print(f"Run ID: {run.info.run_id}")

    return rf_regressor


In [8]:
# Processed dataset; this module-level name is also read by objective() and
# by the final "Save Best Model" cell further down.
data_path = 'data/processed/StudentPerformanceFactors.csv'

# Fixed hyperparameters for a single baseline run before the search below.
max_depth = 8
max_features = 0.75
n_estimators = 200
train(data_path=data_path, max_depth=max_depth, max_features=max_features, n_estimators=n_estimators, random_state=random_state)

2024/10/07 17:15:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run nervous-wolf-105 at: http://localhost:5000/#/experiments/1/runs/d066724eba6d4306a86e866df7fb6089.
2024/10/07 17:15:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


mse:6.329237439903726, mae:1.2972006042550785
Experiment ID: 1
Run ID: d066724eba6d4306a86e866df7fb6089


# Hyperparameter Optimization

In [9]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate

def objective(params):
    """Hyperopt objective: cross-validate one random forest configuration.

    Reads the module-level ``data_path`` and ``random_state``. Each call opens
    its own MLflow run, logs the estimator parameters plus the mean 5-fold
    MSE/MAE, and reports the MSE as the loss to minimize.

    Args:
        params: Keyword arguments for ``RandomForestRegressor`` sampled from
            the search space.

    Returns:
        A dict with ``loss`` (mean cross-validated MSE), ``status``
        (``STATUS_OK``) and the unmodified ``params``.
    """
    with mlflow.start_run() as run:
        mlflow.log_param("train_data_path", data_path)

        students_df = pd.read_csv(data_path)

        X = students_df.drop(columns=['Exam_Score'])
        y = students_df.loc[:, 'Exam_Score']

        # Seed the forest so the loss depends only on the hyperparameters and
        # the logged random_state reflects what was really used. A seed that
        # is explicitly present in `params` takes precedence.
        rf_regressor = RandomForestRegressor(**{'random_state': random_state, **params})

        # Predefined scorers return negated errors (greater is better),
        # which is why the means are negated back below.
        scoring = {
            'mse': 'neg_mean_squared_error',
            'mae': 'neg_mean_absolute_error'
        }
        scores = cross_validate(rf_regressor, X, y, scoring=scoring, cv=5)

        mse = -scores['test_mse'].mean()
        mae = -scores['test_mae'].mean()

        data_params = {
            'random_state': random_state,
        }
        mlflow.log_params(data_params)

        ml_params = {
            f"rfr_{param}": value for param, value in rf_regressor.get_params().items()
        }
        mlflow.log_params(ml_params)

        ml_metrics = {'mse': mse, 'mae': mae}
        mlflow.log_metrics(ml_metrics)
        print(f'mse:{mse}, mae:{mae}')

        mlflow.set_tag('model', 'random-forest-regressor')

        print(f"Experiment ID: {run.info.experiment_id}")
        print(f"Run ID: {run.info.run_id}")

    # The `with` block has already ended the run; no explicit end_run() needed.
    return {'loss': mse, 'status': STATUS_OK, 'params': params}
    


In [10]:
# Hyperopt search space; the keys match RandomForestRegressor keyword
# arguments because objective() unpacks the sampled dict into the constructor.
search_space = {
    # One of three fixed depths, cast to int.
    'max_depth': scope.int(hp.choice('max_depth', [10, 20, 30])),
    'max_features': hp.choice('max_features', ['sqrt','log2']),
    # Sampled between 50 and 300 with a quantization step of 50, cast to int.
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 300, 50)),
}

In [11]:
# Run a short TPE search over `search_space`, keeping every trial's result.
trials = Trials()
best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=5,
    trials=trials,
    # Seed the sampler so re-running the notebook proposes the same trials.
    # NOTE(review): recent hyperopt releases expect a numpy Generator here;
    # confirm against the installed version.
    rstate=np.random.default_rng(random_state),
)

# Take the actual parameter values from the best trial's result dict, as
# returned by objective() under the 'params' key.
best_params = trials.best_trial['result']['params']

  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]

mse:5.604520301839405, mae:1.137925878715522         
Experiment ID: 1                                     
Run ID: b6b0f0cd1f4b48dbaa099a0b0c991c98             
  0%|          | 0/5 [00:07<?, ?trial/s, best loss=?]

2024/10/07 17:15:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run blushing-toad-119 at: http://localhost:5000/#/experiments/1/runs/b6b0f0cd1f4b48dbaa099a0b0c991c98.

2024/10/07 17:15:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



mse:5.986152843299827, mae:1.2534191473194856                                 
Experiment ID: 1                                                              
Run ID: c3342dd507934bc2b2e533cc63ed5aff                                      
 20%|██        | 1/5 [00:11<00:29,  7.47s/trial, best loss: 5.604520301839405]

2024/10/07 17:15:40 INFO mlflow.tracking._tracking_service.client: 🏃 View run indecisive-yak-109 at: http://localhost:5000/#/experiments/1/runs/c3342dd507934bc2b2e533cc63ed5aff.

2024/10/07 17:15:40 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



mse:5.591607948671723, mae:1.1349545414592168                                 
Experiment ID: 1                                                              
Run ID: 81bc0f19c7854ce1a2f9b0a88999572d                                      
 40%|████      | 2/5 [00:18<00:16,  5.58s/trial, best loss: 5.604520301839405]

2024/10/07 17:15:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run fortunate-shrimp-481 at: http://localhost:5000/#/experiments/1/runs/81bc0f19c7854ce1a2f9b0a88999572d.

2024/10/07 17:15:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



mse:5.638042646661946, mae:1.1435553012204531                                 
Experiment ID: 1                                                              
Run ID: 1a847a4d692d4d62b8a72d5abc1c8153                                      
 60%|██████    | 3/5 [00:23<00:12,  6.16s/trial, best loss: 5.591607948671723]

2024/10/07 17:15:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run abundant-duck-571 at: http://localhost:5000/#/experiments/1/runs/1a847a4d692d4d62b8a72d5abc1c8153.

2024/10/07 17:15:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



mse:5.708761416247301, mae:1.1772508881197958                                 
Experiment ID: 1                                                              
Run ID: b3c086aadcd74006b260d0a88cca1352                                      
 80%|████████  | 4/5 [00:25<00:05,  5.84s/trial, best loss: 5.591607948671723]

2024/10/07 17:15:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run big-bat-679 at: http://localhost:5000/#/experiments/1/runs/b3c086aadcd74006b260d0a88cca1352.

2024/10/07 17:15:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



100%|██████████| 5/5 [00:25<00:00,  5.16s/trial, best loss: 5.591607948671723]


# Save Best Model

In [12]:
from mlflow.models.signature import infer_signature

# Despite the ".joblib" suffix this is not a local file: it is passed below as
# the run-relative artifact path for mlflow.sklearn.log_model.
artifact_path = 'models/random_regresor.joblib'


with mlflow.start_run() as run:
    mlflow.log_param("train_data_path", data_path)

    students_df = pd.read_csv(data_path)

    # Cast to float64 so the inferred signature and the input example agree on dtypes.
    X = students_df.drop(columns=['Exam_Score']).astype('float64')
    y = students_df.loc[:, 'Exam_Score'].astype('float64')

    # Seed the final model so the logged random_state is the one really used
    # and the registered artifact can be reproduced. A seed that is already
    # part of best_params takes precedence.
    rf_regressor = RandomForestRegressor(**{'random_state': random_state, **best_params})

    # Predefined scorers return negated errors (greater is better),
    # which is why the means are negated back below.
    scoring = {
        'mse': 'neg_mean_squared_error',
        'mae': 'neg_mean_absolute_error'
    }
    scores = cross_validate(rf_regressor, X, y, scoring=scoring, cv=5)

    mse = -scores['test_mse'].mean()
    mae = -scores['test_mae'].mean()

    data_params = {
        'random_state': random_state,
    }
    mlflow.log_params(data_params)

    ml_params = {
        f"rfr_{param}": value for param, value in rf_regressor.get_params().items()
    }
    mlflow.log_params(ml_params)

    ml_metrics = {'mse': mse, 'mae': mae}
    mlflow.log_metrics(ml_metrics)
    print(f'mse:{mse}, mae:{mae}')

    mlflow.set_tag('model', 'random-forest-regressor')

    # cross_validate fits clones, so the estimator itself is still unfitted:
    # fit it on the full dataset before logging it.
    rf_regressor.fit(X, y)

    # Seeded so the stored input example does not change between re-runs.
    input_example = X.sample(1, random_state=random_state)
    signature = infer_signature(X, y)

    mlflow.sklearn.log_model(rf_regressor, artifact_path=artifact_path, signature=signature, input_example=input_example)
    artifact_uri = mlflow.get_artifact_uri()
    print(f'artifact_uri: {artifact_uri}')

    print(f"Experiment ID: {run.info.experiment_id}")
    print(f"Run ID: {run.info.run_id}")


mse:5.596756605910697, mae:1.1367479341078124


  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:02<00:00,  2.88it/s]
2024/10/07 17:16:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run upset-conch-683 at: http://localhost:5000/#/experiments/1/runs/60d4d8659a054b61bafbaa8c6c27d3cf.
2024/10/07 17:16:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


artifact_uri: mlflow-artifacts:/1/60d4d8659a054b61bafbaa8c6c27d3cf/artifacts
Experiment ID: 1
Run ID: 60d4d8659a054b61bafbaa8c6c27d3cf
