In [1]:
# Print the interpreter version so the environment of this run is on record.
!python -V

Python 3.12.1


In [2]:
# Step up one directory so the relative paths used below (data/..., models/...)
# resolve. NOTE: not idempotent -- re-running this cell moves up another level.
%cd ..

/workspaces/mlops-student-performance


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [3]:
import os

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

from joblib import dump

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope

import mlflow

In [4]:
# Shared seed: handed to train() and logged as a parameter on the MLflow runs below.
random_state = 42


# Configuring MLflow

In [5]:
# Host of the MLflow tracking server; the server is expected on port 5000.
TRACKING_SERVER_HOST = "localhost"

mlflow.set_tracking_uri("http://" + TRACKING_SERVER_HOST + ":5000")

# Kept as the last expression so the selected experiment is shown as cell output.
mlflow.set_experiment("student_performance")


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1727577653830, experiment_id='1', last_update_time=1727577653830, lifecycle_stage='active', name='student_performance', tags={'project': 'Student Performance', 'task': 'Regression'}>

In [6]:
# Attach descriptive tags to the active experiment, then echo the tracking URI
# as a quick sanity check of the configuration.
mlflow.set_experiment_tags(dict([("project", "Student Performance"), ("task", "Regression")]))
print(f"tracking URI: {mlflow.get_tracking_uri()}")

tracking URI: http://localhost:5000


# Train Random Forest Regressor

In [7]:
def train(data_path, max_depth, max_features, n_estimators, random_state, split_test_size=0.33):
    """Fit a random-forest regressor on a hold-out split and record the run in MLflow.

    Parameters
    ----------
    data_path : str
        CSV file holding the feature columns plus the ``Exam_Score`` target.
    max_depth, max_features, n_estimators
        Forwarded unchanged to ``RandomForestRegressor``.
    random_state : int
        Seed for both the train/test split and the forest, so that repeated
        calls with the same arguments produce the same model and metrics.
    split_test_size : float, default 0.33
        Fraction of rows held out for evaluation.

    Returns
    -------
    RandomForestRegressor
        The fitted estimator. It is also serialized to
        ``models/random_regresor.joblib``.

    Side effects
    ------------
    Creates one MLflow run that stores the data path, split parameters, all
    estimator parameters (prefixed ``rfr_``), the hold-out MSE/MAE and a
    ``model`` tag.
    """
    # The context manager ends the run even if reading, fitting or dumping
    # raises; a bare start_run()/end_run() pair would leave the run active and
    # make the next mlflow.start_run() fail.
    with mlflow.start_run() as run:
        mlflow.log_param("train_data_path", data_path)

        students_df = pd.read_csv(data_path)

        X = students_df.drop(columns=['Exam_Score'])
        y = students_df.loc[:, 'Exam_Score']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=split_test_size, shuffle=True, random_state=random_state
        )

        # Seed the forest as well: the logged 'random_state' would otherwise
        # only describe the split while the model itself stayed non-deterministic.
        rf_regressor = RandomForestRegressor(
            max_depth=max_depth,
            max_features=max_features,
            n_estimators=n_estimators,
            random_state=random_state,
        )
        rf_regressor.fit(X_train, y_train)
        y_pred = rf_regressor.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)

        data_params = {
            'split_test_size': split_test_size,
            'random_state': random_state,
            'x_train_shape': X_train.shape,
            'x_test_shape': X_test.shape,
            'y_train_shape': y_train.shape,
            'y_test_shape': y_test.shape
        }
        mlflow.log_params(data_params)

        # Prefix estimator parameters so they cannot collide with the data
        # parameters above (both contain a 'random_state').
        ml_params = {
            f"rfr_{param}": value for param, value in rf_regressor.get_params().items()
        }
        mlflow.log_params(ml_params)

        ml_metrics = {'mse':mse, 'mae': mae}
        mlflow.log_metrics(ml_metrics)
        print(f'mse:{mse}, mae:{mae}')

        # Make sure the target directory exists so a fresh checkout does not
        # fail here after the (expensive) fit has already completed.
        os.makedirs("models", exist_ok=True)
        with open("models/random_regresor.joblib", "wb") as f:
            dump(rf_regressor, f, protocol=5)

        mlflow.set_tag('model', 'random-forest-regressor')

        print(f"Experiment ID: {run.info.experiment_id}")
        print(f"Run ID: {run.info.run_id}")

    return rf_regressor


In [8]:
# Baseline run with hand-picked hyperparameters.
# `data_path` is also read by the later cells of this notebook.
data_path = 'data/processed/StudentPerformanceFactors.csv'

max_depth = 8
max_features = 0.75
n_estimators = 200

train(
    data_path=data_path,
    max_depth=max_depth,
    max_features=max_features,
    n_estimators=n_estimators,
    random_state=random_state,
)

2024/10/01 18:48:53 INFO mlflow.tracking._tracking_service.client: 🏃 View run enchanting-horse-48 at: http://localhost:5000/#/experiments/1/runs/35f4418b69204731a7992bbceab8b699.
2024/10/01 18:48:53 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


mse:6.289281356557213, mae:1.296352303483074
Experiment ID: 1
Run ID: 35f4418b69204731a7992bbceab8b699


# Hyperparameter Optimization

In [9]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate

def objective(params):
    """Hyperopt objective: 5-fold cross-validated error of a random forest.

    Each call opens its own MLflow run and logs the data path, the seed, every
    estimator parameter (prefixed ``rfr_``), the mean MSE/MAE over the folds
    and a ``model`` tag.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``RandomForestRegressor`` sampled from the
        search space.

    Returns
    -------
    dict
        ``loss`` (mean cross-validated MSE, to be minimized), ``status``
        (``STATUS_OK``) and ``params`` (the dict received, unchanged).

    Notes
    -----
    Reads the module-level ``data_path`` and ``random_state``.
    """
    # The `with` block ends the run on exit, so no explicit mlflow.end_run()
    # is needed inside it.
    with mlflow.start_run() as run:
        mlflow.log_param("train_data_path", data_path)

        students_df = pd.read_csv(data_path)

        X = students_df.drop(columns=['Exam_Score'])
        y = students_df.loc[:, 'Exam_Score']

        # Seed the forest so the 'random_state' logged below really describes
        # the model and trials differ only by their hyperparameters. A
        # 'random_state' supplied in `params` still takes precedence.
        rf_regressor = RandomForestRegressor(**{'random_state': random_state, **params})

        # greater_is_better=False makes the scorers return negated errors.
        mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
        mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

        scoring = {
            'mse': mse_scorer,
            'mae': mae_scorer
        }
        scores = cross_validate(rf_regressor, X, y, scoring=scoring, cv=5)

        # Undo the scorer's sign flip to report plain (positive) errors.
        mse = -scores['test_mse'].mean()
        mae = -scores['test_mae'].mean()

        data_params = {
            'random_state': random_state,
        }
        mlflow.log_params(data_params)

        ml_params = {
            f"rfr_{param}": value for param, value in rf_regressor.get_params().items()
        }
        mlflow.log_params(ml_params)

        ml_metrics = {'mse':mse, 'mae': mae}
        mlflow.log_metrics(ml_metrics)
        print(f'mse:{mse}, mae:{mae}')

        mlflow.set_tag('model', 'random-forest-regressor')

        print(f"Experiment ID: {run.info.experiment_id}")
        print(f"Run ID: {run.info.run_id}")

        # 'params' is echoed back so the caller can recover the concrete
        # values of the best trial from the Trials object.
        return {'loss': mse, 'status': STATUS_OK, 'params': params}
    


In [10]:
# Hyperparameter search space for the random forest.
# scope.int casts the sampled value to int; hp.quniform yields floats.
search_space = dict(
    max_depth=scope.int(hp.choice('max_depth', [10, 20, 30])),
    max_features=hp.choice('max_features', ['sqrt', 'log2']),
    n_estimators=scope.int(hp.quniform('n_estimators', 50, 300, 50)),
)

In [11]:
# Run the TPE search. Passing a seeded generator as `rstate` keeps hyperopt's
# sampling independent of global RNG state, so a Restart & Run All proposes
# the same hyperparameters again.
trials = Trials()
best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=5,
    trials=trials,
    rstate=np.random.default_rng(random_state),
)

# For hp.choice dimensions `best` contains option indices, not values, so the
# concrete parameters are taken from the result dict returned by objective().
best_params = trials.best_trial['result']['params']

  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]

mse:5.607716789422468, mae:1.1369348230103606        
Experiment ID: 1                                     
Run ID: 21a3fdda83cb4628a262089046d04e6f             
  0%|          | 0/5 [00:05<?, ?trial/s, best loss=?]

2024/10/01 18:48:58 INFO mlflow.tracking._tracking_service.client: 🏃 View run merciful-hen-497 at: http://localhost:5000/#/experiments/1/runs/21a3fdda83cb4628a262089046d04e6f.

2024/10/01 18:48:58 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



mse:5.613260017097721, mae:1.1434583442600765                                 
Experiment ID: 1                                                              
Run ID: 7ed2bdb10c604db591697ce4bd01fb78                                      
 20%|██        | 1/5 [00:10<00:21,  5.35s/trial, best loss: 5.607716789422468]

2024/10/01 18:49:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run puzzled-mole-227 at: http://localhost:5000/#/experiments/1/runs/7ed2bdb10c604db591697ce4bd01fb78.

2024/10/01 18:49:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



mse:5.615241530244941, mae:1.1384594166381286                                 
Experiment ID: 1                                                              
Run ID: a9634ce5c9d0442fb74cf81e07b1d318                                      
 40%|████      | 2/5 [00:17<00:15,  5.31s/trial, best loss: 5.607716789422468]

2024/10/01 18:49:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run invincible-doe-215 at: http://localhost:5000/#/experiments/1/runs/a9634ce5c9d0442fb74cf81e07b1d318.

2024/10/01 18:49:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



mse:5.652889742199179, mae:1.1554750474626179                                 
Experiment ID: 1                                                              
Run ID: 6477a99f779d4f3bb5f806cee96a0ed5                                      
 60%|██████    | 3/5 [00:21<00:12,  6.01s/trial, best loss: 5.607716789422468]

2024/10/01 18:49:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run ambitious-ray-837 at: http://localhost:5000/#/experiments/1/runs/6477a99f779d4f3bb5f806cee96a0ed5.

2024/10/01 18:49:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



mse:5.635654425072378, mae:1.1456452906489734                                 
Experiment ID: 1                                                              
Run ID: be85a1cf18c747b5978e0cd9f6d67e06                                      
 80%|████████  | 4/5 [00:26<00:05,  5.09s/trial, best loss: 5.607716789422468]

2024/10/01 18:49:19 INFO mlflow.tracking._tracking_service.client: 🏃 View run monumental-gnat-187 at: http://localhost:5000/#/experiments/1/runs/be85a1cf18c747b5978e0cd9f6d67e06.

2024/10/01 18:49:19 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



100%|██████████| 5/5 [00:26<00:00,  5.26s/trial, best loss: 5.607716789422468]


# Save Best Model

In [22]:
from mlflow.models.signature import infer_signature

# Re-evaluate the best hyperparameters with 5-fold CV, refit on the full
# dataset and log the resulting model to MLflow.

# Artifact sub-path (relative to the run's artifact root) under which the
# model is logged.
artifact_path = 'models/random_regresor.joblib'


with mlflow.start_run() as run:
    mlflow.log_param("train_data_path", data_path)

    students_df = pd.read_csv(data_path)

    # Cast to float64 before inferring the model signature from X and y.
    X = students_df.drop(columns=['Exam_Score']).astype('float64')
    y = students_df.loc[:, 'Exam_Score'].astype('float64')

    # Seed the forest so the 'random_state' logged below actually describes
    # the registered model; a 'random_state' inside best_params would win.
    rf_regressor = RandomForestRegressor(**{'random_state': random_state, **best_params})

    # greater_is_better=False makes the scorers return negated errors.
    mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    scoring = {
        'mse': mse_scorer,
        'mae': mae_scorer
    }
    scores = cross_validate(rf_regressor, X, y, scoring=scoring, cv=5)

    # Undo the scorer's sign flip to report plain (positive) errors.
    mse = -scores['test_mse'].mean()
    mae = -scores['test_mae'].mean()

    data_params = {
        'random_state': random_state,
    }

    mlflow.log_params(data_params)

    ml_params = {
        f"rfr_{param}": value for param, value in rf_regressor.get_params().items()
    }
    mlflow.log_params(ml_params)

    ml_metrics = {'mse':mse, 'mae': mae}
    mlflow.log_metrics(ml_metrics)
    print(f'mse:{mse}, mae:{mae}')

    mlflow.set_tag('model', 'random-forest-regressor')

    # cross_validate fits clones, so the estimator itself is still unfitted
    # here; train it on all rows before logging.
    rf_regressor.fit(X,y)

    # Seeded so the stored input example is the same row on every run.
    input_example = X.sample(1, random_state=random_state)
    signature = infer_signature(X, y)

    mlflow.sklearn.log_model(rf_regressor, artifact_path=artifact_path, signature=signature, input_example=input_example)
    artifact_uri = mlflow.get_artifact_uri()
    print(f'artifact_uri: {artifact_uri}')

    print(f"Experiment ID: {run.info.experiment_id}")
    print(f"Run ID: {run.info.run_id}")


mse:5.617175218019595, mae:1.1451148460671625


Downloading artifacts: 100%|██████████| 7/7 [00:02<00:00,  2.76it/s]
2024/10/01 19:14:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run abundant-robin-847 at: http://localhost:5000/#/experiments/1/runs/0128e9a17e874cdbb8e309e16ed1c02b.
2024/10/01 19:14:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


artifact_uri: mlflow-artifacts:/1/0128e9a17e874cdbb8e309e16ed1c02b/artifacts
Experiment ID: 1
Run ID: 0128e9a17e874cdbb8e309e16ed1c02b
