In [1]:
# Record the version of the `python` executable found on PATH, for reproducibility.
!python -V

Python 3.12.1


In [2]:
# Move the working directory up one level; the relative `data/...` and `models/...`
# paths used below are resolved against it.
# NOTE(review): not idempotent - re-running this cell moves up another level.
%cd ..

/workspaces/mlops-student-performance


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [3]:
import os

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

from joblib import dump

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope

import mlflow

In [4]:
# Seed shared by the cells below (passed to train() and logged to MLflow).
random_state = 42


# Configuring MLflow

In [7]:
# Point the MLflow client at the tracking server on port 5000 of the configured
# host, then select the experiment that every run in this notebook is recorded under.
TRACKING_SERVER_HOST = "localhost"
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")
mlflow.set_experiment("student_performance")

# Experiment-level tags describing the project and the kind of ML task.
mlflow.set_experiment_tags({"project": "Student Performance", "task": "Regression"})

In [6]:
# Sanity check: show the tracking URI the MLflow client is currently using.
print(f"tracking URI: {mlflow.get_tracking_uri()}")

tracking URI: http://localhost:5000


# Train Random Forest Regressor

In [22]:
def train(data_path, max_depth, max_features, n_estimators, random_state, split_test_size=0.33):
    """Train a random-forest regressor on the student data and track it in MLflow.

    Reads the CSV at ``data_path``, uses the ``Exam_Score`` column as the target
    and every other column as a feature, evaluates on a held-out split, logs
    parameters and metrics to a new MLflow run, and writes the fitted model to
    ``models/random_regresor.joblib``.

    Args:
        data_path: Path to the CSV file to train on.
        max_depth: ``max_depth`` passed to ``RandomForestRegressor``.
        max_features: ``max_features`` passed to ``RandomForestRegressor``.
        n_estimators: ``n_estimators`` passed to ``RandomForestRegressor``.
        random_state: Seed used for both the train/test split and the forest,
            so that repeated calls give the same model and metrics.
        split_test_size: Fraction of rows held out for evaluation.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    model_path = "models/random_regresor.joblib"

    # Using the run as a context manager guarantees it is closed even when a
    # step below raises; otherwise the run stays active and the next
    # mlflow.start_run() call fails.
    with mlflow.start_run() as run:
        mlflow.log_param("train_data_path", data_path)

        students_df = pd.read_csv(data_path)

        X = students_df.drop(columns=['Exam_Score'])
        y = students_df.loc[:, 'Exam_Score']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=split_test_size, shuffle=True, random_state=random_state
        )

        # Seed the forest as well: bootstrap sampling and feature sub-sampling
        # are random, so seeding only the split does not make the run reproducible.
        rf_regressor = RandomForestRegressor(
            max_depth=max_depth,
            max_features=max_features,
            n_estimators=n_estimators,
            random_state=random_state,
        )
        rf_regressor.fit(X_train, y_train)
        y_pred = rf_regressor.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)

        data_params = {
            'split_test_size': split_test_size,
            'random_state': random_state,
            'x_train_shape': X_train.shape,
            'x_test_shape': X_test.shape,
            'y_train_shape': y_train.shape,
            'y_test_shape': y_test.shape
        }
        mlflow.log_params(data_params)

        # Log every estimator hyperparameter, prefixed to avoid clashing with
        # the data parameters above (e.g. 'random_state').
        ml_params = {
            f"rfr_{param}": value for param, value in rf_regressor.get_params().items()
        }
        mlflow.log_params(ml_params)

        ml_metrics = {'mse': mse, 'mae': mae}
        mlflow.log_metrics(ml_metrics)
        print(f'mse:{mse}, mae:{mae}')

        # Create the output directory on first use instead of failing in open().
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        with open(model_path, "wb") as f:
            dump(rf_regressor, f, protocol=5)

        mlflow.set_tag('model', 'random-forest-regressor')

        print(f"Experiment ID: {run.info.experiment_id}")
        print(f"Run ID: {run.info.run_id}")

    return rf_regressor


In [23]:
# Baseline run with hand-picked hyperparameters.
# `data_path` is kept as a notebook-level name because later cells read it.
data_path = 'data/processed/StudentPerformanceFactors.csv'

max_depth, max_features, n_estimators = 8, 0.75, 200

train(
    data_path=data_path,
    max_depth=max_depth,
    max_features=max_features,
    n_estimators=n_estimators,
    random_state=random_state,
)

2024/09/29 04:24:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run righteous-turtle-743 at: http://localhost:5000/#/experiments/1/runs/4628402ac2cc4caebe2a75662210e52b.
2024/09/29 04:24:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


mse:6.32603068706042, mae:1.3044302251509354
Experiment ID: 1
Run ID: 4628402ac2cc4caebe2a75662210e52b


# Hyperparameter Optimization

In [28]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

def objective(params):
    """Hyperopt objective: cross-validate one random-forest configuration.

    Reads the CSV at the notebook-level ``data_path``, scores a
    ``RandomForestRegressor`` built from ``params`` with 5-fold cross-validation,
    and records parameters and metrics in a new MLflow run.

    Args:
        params: Keyword arguments for ``RandomForestRegressor`` sampled by
            hyperopt from the search space.

    Returns:
        A dict with the mean cross-validated MSE as ``'loss'``, ``STATUS_OK``
        as ``'status'``, and the evaluated ``params``.
    """
    # Imported here so the function does not depend on an import elsewhere in
    # the notebook.
    from sklearn.model_selection import cross_validate

    # Context manager closes the run even if a trial raises; a run left active
    # would make every following trial's mlflow.start_run() fail.
    with mlflow.start_run() as run:
        mlflow.log_param("train_data_path", data_path)

        students_df = pd.read_csv(data_path)

        X = students_df.drop(columns=['Exam_Score'])
        y = students_df.loc[:, 'Exam_Score']

        # Seed the forest with the notebook-level random_state (it is logged
        # below, so it should actually be applied). An explicit 'random_state'
        # in params still takes precedence.
        rf_regressor = RandomForestRegressor(**{'random_state': random_state, **params})

        # One cross-validation pass computes both metrics on the same fitted
        # models, instead of fitting every fold twice. The scorers return
        # negated errors (greater is better), hence the sign flip.
        cv_scores = cross_validate(
            rf_regressor,
            X,
            y,
            scoring={'mse': 'neg_mean_squared_error', 'mae': 'neg_mean_absolute_error'},
            cv=5,
        )
        mse = -cv_scores['test_mse'].mean()
        mae = -cv_scores['test_mae'].mean()

        data_params = {
            'random_state': random_state,
        }
        mlflow.log_params(data_params)

        ml_params = {
            f"rfr_{param}": value for param, value in rf_regressor.get_params().items()
        }
        mlflow.log_params(ml_params)

        ml_metrics = {'mse': mse, 'mae': mae}
        mlflow.log_metrics(ml_metrics)
        print(f'mse:{mse}, mae:{mae}')

        mlflow.set_tag('model', 'random-forest-regressor')

        print(f"Experiment ID: {run.info.experiment_id}")
        print(f"Run ID: {run.info.run_id}")

    return {'loss': mse, 'status': STATUS_OK, 'params': params}
    


In [29]:
# Hyperparameter search space sampled by hyperopt; the keys are passed to
# RandomForestRegressor as keyword arguments by objective().
search_space = {
    # One of three fixed tree depths.
    'max_depth': scope.int(hp.choice('max_depth', [10, 20, 30])),
    # Feature sub-sampling strategy per split.
    'max_features': hp.choice('max_features', ['sqrt','log2']),
    # quniform yields floats in steps of 50 between 50 and 300; scope.int casts
    # them to the integer n_estimators expects.
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 300, 50)),
}

In [30]:
# Trials stores the result dict returned by every objective() call, so the
# winning parameters can be read back after the search.
trials = Trials()
best = fmin(fn=objective,
            space=search_space,
            algo=tpe.suggest,
            max_evals=5,
            trials=trials,
            # Seed the TPE sampler so the sequence of sampled configurations
            # is reproducible across notebook runs.
            rstate=np.random.default_rng(random_state),
            )

# `best` reports hp.choice parameters as list indices rather than values, so
# take the actual evaluated parameters from the best trial's result instead.
best_params = trials.best_trial['result']['params']

mse:5.995287813000732, mae:1.2733807619710833        
Experiment ID: 1                                     
Run ID: 5095e933c2bb476dbd0324dc34de6b00             
  0%|          | 0/5 [00:04<?, ?trial/s, best loss=?]

2024/09/29 04:50:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run blushing-kite-430 at: http://localhost:5000/#/experiments/1/runs/5095e933c2bb476dbd0324dc34de6b00.

2024/09/29 04:50:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



mse:5.608076504357662, mae:1.1277435375951033                                 
Experiment ID: 1                                                              
Run ID: 69d11adeba204ec38efead7f64ee5c75                                      
 20%|██        | 1/5 [00:21<00:17,  4.42s/trial, best loss: 5.995287813000732]

2024/09/29 04:51:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run clumsy-duck-886 at: http://localhost:5000/#/experiments/1/runs/69d11adeba204ec38efead7f64ee5c75.

2024/09/29 04:51:13 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



mse:5.671050247443336, mae:1.1465253880658617                                 
Experiment ID: 1                                                              
Run ID: 0ffcc3dd4e234c4ab59143d122864011                                      
 40%|████      | 2/5 [00:28<00:35, 11.79s/trial, best loss: 5.608076504357662]

2024/09/29 04:51:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run unruly-sheep-332 at: http://localhost:5000/#/experiments/1/runs/0ffcc3dd4e234c4ab59143d122864011.

2024/09/29 04:51:20 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



mse:5.602102773763435, mae:1.136347309247458                                  
Experiment ID: 1                                                              
Run ID: 173652b1c30c40b29889dc96b11e3177                                      
 60%|██████    | 3/5 [00:45<00:19,  9.77s/trial, best loss: 5.608076504357662]

2024/09/29 04:51:37 INFO mlflow.tracking._tracking_service.client: 🏃 View run big-moth-828 at: http://localhost:5000/#/experiments/1/runs/173652b1c30c40b29889dc96b11e3177.

2024/09/29 04:51:37 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



mse:5.606395014123118, mae:1.1384410683283295                                 
Experiment ID: 1                                                              
Run ID: 11be0a5cbede4e8b88bbf06f8d4a4740                                      
 80%|████████  | 4/5 [00:55<00:12, 12.65s/trial, best loss: 5.602102773763435]

2024/09/29 04:51:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run luminous-koi-850 at: http://localhost:5000/#/experiments/1/runs/11be0a5cbede4e8b88bbf06f8d4a4740.

2024/09/29 04:51:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



100%|██████████| 5/5 [00:55<00:00, 11.19s/trial, best loss: 5.602102773763435]


# Save Best Model

In [None]:
# Location of the serialized model; same path that train() writes to.
artifact_path = 'models/random_regresor.joblib'
