In [15]:
# Print the version of the `python` executable that the shell resolves.
# NOTE(review): this may not be the interpreter the kernel itself runs on — confirm if it matters.
!python -V

Python 3.12.1


In [16]:
import os

# Record the directory the kernel started in only once per session. A bare
# relative `cd ..` climbs one more level every time the cell is re-run, which
# silently breaks the relative data/model paths used further down.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()

# Always resolve the target from the recorded start directory so that
# re-running this cell is idempotent.
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))
print(os.getcwd())

/workspaces


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [17]:
import os

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

from joblib import dump

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope

import mlflow

In [18]:
# Seed value reused by later cells: passed to `train` and logged as an MLflow parameter.
random_state = 42


# Configuring MLflow

In [19]:
# Point the MLflow client at the tracking server, then select the experiment
# that subsequent runs are recorded under.
TRACKING_SERVER_HOST = "localhost"
tracking_uri = f"http://{TRACKING_SERVER_HOST}:5000"
mlflow.set_tracking_uri(tracking_uri)

# Kept as the cell's last expression so the Experiment object is displayed.
mlflow.set_experiment("student_performance")


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1727577653830, experiment_id='1', last_update_time=1727577653830, lifecycle_stage='active', name='student_performance', tags={'project': 'Student Performance', 'task': 'Regression'}>

In [20]:
# Attach descriptive tags to the active experiment, then report which
# tracking server the client is configured to use.
experiment_tags = {"project": "Student Performance", "task": "Regression"}
mlflow.set_experiment_tags(experiment_tags)

print(f"tracking URI: {mlflow.get_tracking_uri()}")

tracking URI: http://localhost:5000


# Train Random Forest Regressor

In [21]:
def train(data_path, max_depth, max_features, n_estimators, random_state, split_test_size=0.33):
    """Train a random-forest regressor on a hold-out split and track it in MLflow.

    Parameters
    ----------
    data_path : str
        Path of the CSV file to read; it must contain an ``Exam_Score`` column,
        which is used as the regression target.
    max_depth, max_features, n_estimators
        Forwarded unchanged to ``RandomForestRegressor``.
    random_state : int
        Seed used for both the train/test split and the forest itself, so the
        value logged to MLflow fully determines the run.
    split_test_size : float, default 0.33
        Passed to ``train_test_split`` as ``test_size``.

    Returns
    -------
    RandomForestRegressor
        The estimator fitted on the training portion of the data.

    Side effects
    ------------
    Logs parameters, the ``mse``/``mae`` metrics and a ``model`` tag to a new
    MLflow run, and writes the fitted estimator to
    ``models/random_regresor.joblib`` relative to the working directory.
    """
    # Use the run as a context manager: if any step below raises (for example
    # a missing CSV), the run is still closed instead of staying active and
    # blocking the next `mlflow.start_run()` call.
    with mlflow.start_run() as run:
        mlflow.log_param("train_data_path", data_path)

        students_df = pd.read_csv(data_path)

        X = students_df.drop(columns=['Exam_Score'])
        y = students_df.loc[:, 'Exam_Score']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=split_test_size, shuffle=True, random_state=random_state
        )

        # Seed the forest as well; otherwise the logged `random_state` only
        # describes the split and the metrics change from run to run.
        rf_regressor = RandomForestRegressor(
            max_depth=max_depth,
            max_features=max_features,
            n_estimators=n_estimators,
            random_state=random_state,
        )
        rf_regressor.fit(X_train, y_train)
        y_pred = rf_regressor.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)

        data_params = {
            'split_test_size': split_test_size,
            'random_state': random_state,
            'x_train_shape': X_train.shape,
            'x_test_shape': X_test.shape,
            'y_train_shape': y_train.shape,
            'y_test_shape': y_test.shape
        }
        mlflow.log_params(data_params)

        # Prefix estimator parameters so they cannot collide with the data parameters.
        ml_params = {
            f"rfr_{param}": value for param, value in rf_regressor.get_params().items()
        }
        mlflow.log_params(ml_params)

        ml_metrics = {'mse': mse, 'mae': mae}
        mlflow.log_metrics(ml_metrics)
        print(f'mse:{mse}, mae:{mae}')

        # Create the output directory on demand so a fresh checkout does not
        # fail after the (expensive) fit has already completed.
        model_path = "models/random_regresor.joblib"
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        with open(model_path, "wb") as f:
            dump(rf_regressor, f, protocol=5)

        mlflow.set_tag('model', 'random-forest-regressor')

        print(f"Experiment ID: {run.info.experiment_id}")
        print(f"Run ID: {run.info.run_id}")

    return rf_regressor


In [22]:
data_path = 'data/processed/StudentPerformanceFactors.csv'

# Hyperparameters for a single tracked training run.
max_depth, max_features, n_estimators = 8, 0.75, 200

# Kept as the cell's last expression so the fitted estimator is displayed.
train(
    data_path=data_path,
    max_depth=max_depth,
    max_features=max_features,
    n_estimators=n_estimators,
    random_state=random_state,
)

FileNotFoundError: [Errno 2] No such file or directory: 'data/processed/StudentPerformanceFactors.csv'

# Hyperparameter Optimization

In [11]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate

def objective(params):
    """Hyperopt objective: cross-validate one random-forest configuration.

    Each call opens its own MLflow run, evaluates ``RandomForestRegressor``
    with 5-fold cross-validation on the CSV at the module-level ``data_path``,
    and logs the parameters and the mean ``mse``/``mae`` across folds.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``RandomForestRegressor`` sampled from the
        search space.

    Returns
    -------
    dict
        ``loss`` (mean cross-validated MSE, to be minimised), ``status`` and
        the unmodified ``params`` so the best configuration can be recovered
        from the trials object.
    """
    with mlflow.start_run() as run:
        mlflow.log_param("train_data_path", data_path)

        students_df = pd.read_csv(data_path)

        X = students_df.drop(columns=['Exam_Score'])
        y = students_df.loc[:, 'Exam_Score']

        # Seed the forest with the value that is logged below; without this the
        # logged `random_state` is never applied and the same `params` can yield
        # different losses. An explicit `random_state` in `params` still wins.
        rf_regressor = RandomForestRegressor(**{'random_state': random_state, **params})

        # greater_is_better=False makes the scorers return negated errors.
        mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
        mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

        scoring = {
            'mse': mse_scorer,
            'mae': mae_scorer
        }
        scores = cross_validate(rf_regressor, X, y, scoring=scoring, cv=5)

        # Undo the scorer's sign flip so the logged metrics are plain errors.
        mse = -scores['test_mse'].mean()
        mae = -scores['test_mae'].mean()

        data_params = {
            'random_state': random_state,
        }
        mlflow.log_params(data_params)

        ml_params = {
            f"rfr_{param}": value for param, value in rf_regressor.get_params().items()
        }
        mlflow.log_params(ml_params)

        ml_metrics = {'mse': mse, 'mae': mae}
        mlflow.log_metrics(ml_metrics)
        print(f'mse:{mse}, mae:{mae}')

        mlflow.set_tag('model', 'random-forest-regressor')

        print(f"Experiment ID: {run.info.experiment_id}")
        print(f"Run ID: {run.info.run_id}")

        # No explicit mlflow.end_run(): leaving the `with` block closes the run.
        return {'loss': mse, 'status': STATUS_OK, 'params': params}
    


In [12]:
# Hyperopt search space for the random forest. Keys match the
# RandomForestRegressor keyword arguments that `objective` unpacks.
max_depth_options = [10, 20, 30]
max_features_options = ['sqrt', 'log2']

search_space = dict(
    max_depth=scope.int(hp.choice('max_depth', max_depth_options)),
    max_features=hp.choice('max_features', max_features_options),
    # Values from 50 to 300 in steps of 50, cast to int.
    n_estimators=scope.int(hp.quniform('n_estimators', 50, 300, 50)),
)

In [13]:
# Run the TPE search. The sampler is seeded from the notebook-wide seed so the
# same candidate configurations are proposed on every Restart & Run All;
# without `rstate` each execution explores a different set of trials.
trials = Trials()
best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=5,
    trials=trials,
    rstate=np.random.default_rng(random_state),
)

# `best` reports hp.choice dimensions as indices into their option lists, so
# take the concrete values from the params echoed back by `objective`.
best_params = trials.best_trial['result']['params']

  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]

mse:5.979280630289077, mae:1.2589398955195863        
Experiment ID: 1                                     
Run ID: 3868dc2d5a334c3b85f8fe578cced721             
  0%|          | 0/5 [00:03<?, ?trial/s, best loss=?]

2024/10/08 14:00:37 INFO mlflow.tracking._tracking_service.client: 🏃 View run resilient-cod-321 at: http://localhost:5000/#/experiments/1/runs/3868dc2d5a334c3b85f8fe578cced721.

2024/10/08 14:00:37 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



mse:5.57721188559191, mae:1.1319550112176533                                  
Experiment ID: 1                                                              
Run ID: 6c79ef33d77942faac66c4e45602eed1                                      
 20%|██        | 1/5 [00:10<00:13,  3.43s/trial, best loss: 5.979280630289077]

2024/10/08 14:00:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run smiling-moose-578 at: http://localhost:5000/#/experiments/1/runs/6c79ef33d77942faac66c4e45602eed1.

2024/10/08 14:00:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



mse:5.7554867444263795, mae:1.1841349956685163                                
Experiment ID: 1                                                             
Run ID: 1da439f0d32a4acaa96f95cf5b922141                                     
 40%|████      | 2/5 [00:12<00:16,  5.48s/trial, best loss: 5.57721188559191]

2024/10/08 14:00:46 INFO mlflow.tracking._tracking_service.client: 🏃 View run dazzling-mouse-781 at: http://localhost:5000/#/experiments/1/runs/1da439f0d32a4acaa96f95cf5b922141.

2024/10/08 14:00:46 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



mse:6.0074507074978545, mae:1.2676651583128637                               
Experiment ID: 1                                                             
Run ID: ed99bfae886b4e9582873faa8825b648                                     
 60%|██████    | 3/5 [00:14<00:07,  3.86s/trial, best loss: 5.57721188559191]

2024/10/08 14:00:48 INFO mlflow.tracking._tracking_service.client: 🏃 View run rare-pug-430 at: http://localhost:5000/#/experiments/1/runs/ed99bfae886b4e9582873faa8825b648.

2024/10/08 14:00:48 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



mse:5.6100746761624745, mae:1.1322567876118472                               
Experiment ID: 1                                                             
Run ID: 3fbd833d180940ea81c22228675b402a                                     
 80%|████████  | 4/5 [00:24<00:03,  3.21s/trial, best loss: 5.57721188559191]

2024/10/08 14:00:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run receptive-moose-954 at: http://localhost:5000/#/experiments/1/runs/3fbd833d180940ea81c22228675b402a.

2024/10/08 14:00:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.



100%|██████████| 5/5 [00:24<00:00,  4.95s/trial, best loss: 5.57721188559191]


# Save Best Model

In [14]:
from mlflow.models.signature import infer_signature

# Run-relative location under which the model is stored in the MLflow artifact store.
artifact_path = 'models/random_regresor.joblib'


# Re-evaluate the best configuration found by the search, refit it on the full
# dataset and log the fitted model to MLflow.
with mlflow.start_run() as run:
    mlflow.log_param("train_data_path", data_path)

    students_df = pd.read_csv(data_path)

    # NOTE(review): presumably cast to float64 so the inferred signature uses
    # floating-point columns — confirm.
    X = students_df.drop(columns=['Exam_Score']).astype('float64')
    y = students_df.loc[:, 'Exam_Score'].astype('float64')

    # Apply the seed that is logged below; previously `random_state` was
    # recorded but the forest itself was unseeded, so the stored model and its
    # metrics could not be reproduced. An explicit value in `best_params` wins.
    rf_regressor = RandomForestRegressor(**{'random_state': random_state, **best_params})

    # greater_is_better=False makes the scorers return negated errors.
    mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
    mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

    scoring = {
        'mse': mse_scorer,
        'mae': mae_scorer
    }
    scores = cross_validate(rf_regressor, X, y, scoring=scoring, cv=5)

    # Undo the scorer's sign flip so the logged metrics are plain errors.
    mse = -scores['test_mse'].mean()
    mae = -scores['test_mae'].mean()

    data_params = {
        'random_state': random_state,
    }
    mlflow.log_params(data_params)

    ml_params = {
        f"rfr_{param}": value for param, value in rf_regressor.get_params().items()
    }
    mlflow.log_params(ml_params)

    ml_metrics = {'mse': mse, 'mae': mae}
    mlflow.log_metrics(ml_metrics)
    print(f'mse:{mse}, mae:{mae}')

    mlflow.set_tag('model', 'random-forest-regressor')

    # cross_validate fits clones, so the estimator itself is still unfitted here.
    rf_regressor.fit(X, y)

    # Seed the sample so the logged input example is the same row on every run.
    input_example = X.sample(1, random_state=random_state)
    signature = infer_signature(X, y)

    mlflow.sklearn.log_model(rf_regressor, artifact_path=artifact_path, signature=signature, input_example=input_example)
    artifact_uri = mlflow.get_artifact_uri()
    print(f'artifact_uri: {artifact_uri}')

    print(f"Experiment ID: {run.info.experiment_id}")
    print(f"Run ID: {run.info.run_id}")


mse:5.593694330655268, mae:1.1348223045464791


  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:02<00:00,  2.58it/s]
2024/10/08 14:01:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run rare-frog-474 at: http://localhost:5000/#/experiments/1/runs/75c72acf704a4132a57994517e9f8b4b.
2024/10/08 14:01:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.


artifact_uri: mlflow-artifacts:/1/75c72acf704a4132a57994517e9f8b4b/artifacts
Experiment ID: 1
Run ID: 75c72acf704a4132a57994517e9f8b4b
