# 1. Import scripts

In [1]:
import importlib
import sys

# Make the helper modules under ../scripts importable. The membership check
# keeps sys.path free of duplicate entries when this cell is re-run.
if "../scripts" not in sys.path:
    sys.path.append("../scripts")

import preprocess_data, visualise, train, evaluate

# Re-import the helper modules so that edits made to their source files are
# picked up without restarting the kernel.
importlib.reload(preprocess_data)
importlib.reload(visualise)
importlib.reload(evaluate)
# Trailing semicolon: suppress the module repr, which would otherwise store an
# absolute local filesystem path in the saved notebook output.
importlib.reload(train);

<module 'train' from 'c:\\Users\\aghil\\OneDrive\\Bureau\\PredectiveMassMLOps\\mlflow\\../scripts\\train.py'>

In [2]:
# Pull the helper functions into the notebook namespace so that later cells can
# call them unqualified (load_dataset, split_data, train_lr, evaluate, ... are
# used below and are presumably supplied by these modules).
# NOTE(review): later cells call `evaluate(...)` as a function, so the star
# import from `evaluate` presumably rebinds the module name imported in the
# first cell to that function -- confirm this shadowing is intended.
from preprocess_data import *
from visualise import *
from evaluate import *
from train import *

# 2. Load and preprocess data

In [3]:
# Location of the raw workbook, relative to the notebook's working directory
file_path = '../data/raw/data_Mass.xlsx'

# Read the dataset and pass it straight through the missing-value cleanup helper
df = drop_na(load_dataset(file_path))

In [4]:
# Input columns: the fifteen sensor channels sen1 through sen15
features = [f'sen{idx}' for idx in range(1, 16)]

# NOTE(review): 0.2 and 4 are passed positionally to split_data; presumably a
# split fraction and a random seed -- confirm against split_data's signature.
X_train, X_test, X_val, y_train, y_test, y_val = split_data(
    df,
    features,
    0.2,
    4,
)

# 3. Models

In [5]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import mean_squared_error, r2_score

## 3.1. Linear Regressor

In [6]:
# Fit the linear-regression baseline on the training split.
# The third argument is a .joblib path handed to train_lr (presumably where the
# fitted model is persisted -- confirm in scripts/train.py).
model_lr = train_lr(
    X_train,
    y_train,
    "../models/lr.joblib",
)

In [7]:
# Report the linear regressor's scores on the test split, one line per metric
for metric_label, metric_fn in (
    ("Mean Squared Error : ", mean_squared_error),
    ("R2 Score : ", r2_score),
):
    print(metric_label, evaluate(model_lr, X_test, y_test, metric_fn))

Mean Squared Error :  3917.3760937481557
R2 Score :  0.9130703668471682


In [8]:
# Evaluate the linear regressor on the validation split.
# The metric passed to evaluate() is mean_squared_error, so the printed label
# must say "Mean Squared Error" (it previously read "Mean Absolute Error").
print("Mean Squared Error : ", evaluate(model_lr, X_val, y_val, mean_squared_error))
print("R2 Score : ", evaluate(model_lr, X_val, y_val, r2_score))

Mean Absolute Error :  11875.311203180707
R2 Score :  0.6995013763252966


In [9]:
# Log the linear regressor's parameters, metrics and fitted model to MLflow.
# The `with` block ends the run when it exits, so no explicit
# mlflow.end_run() call is needed afterwards.
with mlflow.start_run(run_name="LinearRegressor"):
    mlflow.log_params(model_lr.get_params())

    # Evaluate on test set
    r2_lr_test = evaluate(model_lr, X_test, y_test, r2_score)
    mse_lr_test = evaluate(model_lr, X_test, y_test, mean_squared_error)

    # Evaluate on validation set
    r2_lr_val = evaluate(model_lr, X_val, y_val, r2_score)
    mse_lr_val = evaluate(model_lr, X_val, y_val, mean_squared_error)

    # Record the test and validation metrics in a single call
    mlflow.log_metrics({
        "R2_Test": r2_lr_test,
        "MSE_Test": mse_lr_test,
        "R2_Val": r2_lr_val,
        "MSE_Val": mse_lr_val,
    })

    mlflow.sklearn.log_model(model_lr, "Linear Regressor")

## 3.2. Gradient Boosting Regressor

In [10]:
# Hyperparameter grid handed to the gradient-boosting training helper.
# Only n_estimators is searched at the moment; further candidates that are
# currently disabled:
#   'learning_rate': [0.01, 0.1, 0.2],
#   'max_depth': [3, 4, 5],
#   'min_samples_split': [2, 5, 10],
#   'min_samples_leaf': [1, 2, 4],
#   'subsample': [0.8, 1.0],
#   'max_features': [None, 'sqrt', 'log2'],
#   'alpha': [0.9, 0.7, 0.5],
#   'warm_start': [True, False]
param = {'n_estimators': [50, 100, 150]}

# Build and fit the gradient boosting regressor on the training split
model_gbr = train_gradient_boosting_regressor(
    X_train,
    y_train,
    param,
    "../models/gbr.joblib",
)

Best hyperparameters: {'n_estimators': 150}


In [11]:
# Evaluate the gradient boosting regressor on the test split.
# The metric passed to evaluate() is mean_squared_error, so the printed label
# must say "Mean Squared Error" (it previously read "Mean Absolute Error").
print("Mean Squared Error : ", evaluate(model_gbr, X_test, y_test, mean_squared_error))
print("R2 Score : ", evaluate(model_gbr, X_test, y_test, r2_score))

Mean Absolute Error :  5711.935222219343
R2 Score :  0.8732476990777829


In [12]:
# Evaluate the gradient boosting regressor on the validation split.
# The metric passed to evaluate() is mean_squared_error, so the printed label
# must say "Mean Squared Error" (it previously read "Mean Absolute Error").
print("Mean Squared Error : ", evaluate(model_gbr, X_val, y_val, mean_squared_error))
print("R2 Score : ", evaluate(model_gbr, X_val, y_val, r2_score))

Mean Absolute Error :  2706.4217938914776
R2 Score :  0.9315153927141058


In [13]:
# Log the gradient boosting regressor's parameters, metrics and fitted model
# to MLflow. The `with` block ends the run when it exits, so no explicit
# mlflow.end_run() call is needed afterwards.
with mlflow.start_run(run_name="GradientBoostingRegressor"):
    mlflow.log_params(model_gbr.get_params())

    # Evaluate on test set
    r2_gbr_test = evaluate(model_gbr, X_test, y_test, r2_score)
    mse_gbr_test = evaluate(model_gbr, X_test, y_test, mean_squared_error)

    # Evaluate on validation set
    r2_gbr_val = evaluate(model_gbr, X_val, y_val, r2_score)
    mse_gbr_val = evaluate(model_gbr, X_val, y_val, mean_squared_error)

    # Record the test and validation metrics in a single call
    mlflow.log_metrics({
        "R2_Test": r2_gbr_test,
        "MSE_Test": mse_gbr_test,
        "R2_Val": r2_gbr_val,
        "MSE_Val": mse_gbr_val,
    })

    mlflow.sklearn.log_model(model_gbr, "Gradient Boosting Regressor")



## 3.3. XGBoost Regressor

In [14]:
# Settings passed to the XGBoost training helper as a two-item list:
# [n_estimators, max_depth].
# NOTE(review): this rebinds `param`, which an earlier cell used for the
# gradient-boosting grid dict -- re-running that cell after this one needs its
# own `param` assignment to run first.
param = [50, 5]

# Build and fit the XGBoost regressor on the training split
model_xgbr = train_xgboost(X_train, y_train, param)

In [15]:
# Evaluate the XGBoost regressor on the test split.
# The metric passed to evaluate() is mean_squared_error, so the printed label
# must say "Mean Squared Error" (it previously read "Mean Absolute Error").
print("Mean Squared Error : ", evaluate(model_xgbr, X_test, y_test, mean_squared_error))
print("R2 Score : ", evaluate(model_xgbr, X_test, y_test, r2_score))

Mean Absolute Error :  8571.5332770257
R2 Score :  0.8097909862373719


In [16]:
# Evaluate the XGBoost regressor on the validation split.
# The metric passed to evaluate() is mean_squared_error, so the printed label
# must say "Mean Squared Error" (it previously read "Mean Absolute Error").
print("Mean Squared Error : ", evaluate(model_xgbr, X_val, y_val, mean_squared_error))
print("R2 Score : ", evaluate(model_xgbr, X_val, y_val, r2_score))

Mean Absolute Error :  1159.830313946586
R2 Score :  0.9706510922472669


In [17]:
# Log the XGBoost regressor's parameters, metrics and fitted model to MLflow.
# The `with` block ends the run when it exits, so no explicit
# mlflow.end_run() call is needed afterwards.
with mlflow.start_run(run_name="XgboostRegressor"):
    mlflow.log_params(model_xgbr.get_params())

    # Evaluate on test set
    r2_xgbr_test = evaluate(model_xgbr, X_test, y_test, r2_score)
    mse_xgbr_test = evaluate(model_xgbr, X_test, y_test, mean_squared_error)

    # Evaluate on validation set
    r2_xgbr_val = evaluate(model_xgbr, X_val, y_val, r2_score)
    mse_xgbr_val = evaluate(model_xgbr, X_val, y_val, mean_squared_error)

    # Record the test and validation metrics in a single call
    mlflow.log_metrics({
        "R2_Test": r2_xgbr_test,
        "MSE_Test": mse_xgbr_test,
        "R2_Val": r2_xgbr_val,
        "MSE_Val": mse_xgbr_val,
    })

    mlflow.sklearn.log_model(model_xgbr, "Xgboost Regressor")



## 3.4. Random Forest Regressor

In [18]:
# Candidate values for each random-forest hyperparameter included in the search
params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Build and fit the random forest regressor on the training split
model_rf = train_random_forest_regressor(
    X_train,
    y_train,
    params,
    file='../models/rf.joblib',
)

Best hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [19]:
# Evaluate the random forest regressor on the test split.
# The metric passed to evaluate() is mean_squared_error, so the printed label
# must say "Mean Squared Error" (it previously read "Mean Absolute Error").
print("Mean Squared Error : ", evaluate(model_rf, X_test, y_test, mean_squared_error))
print("R2 Score : ", evaluate(model_rf, X_test, y_test, r2_score))

Mean Absolute Error :  8291.666261904766
R2 Score :  0.8160014537477209


In [20]:
# Evaluate the random forest regressor on the validation split.
# The metric passed to evaluate() is mean_squared_error, so the printed label
# must say "Mean Squared Error" (it previously read "Mean Absolute Error").
print("Mean Squared Error : ", evaluate(model_rf, X_val, y_val, mean_squared_error))
print("R2 Score : ", evaluate(model_rf, X_val, y_val, r2_score))

Mean Absolute Error :  5683.426349999998
R2 Score :  0.8561838282205096


In [21]:
# Log the random forest regressor's parameters, metrics and fitted model to
# MLflow. The `with` block ends the run when it exits, so no explicit
# mlflow.end_run() call is needed afterwards.
with mlflow.start_run(run_name="RandomForestRegressor"):
    mlflow.log_params(model_rf.get_params())

    # Evaluate on test set
    r2_rf_test = evaluate(model_rf, X_test, y_test, r2_score)
    mse_rf_test = evaluate(model_rf, X_test, y_test, mean_squared_error)

    # Evaluate on validation set
    r2_rf_val = evaluate(model_rf, X_val, y_val, r2_score)
    mse_rf_val = evaluate(model_rf, X_val, y_val, mean_squared_error)

    # Record the test and validation metrics in a single call
    mlflow.log_metrics({
        "R2_Test": r2_rf_test,
        "MSE_Test": mse_rf_test,
        "R2_Val": r2_rf_val,
        "MSE_Val": mse_rf_val,
    })

    mlflow.sklearn.log_model(model_rf, "RandomForestRegressor")

