In [16]:
import pandas as pd 
import numpy as np
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection

from functools import partial
import optuna
import yaml


def optimize(trial, x, y, config):
    criterion = trial.suggest_categorical("criterion", config['rf']['criterion'])
    n_estimators = trial.suggest_int("n_estimators", config['rf']['n_estimators']['min'], config['rf']['n_estimators']['max'])
    max_depth = trial.suggest_int("max_depth", config['rf']['max_depth']['min'], config['rf']['max_depth']['max'])
    max_features = trial.suggest_uniform("max_features", config['rf']['max_features']['min'], config['rf']['max_features']['max'])

    model = ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        criterion=criterion
    )
    
    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []
    for idx in kf.split(X=x, y=y):
        train_idx, test_idx = idx[0], idx[1]
        xtrain = x[train_idx]
        ytrain = y[train_idx]

        xtest = x[test_idx]
        ytest = y[test_idx]

        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_acc = metrics.accuracy_score(ytest, preds)
        accuracies.append(fold_acc)

    return -1.0 * np.mean(accuracies)


if __name__ == "__main__":
    # Step 1: Parse YAML configuration
    with open("config.yaml") as f:
        config = yaml.safe_load(f)

    # Step 2: Prepare data
    df = pd.read_csv("/Users/yunbo/Documents/GitHub/Machine-learning-learning-and-code-practice/Hyper_parameter_tuning/input/train.csv")
    X = df.drop("price_range", axis=1).values
    y = df.price_range.values

    # Step 3: Create optimization function with Optuna parameters
    optimization_function = partial(optimize, x=X, y=y, config=config['parameters'])

    # Step 4: Perform optimization
    study = optuna.create_study(direction="minimize")
    study.optimize(optimization_function, n_trials=config['parameters']['optuna']['n_trials'])

    


[I 2024-02-08 17:27:37,471] A new study created in memory with name: no-name-ca34b3b5-062b-45a6-90a2-82be0206966f
  max_features = trial.suggest_uniform("max_features", config['rf']['max_features']['min'], config['rf']['max_features']['max'])
[I 2024-02-08 17:32:38,707] Trial 0 finished with value: -0.7685000000000001 and parameters: {'criterion': 'gini', 'n_estimators': 884, 'max_depth': 8, 'max_features': 0.026412213650103274}. Best is trial 0 with value: -0.7685000000000001.
  max_features = trial.suggest_uniform("max_features", config['rf']['max_features']['min'], config['rf']['max_features']['max'])
[I 2024-02-08 17:32:51,730] Trial 1 finished with value: -0.8675 and parameters: {'criterion': 'entropy', 'n_estimators': 1118, 'max_depth': 5, 'max_features': 0.5470111119610851}. Best is trial 1 with value: -0.8675.
  max_features = trial.suggest_uniform("max_features", config['rf']['max_features']['min'], config['rf']['max_features']['max'])
[I 2024-02-08 17:32:56,117] Trial 2 finis

In [15]:
import pandas as pd 
import numpy as np
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn.metrics import classification_report, confusion_matrix
import optuna
import yaml
import pickle

def optimize(trial, x, y, config):
    criterion = trial.suggest_categorical("criterion", config['rf']['criterion'])
    n_estimators = trial.suggest_int("n_estimators", config['rf']['n_estimators']['min'], config['rf']['n_estimators']['max'])
    max_depth = trial.suggest_int("max_depth", config['rf']['max_depth']['min'], config['rf']['max_depth']['max'])
    max_features = trial.suggest_uniform("max_features", config['rf']['max_features']['min'], config['rf']['max_features']['max'])

    model = ensemble.RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        criterion=criterion
    )
    
    kf = model_selection.StratifiedKFold(n_splits=5)
    accuracies = []
    for idx in kf.split(X=x, y=y):
        train_idx, test_idx = idx[0], idx[1]
        xtrain = x[train_idx]
        ytrain = y[train_idx]

        xtest = x[test_idx]
        ytest = y[test_idx]

        model.fit(xtrain, ytrain)
        preds = model.predict(xtest)
        fold_acc = metrics.accuracy_score(ytest, preds)
        accuracies.append(fold_acc)

    mean_accuracy = -1.0 * np.mean(accuracies)
    
    # Update best model and its accuracy
    global best_accuracy, best_model
    if mean_accuracy < best_accuracy:
        best_accuracy = mean_accuracy
        best_model = model
    
    return mean_accuracy


if __name__ == "__main__":
    # Step 1: Parse YAML configuration
    with open("config.yaml") as f:
        config = yaml.safe_load(f)

    # Step 2: Prepare data
    df = pd.read_csv("/Users/yunbo/Documents/GitHub/Machine-learning-learning-and-code-practice/Hyper_parameter_tuning/input/train.csv")
    X = df.drop("price_range", axis=1).values
    y = df.price_range.values

    # Step 3: Create optimization function with Optuna parameters
    best_accuracy = float('inf')
    best_model = None
    optimization_function = partial(optimize, x=X, y=y, config=config['parameters'])

    # Step 4: Perform optimization
    study = optuna.create_study(direction="minimize")
    study.optimize(optimization_function, n_trials=config['parameters']['optuna']['n_trials'])

    # Step 5: Get best model and evaluate it
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=42)
    y_predict = best_model.predict(X_test)
    print(classification_report(y_test, y_predict))
    print(confusion_matrix(y_test, y_predict))

    # Step 6: Save the best model
    with open("best_model.pkl", "wb") as f:
        pickle.dump(best_model, f)


[I 2024-02-08 14:24:00,544] A new study created in memory with name: no-name-3848a6d2-1256-4cf2-b9c2-a58f625272de


  max_features = trial.suggest_uniform("max_features", config['rf']['max_features']['min'], config['rf']['max_features']['max'])
[I 2024-02-08 14:24:10,958] Trial 0 finished with value: -0.901 and parameters: {'criterion': 'entropy', 'n_estimators': 461, 'max_depth': 7, 'max_features': 0.9436043750754869}. Best is trial 0 with value: -0.901.
  max_features = trial.suggest_uniform("max_features", config['rf']['max_features']['min'], config['rf']['max_features']['max'])
[I 2024-02-08 14:24:18,975] Trial 1 finished with value: -0.843 and parameters: {'criterion': 'gini', 'n_estimators': 1280, 'max_depth': 11, 'max_features': 0.12326386687138441}. Best is trial 0 with value: -0.901.
  max_features = trial.suggest_uniform("max_features", config['rf']['max_features']['min'], config['rf']['max_features']['max'])
[I 2024-02-08 14:24:21,431] Trial 2 finished with value: -0.8959999999999999 and parameters: {'criterion': 'gini', 'n_estimators': 174, 'max_depth': 10, 'max_features': 0.466906406912

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       105
           1       0.96      0.99      0.97        91
           2       0.99      0.95      0.97        92
           3       0.98      0.99      0.99       112

    accuracy                           0.98       400
   macro avg       0.98      0.98      0.98       400
weighted avg       0.98      0.98      0.98       400

[[104   1   0   0]
 [  1  90   0   0]
 [  0   3  87   2]
 [  0   0   1 111]]


In [17]:
# load model from pickle file
model_pkl_file = 'best_model.pkl'
with open(model_pkl_file, 'rb') as file:  
    model = pickle.load(file)

df = pd.read_csv("/Users/yunbo/Documents/GitHub/Machine-learning-learning-and-code-practice/Hyper_parameter_tuning/input/test.csv")
X_test = df.drop("price_range", axis=1).values
y_test = df.price_range.values

# evaluate model 
y_predict = model.predict(X_test)

# check results
print(classification_report(y_test, y_predict)) 

KeyError: "['price_range'] not found in axis"

In [None]:
# load dependencies
import onnxmltools
import onnxruntime

# Assuming you have already loaded and prepared the best_model

# Convert the RandomForestClassifier model to ONNX format
onnx_model = onnxmltools.convert_sklearn(best_model)

# Save the ONNX model in a file
onnx_file = "random_forest_model.onnx"
onnxmltools.utils.save_model(onnx_model, onnx_file)


if you're working within the Python ecosystem and only need to save and load models within Python environments, Pickle might be sufficient. However, if you need interoperability with other frameworks or want to deploy your model in production environments that support ONNX, converting your model to ONNX format might be more suitable. Additionally, ONNX is particularly advantageous for deep learning models that need to be deployed in different frameworks.