In [1]:
import pandas as pd

# Read the liver-disease table from the local dataset folder.
data = pd.read_csv( "./dataset/Liver_disease_data.csv" )

# Promote every integer-typed column to float so all columns share a floating
# dtype. NOTE(review): presumably done so the MLflow signature inferred later
# uses double columns instead of integers -- confirm.
int_columns = data.select_dtypes( include = "int" ).columns
data = data.astype( { column: "float" for column in int_columns } )

# Last expression of the cell: rich preview of the first five rows.
data.head()

Unnamed: 0,Age,Gender,BMI,AlcoholConsumption,Smoking,GeneticRisk,PhysicalActivity,Diabetes,Hypertension,LiverFunctionTest,Diagnosis
0,58.0,0.0,35.857584,17.272828,0.0,1.0,0.65894,0.0,0.0,42.73424,1.0
1,71.0,1.0,30.73247,2.201266,0.0,1.0,1.670557,1.0,0.0,67.309822,1.0
2,48.0,0.0,19.971407,18.500944,0.0,0.0,9.928308,0.0,0.0,63.738956,0.0
3,34.0,1.0,16.615417,12.63287,0.0,0.0,5.630129,0.0,0.0,64.555873,1.0
4,62.0,1.0,16.06583,1.087815,0.0,1.0,3.566218,1.0,0.0,77.868689,1.0


### [Random Forest Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

In [None]:
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

# print("tracking_uri:", mlflow.get_tracking_uri() )

# Seed shared by the grid-search estimator AND the re-trained models, so the
# models that get logged are the same forests the cross-validation ranked and
# the reported metrics are reproducible across runs.
RANDOM_STATE = 42

# Hold out 20% of the rows for the final evaluation (fixed seed -> stable split).
train_data, test_data = train_test_split( data, test_size = 0.2, random_state = 0 )

# "Diagnosis" is the target; every other column is used as a feature.
y_train = train_data[ "Diagnosis" ]
X_train = train_data.drop( columns = [ "Diagnosis" ] )

y_test = test_data[ "Diagnosis" ]
X_test = test_data.drop( columns = [ "Diagnosis" ] )

param_grid = {
    "n_estimators": [ 200, 250, 300 ],
    "max_depth": [ 8, 9 ],                # maximum depth of each decision tree
    "min_samples_split": [ 5, 6, 7, 8 ],  # minimum samples a node must hold to be split further
    "min_samples_leaf": [ 2, 3, 4, 5 ],   # minimum samples required at a leaf node
}

# Candidates are ranked by the first (and only) metric in this list.
metrics = [ "recall" ]
ranking_metric = metrics[ 0 ]

# refit = False: the search is only used to rank parameter sets; the best
# candidates are re-trained explicitly below so each one can be logged to MLflow.
grid_search = GridSearchCV( estimator = RandomForestClassifier( random_state = RANDOM_STATE ),
                            param_grid = param_grid,
                            cv = 5,
                            scoring = metrics,
                            refit = False,
                            n_jobs = -1 )

grid_search.fit( X_train, y_train )

# Keep the n best-ranked parameter sets according to the cross-validated metric.
n = 3
print( f"top {n} results:" )
results_df = pd.DataFrame( grid_search.cv_results_ )
results_df = results_df[ [ "params", f"mean_test_{ranking_metric}", f"rank_test_{ranking_metric}" ] ]
top_results_df = results_df.nsmallest( n, f"rank_test_{ranking_metric}" )
top_params = top_results_df[ "params" ]
print( top_params )

mlflow.set_experiment( "Random Forest Classifier" )

# One MLflow run per candidate: re-train on the full training split, evaluate
# on the held-out split, then log params, metrics and the fitted model.
for params in top_params:
    # The context manager ends the run on exit (also when an exception is
    # raised), so no explicit mlflow.end_run() is needed afterwards.
    with mlflow.start_run():

        mlflow.log_input( mlflow.data.from_pandas( X_train ),
                          context = "train",
                        )

        # Train with the same seed the grid search used and record it with the
        # other hyper-parameters so the run can be reproduced from its params.
        run_params = { **params, "random_state": RANDOM_STATE }
        mlflow.log_params( run_params )

        model = RandomForestClassifier( **run_params )
        model.fit( X_train, y_train )

        y_pred = model.predict( X_test )

        accuracy = accuracy_score( y_test, y_pred )
        precision = precision_score( y_test, y_pred )
        recall = recall_score( y_test, y_pred )
        f1 = f1_score( y_test, y_pred )

        print( classification_report( y_test, y_pred ) )
        print( "accuracy:", accuracy,
               "precision:", precision,
               "recall:", recall,
               "f1:", f1 )

        mlflow.log_metrics( { "accuracy": accuracy,
                              "recall": recall,
                              "precision": precision,
                              "f1": f1
                            } )

        # Input/output schema stored with the model, inferred from the training
        # features and the model's predictions on them.
        model_signature = infer_signature( X_train, model.predict( X_train ) )

        # Each candidate is registered as a new version of "random_forest".
        artifact_folder = "sk_models"
        model_info = mlflow.sklearn.log_model( sk_model = model,
                                               artifact_path = artifact_folder,
                                               registered_model_name = "random_forest",
                                               input_example = X_train.iloc[ 0:1 ],
                                               signature = model_signature,
                                               pip_requirements = "requirements.txt" )

        artifact_uri = mlflow.get_artifact_uri( artifact_folder )
        print( "artifact uri:", artifact_uri )

top 3 results:
60    {'max_depth': 9, 'min_samples_leaf': 3, 'min_s...
63    {'max_depth': 9, 'min_samples_leaf': 3, 'min_s...
61    {'max_depth': 9, 'min_samples_leaf': 3, 'min_s...
Name: params, dtype: object
              precision    recall  f1-score   support

         0.0       0.87      0.89      0.88       157
         1.0       0.90      0.89      0.89       183

    accuracy                           0.89       340
   macro avg       0.88      0.89      0.88       340
weighted avg       0.89      0.89      0.89       340

accuracy: 0.8852941176470588 precision: 0.9 recall: 0.8852459016393442 f1: 0.8925619834710744


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 205.11it/s]  
Registered model 'random_forest' already exists. Creating a new version of this model...
Created version '4' of model 'random_forest'.


artifact uri: file:///d:/mlflow-test/mlruns/839770374526227051/da421bc710c94f0b9bab31c31618f28d/artifacts/sk_models
              precision    recall  f1-score   support

         0.0       0.90      0.88      0.89       157
         1.0       0.90      0.91      0.91       183

    accuracy                           0.90       340
   macro avg       0.90      0.90      0.90       340
weighted avg       0.90      0.90      0.90       340

accuracy: 0.8970588235294118 precision: 0.8978494623655914 recall: 0.912568306010929 f1: 0.9051490514905149


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 997.52it/s] 
Registered model 'random_forest' already exists. Creating a new version of this model...
Created version '5' of model 'random_forest'.


artifact uri: file:///d:/mlflow-test/mlruns/839770374526227051/6e4d310c27f84b44878dfba2ea29e1f3/artifacts/sk_models
              precision    recall  f1-score   support

         0.0       0.88      0.86      0.87       157
         1.0       0.88      0.90      0.89       183

    accuracy                           0.88       340
   macro avg       0.88      0.88      0.88       340
weighted avg       0.88      0.88      0.88       340

accuracy: 0.8794117647058823 precision: 0.8817204301075269 recall: 0.8961748633879781 f1: 0.8888888888888888


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1015.92it/s] 


artifact uri: file:///d:/mlflow-test/mlruns/839770374526227051/79c3f5b6ac0e467c8a430862a50aad98/artifacts/sk_models


Registered model 'random_forest' already exists. Creating a new version of this model...
Created version '6' of model 'random_forest'.
