In [1]:
import pandas as pd

# Read the liver-disease dataset (path is relative to the working directory)
# and preview its first five rows as the cell's rich output.
data = pd.read_csv( filepath_or_buffer = "Liver_disease_data.csv" )
data.head( n = 5 )

Unnamed: 0,Age,Gender,BMI,AlcoholConsumption,Smoking,GeneticRisk,PhysicalActivity,Diabetes,Hypertension,LiverFunctionTest,Diagnosis
0,58,0,35.857584,17.272828,0,1,0.65894,0,0,42.73424,1
1,71,1,30.73247,2.201266,0,1,1.670557,1,0,67.309822,1
2,48,0,19.971407,18.500944,0,0,9.928308,0,0,63.738956,0
3,34,1,16.615417,12.63287,0,0,5.630129,0,0,64.555873,1
4,62,1,16.06583,1.087815,0,1,3.566218,1,0,77.868689,1


### [DecisionTreeClassifier](https://scikit-learn.org/1.6/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

In [None]:
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report

# mlflow.autolog()

# Label column; every remaining column is treated as a feature.
target_column = "Diagnosis"

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split( data, random_state = 0, test_size = 0.2 )

# Separate features (X) from the label (y) for each partition.
X_train, y_train = train_data.drop( columns = target_column ), train_data[ target_column ]
X_test, y_test = test_data.drop( columns = target_column ), test_data[ target_column ]

# Hyperparameter search space for the decision tree.
param_grid = dict(
    max_depth = [ 8, 9, 10 ],                       # maximum depth of the tree
    min_samples_split = [ 2, 5, 6, 7, 8, 9, 10 ],   # minimum samples a node must hold before it may be split further
    min_samples_leaf = [ 2, 3, 4, 5, 6 ],
)

# Every candidate is scored on both metrics with 5-fold cross-validation;
# the first metric in the list decides which candidate is refit as the best model.
metrics = [ "recall", "f1" ]
grid_search = GridSearchCV(
    DecisionTreeClassifier( random_state = 42 ),
    param_grid,
    scoring = metrics,
    refit = metrics[ 0 ],
    cv = 5,
    n_jobs = -1,
).fit( X_train, y_train )  # fit() returns the fitted search object itself

print( f"best params: {grid_search.best_params_}" )
print( f"best score: {grid_search.best_score_:.4f}" )


# Evaluate the tuned decision tree on the held-out test set and record the
# run (dataset, parameters, metrics, model) in MLflow.
mlflow.set_experiment( "Decision Tree Classifier" )

candidate_models = []
# The context manager ends the run when the block exits (including when the
# body raises), so no explicit mlflow.end_run() call is needed afterwards.
with mlflow.start_run():

    # Record the training features as the run's input dataset, together with
    # the hyperparameters selected by the grid search.
    mlflow.log_input( mlflow.data.from_pandas( X_train ), context = "train" )
    mlflow.log_params( grid_search.best_params_ )

    # Estimator refit by the grid search with the best parameter combination.
    model = grid_search.best_estimator_

    y_pred = model.predict( X_test )

    # Test-set metrics (binary scores use the default positive label).
    accuracy = accuracy_score( y_test, y_pred )
    precision = precision_score( y_test, y_pred )
    recall = recall_score( y_test, y_pred )
    f1 = f1_score( y_test, y_pred )
    print( classification_report( y_test, y_pred ) )
    print( "Accuracy:", accuracy, "Precision:", precision, "Recall:", recall, "F1-Score:", f1 )

    mlflow.log_metrics(
        { "Accuracy": accuracy,
          "Recall": recall,
          "Precision": precision,
          "F1-Score": f1,
        }
    )

    # Store the model under the run's artifacts and register it in the model
    # registry; the one-row input example documents the expected input columns.
    artifact_folder = "model"
    mlflow.sklearn.log_model( sk_model = model,
                              artifact_path = artifact_folder,
                              registered_model_name = "decision_tree_model",
                              input_example = X_train.iloc[ 0:1 ],
                              pip_requirements = "requirements.txt" )

    # Location of the logged model folder; later cells read `artifact_uri`.
    artifact_uri = mlflow.get_artifact_uri( artifact_folder )
    print( "artifact uri:", artifact_uri )

    candidate_models.append( model )

best params: {'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 2, 'random_state': 42}
best score: 0.8673




              precision    recall  f1-score   support

           0       0.79      0.80      0.80       157
           1       0.83      0.82      0.82       183

    accuracy                           0.81       340
   macro avg       0.81      0.81      0.81       340
weighted avg       0.81      0.81      0.81       340

Accuracy: 0.8117647058823529 Precision: 0.8287292817679558 Recall: 0.819672131147541 F1-Score: 0.8241758241758241


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1772.95it/s] 


artifact uri: file:///d:/mlflow-test/mlruns/511301782627813970/32f95e8a3467431eb6ada0879c7750b0/artifacts/model


Registered model 'decision_tree_model' already exists. Creating a new version of this model...
Created version '23' of model 'decision_tree_model'.


### Upload the logged model folder to the Hugging Face Hub

In [11]:
import os
from pathlib import Path
from urllib.parse import urlparse
from urllib.request import url2pathname

from huggingface_hub import login
from huggingface_hub import HfApi

# Pick up HF_TOKEN from a local .env file when one is present.
if os.path.exists( ".env" ):
    from dotenv import load_dotenv
    load_dotenv()

login( os.getenv( "HF_TOKEN" ) )
api = HfApi()

REPO_ID = "byckg3/gad245-g1-01"
REPO_PATH = "liver/sklearn/decision_tree/01"

# Turn the MLflow artifact URI into a local filesystem path.
# Stripping the "file:///" prefix by hand only works for Windows drive paths:
# on POSIX it would drop the leading "/" and produce a relative path, and it
# never decodes percent-escapes such as "%20". url2pathname handles both.
# Anything that is not a file:// URI is passed through unchanged.
parsed_uri = urlparse( artifact_uri )
if parsed_uri.scheme == "file":
    FOLDER_PATH = url2pathname( parsed_uri.path )
else:
    FOLDER_PATH = artifact_uri
print( FOLDER_PATH )

file_path = Path( FOLDER_PATH )

# Upload only when the artifact folder exists locally (is_dir() is False for a missing path).
if file_path.is_dir():
    print( "folder exists" )
    api.upload_folder(
        folder_path = FOLDER_PATH,
        path_in_repo = REPO_PATH,
        repo_id = REPO_ID,
    )
else:
    print( "folder not exists" )

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


d:/mlflow-test/mlruns/511301782627813970/32f95e8a3467431eb6ada0879c7750b0/artifacts/model
folder exists


No files have been modified since last commit. Skipping to prevent empty commit.


In [15]:
# Inspect the fitted tree: input column names, class labels, and per-feature importances.
for fitted_attribute in ( "feature_names_in_", "classes_", "feature_importances_" ):
    print( getattr( model, fitted_attribute ) )

['Age' 'Gender' 'BMI' 'AlcoholConsumption' 'Smoking' 'GeneticRisk'
 'PhysicalActivity' 'Diabetes' 'Hypertension' 'LiverFunctionTest']
[0 1]
[0.09417673 0.05976493 0.0860782  0.24470218 0.05975052 0.05060517
 0.059652   0.01351021 0.05194181 0.27981824]


In [None]:
from sklearn import tree
# Print the fitted tree as indented text rules. The real column names are
# passed (as the plot_tree cell does) so splits are not labelled with generic
# feature_<i> placeholders. list(...) passes a plain list rather than a NumPy
# array, which some older scikit-learn releases mishandle for this argument.
text_representation = tree.export_text( model, feature_names = list( model.feature_names_in_ ) )
print( text_representation )

In [None]:
from matplotlib import pyplot as plt

# Draw the fitted tree on a tall, high-resolution canvas so deep nodes stay legible.
fig, ax = plt.subplots( figsize = ( 25, 50 ), dpi = 300 )

# plot_tree expects string class labels, so the fitted class values are stringified.
class_labels = list( map( str, model.classes_ ) )

# Assigning the result keeps the list of drawn annotations out of the cell output.
_ = tree.plot_tree( model,
                    feature_names = model.feature_names_in_,
                    class_names = class_labels,
                    filled = True,
                    ax = ax )