In [1]:
import pandas as pd

data = pd.read_csv( "./dataset/Liver_disease_data.csv" )
data.head()

Unnamed: 0,Age,Gender,BMI,AlcoholConsumption,Smoking,GeneticRisk,PhysicalActivity,Diabetes,Hypertension,LiverFunctionTest,Diagnosis
0,58,0,35.857584,17.272828,0,1,0.65894,0,0,42.73424,1
1,71,1,30.73247,2.201266,0,1,1.670557,1,0,67.309822,1
2,48,0,19.971407,18.500944,0,0,9.928308,0,0,63.738956,0
3,34,1,16.615417,12.63287,0,0,5.630129,0,0,64.555873,1
4,62,1,16.06583,1.087815,0,1,3.566218,1,0,77.868689,1


Standard Scaler

In [2]:
from sklearn.preprocessing import StandardScaler

scale_columns = [ "Age", "BMI", "AlcoholConsumption", "PhysicalActivity", "LiverFunctionTest" ]
scaler = StandardScaler()

data[ scale_columns] = scaler.fit_transform( data[ scale_columns] )
data.head()

Unnamed: 0,Age,Gender,BMI,AlcoholConsumption,Smoking,GeneticRisk,PhysicalActivity,Diabetes,Hypertension,LiverFunctionTest,Diagnosis
0,0.431253,0,1.131724,1.292704,0,1,-1.525542,0,0,-0.745107,1
1,1.168351,1,0.42072,-1.325806,0,1,-1.170119,1,0,0.323885,1
2,-0.135746,0,-1.072155,1.506075,0,0,1.731169,0,0,0.168559,0
3,-0.929544,1,-1.53773,0.486565,0,0,0.221042,0,0,0.204093,1
4,0.658052,1,-1.613973,-1.519255,0,1,-0.504096,1,0,0.783176,1


### [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [5]:
import mlflow
from mlflow.models.signature import infer_signature
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.metrics import classification_report


train_data, test_data = train_test_split( data, test_size = 0.2, random_state = 0 )

y_train = train_data[ "Diagnosis" ]
X_train = train_data.drop( columns = [ "Diagnosis" ] )

y_test = test_data[ "Diagnosis" ]
X_test = test_data.drop( columns = [ "Diagnosis" ] )

param_grid = {
    "C": [ 0.001, 0.01, 0.1, 1, 10, 100 ],      # 正則化強度
    "penalty": [ "l1", "l2" ],                  # L1 或 L2 正則化
    "solver": [ "liblinear", "newton-cg", "lbfgs", "saga" ],  # liblinear 支援 L1 & L2
    "max_iter": [ 100, 200, 500 ],               # 設定最大迭代次數
}

metrics = [ "roc_auc" ]
grid_search = GridSearchCV( estimator = LogisticRegression( random_state = 42 ),
                            param_grid = param_grid,
                            cv = 5,  # 5-fold 
                            scoring = metrics, 
                            refit = False,
                            n_jobs = -1 )

grid_search.fit( X_train, y_train )

n = 2
print( f"top {n} results:" )

results_df = pd.DataFrame( grid_search.cv_results_ )
results_df = results_df[ [ "params", f"mean_test_{metrics[ 0 ]}", f"rank_test_{metrics[ 0 ]}" ] ]

top_results_df = results_df.nsmallest( n, f"rank_test_{metrics[ 0 ]}" )
top_params = top_results_df[ "params" ]
print( top_params )

mlflow.set_experiment( "Logistic Regression( liver dataset )" )

candidate_models = []
for params in top_params:
    with mlflow.start_run():

        mlflow.log_input( mlflow.data.from_pandas( X_train ), context = "train" )
        mlflow.log_params( params )

        model = LogisticRegression( **params )
        model.fit( X_train, y_train )

        y_pred = model.predict( X_test )
        y_pred_prob = model.predict_proba( X_test )[ :, 1 ] # 取所有列的 第 2 欄( 類別 1 的機率 )

        # fpr = FP / ( FP + TN ), tpr = TP / ( TP + FN )
        fpr, tpr, thresholds = roc_curve( y_test, y_pred_prob )
        roc_auc = auc( fpr, tpr ) # 計算 AUC（Area Under Curve）

        accuracy = accuracy_score( y_test, y_pred )
        precision = precision_score( y_test, y_pred )
        recall = recall_score( y_test, y_pred )
        f1 = f1_score( y_test, y_pred )

        print( classification_report( y_test, y_pred ) )
        print( "auc:", roc_auc,
               "accuracy:", accuracy, 
               "precision:", precision, 
               "recall:", recall, 
               "f1:", f1 )

        mlflow.log_metrics( 
            { "auc": roc_auc,
              "accuracy": accuracy,
              "recall": recall,
              "precision": precision,
              "f1": f1,
            } 
        )

        model_signature = infer_signature( X_train, model.predict( X_train ) )
  
        artifact_folder = "sk_models"
        mlflow.sklearn.log_model( sk_model = model,
                                  artifact_path = artifact_folder,
                                  registered_model_name = "logistic_regression_model",
                                  signature = model_signature,
                                  input_example = X_train.iloc[ 0:1 ],
                                  pip_requirements = "requirements.txt" )
        
        artifact_uri = mlflow.get_artifact_uri( artifact_folder )
        print( "artifact uri:", artifact_uri )
        
        candidate_models.append( model )

mlflow.end_run()

180 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "d:\mlflow-test\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\mlflow-test\.venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\mlflow-test\.venv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "d:\mlflow-test\.venv\Lib\site-packages\sklearn\linear_mod

top 2 results:
78    {'C': 1, 'max_iter': 100, 'penalty': 'l2', 'so...
86    {'C': 1, 'max_iter': 200, 'penalty': 'l2', 'so...
Name: params, dtype: object




              precision    recall  f1-score   support

           0       0.82      0.81      0.82       157
           1       0.84      0.85      0.85       183

    accuracy                           0.83       340
   macro avg       0.83      0.83      0.83       340
weighted avg       0.83      0.83      0.83       340

auc: 0.9176499251679371 accuracy: 0.8323529411764706 precision: 0.8387096774193549 recall: 0.8524590163934426 f1: 0.8455284552845529


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1439.65it/s] 
Registered model 'logistic_regression_model' already exists. Creating a new version of this model...
Created version '5' of model 'logistic_regression_model'.


artifact uri: file:///d:/mlflow-test/mlruns/442757179411113851/2992c19677a244c4a71cc170cb47545b/artifacts/sk_models
              precision    recall  f1-score   support

           0       0.82      0.81      0.82       157
           1       0.84      0.85      0.85       183

    accuracy                           0.83       340
   macro avg       0.83      0.83      0.83       340
weighted avg       0.83      0.83      0.83       340

auc: 0.9176499251679371 accuracy: 0.8323529411764706 precision: 0.8387096774193549 recall: 0.8524590163934426 f1: 0.8455284552845529


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1937.32it/s] 
Registered model 'logistic_regression_model' already exists. Creating a new version of this model...


artifact uri: file:///d:/mlflow-test/mlruns/442757179411113851/1088c97dae594f39b88d133e5a6b5683/artifacts/sk_models


Created version '6' of model 'logistic_regression_model'.
