In [18]:
import numpy as np
import onnx
import optuna.logging
import pandas as pd

from onnxconverter_common import FloatTensorType
from onnxmltools.convert import convert_sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold, train_test_split

# Set Optuna's logging verbosity to WARNING.
# NOTE(review): Optuna is not otherwise used in the cells visible here — confirm this is still needed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [19]:
# Load the feature table and discard the "domain" column.
features_and_label: pd.DataFrame = (
    pd.read_csv("../feature.csv")
    .drop(columns=["domain"])
)

# Separate the target column from the remaining feature columns.
# `labels` is kept as a one-column DataFrame (double brackets), not a Series.
labels = features_and_label[["label"]]
features = features_and_label.drop(columns=["label"])

In [20]:
# Stratified 5-fold cross-validation of a logistic-regression classifier.
# Each fold contributes one row of metrics; the rows end up in `history`
# (shape: n_folds x len(cv_scores_columns)).

# Column order of every row stored in `history`.
cv_scores_columns = ["accuracy", "TPR", "TNR", "FPR", "FNR", "F1", "AUC", "TP", "TN", "FP", "FN"]

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Rows are collected in a plain list and converted to an array once after the
# loop, rather than re-allocating the array with np.vstack on every iteration.
fold_scores = []

for train_idx, test_idx in kf.split(features, labels):
    # `labels.iloc[idx, 0]` selects a single column, so the targets are Series.
    x_train: pd.DataFrame = features.iloc[train_idx, :]  # training features
    y_train: pd.Series = labels.iloc[train_idx, 0]  # training labels
    x_test: pd.DataFrame = features.iloc[test_idx, :]  # test features
    y_test: pd.Series = labels.iloc[test_idx, 0]  # test labels

    # Fit a fresh model on this fold's training portion.
    model = LogisticRegression(max_iter=2000)
    model.fit(x_train, y_train)

    # Predict: take column 1 of predict_proba and threshold it at 0.5.
    label_pred_prob = model.predict_proba(x_test)[:, 1]
    label_pred = (label_pred_prob > 0.5).astype(int)

    # Metrics derived from the 2x2 confusion matrix (unpacked as tn, fp, fn, tp).
    tn, fp, fn, tp = confusion_matrix(y_test, label_pred).ravel()
    acc = accuracy_score(y_test, label_pred)
    tpr = tp / (tp + fn)  # recall
    tnr = tn / (tn + fp)  # specificity
    fpr = fp / (tn + fp)
    fnr = fn / (tp + fn)
    f1 = f1_score(y_test, label_pred)
    auc = roc_auc_score(y_test, label_pred_prob)  # AUC uses the probabilities, not the thresholded labels

    # Store this fold's results in the order given by cv_scores_columns.
    fold_scores.append([acc, tpr, tnr, fpr, fnr, f1, auc, tp, tn, fp, fn])

# The reshape keeps the column count tied to cv_scores_columns (no hard-coded 11)
# and preserves a 2-D result even if no rows were collected.
history = np.array(fold_scores, dtype=float).reshape(-1, len(cv_scores_columns))

In [21]:
# Column-wise average over the fold rows, appended as one extra final row.
mean = history.mean(axis=0)
history_final = np.concatenate([history, mean[np.newaxis, :]], axis=0)

# Tabulate the scores: one row per fold ("1".."5") plus the average row.
df_cv_scores = pd.DataFrame(
    data=history_final,
    index=["1", "2", "3", "4", "5", "平均"],
    columns=cv_scores_columns,
)
df_cv_scores

Unnamed: 0,accuracy,TPR,TNR,FPR,FNR,F1,AUC,TP,TN,FP,FN
1,0.932161,0.919598,0.944724,0.055276,0.080402,0.931298,0.973208,183.0,188.0,11.0,16.0
2,0.917085,0.919598,0.914573,0.085427,0.080402,0.917293,0.973511,183.0,182.0,17.0,16.0
3,0.924623,0.944724,0.904523,0.095477,0.055276,0.926108,0.975758,188.0,180.0,19.0,11.0
4,0.932161,0.944724,0.919598,0.080402,0.055276,0.933002,0.975329,188.0,183.0,16.0,11.0
5,0.942211,0.959799,0.924623,0.075377,0.040201,0.94321,0.982627,191.0,184.0,15.0,8.0
平均,0.929648,0.937688,0.921608,0.078392,0.062312,0.930182,0.976086,186.6,183.4,15.6,12.4


In [22]:
# Hold-out split: 20% of the rows go to the validation part, stratified on the
# labels, with a fixed seed.
# NOTE(review): x_valid / y_valid are not used in the cells visible here.
x_train, x_valid, y_train, y_valid = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Fit the classifier on the training part; the first column of y_train is
# passed as the target.
model = LogisticRegression(max_iter=2000)
model.fit(x_train, y_train.iloc[:, 0])

In [23]:
# Convert the fitted model to ONNX.
# The input tensor is named "X". Its first dimension is left as None and its
# second dimension is read from the fitted model (n_features_in_) instead of
# being hard-coded, so the exported input width always matches the data the
# model was trained on.
initial_type = [
    ("X", FloatTensorType([None, int(model.n_features_in_)])),
]
onx = convert_sklearn(model=model, initial_types=initial_type)

# Write the serialized ONNX model to disk.
with open("../onnx_model/lr.onnx", "wb") as f:
    f.write(onx.SerializeToString())

In [24]:
# Reload the exported file and run the ONNX checker on it.
# `onnx` is imported in the notebook's top import cell.
onnx_model = onnx.load("../onnx_model/lr.onnx")
onnx.checker.check_model(onnx_model)