In [35]:
import numpy as np
import optuna.logging
import pandas as pd
import xgboost as xgb

from onnxconverter_common import FloatTensorType
from onnxmltools.convert import convert_xgboost
from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold, train_test_split

optuna.logging.set_verbosity(optuna.logging.WARNING)

In [36]:
# 特徴量データの読み込み
features_and_label: pd.DataFrame = pd.read_csv("../feature.csv").drop(columns=["domain"])

# featuresとlabelを分割
features = features_and_label.drop("label", axis=1)
labels = features_and_label[["label"]]

In [37]:
params = {
    "objective": "binary:logistic",  # 二値分類
    "eval_metric": "logloss",
    "booster": "gbtree",
    "verbosity": 0,  # 学習途中の情報を表示しない
    "seed": 42,
}

In [38]:
# Stratified 5-fold CV

cv_scores_columns = ["accuracy", "TPR", "TNR", "FPR", "FNR", "F1", "AUC", "TP", "TN", "FP", "FN"]
history = np.empty((0, 11))

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, test_idx in kf.split(features, labels):
    _x_train: pd.DataFrame = features.iloc[train_idx, :]  # 学習用特徴量
    _y_train: pd.DataFrame = labels.iloc[train_idx, 0]  # 学習用ラベル
    x_test: pd.DataFrame = features.iloc[test_idx, :]  # テスト用特徴量
    y_test: pd.DataFrame = labels.iloc[test_idx, 0]  # テスト用ラベル
    x_train, x_valid, y_train, y_valid = train_test_split(_x_train, _y_train, test_size=0.25, stratify=_y_train,
                                                          random_state=42)

    dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=features.columns.tolist())
    dtest = xgb.DMatrix(x_test, label=y_test, feature_names=features.columns.tolist())
    dvalid = xgb.DMatrix(x_valid, label=y_valid, feature_names=features.columns.tolist())

    # 訓練
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        evals=[(dvalid, 'valid'), (dtrain, 'train')],
        early_stopping_rounds=100,
        verbose_eval=False,
        num_boost_round=10000,
    )

    # 予測
    label_pred_prob = model.predict(dtest)
    label_pred = (label_pred_prob > 0.5).astype(int)

    # 評価指標の計算
    tn, fp, fn, tp = confusion_matrix(y_test, label_pred).flatten()
    acc = accuracy_score(y_test, label_pred)
    tpr = tp / (tp + fn)  # recall
    tnr = tn / (tn + fp)  # specificity
    fpr = fp / (tn + fp)
    fnr = fn / (tp + fn)
    f1 = f1_score(y_test, label_pred)
    auc = roc_auc_score(y_test, label_pred_prob)

    # 評価結果の格納
    history = np.vstack([history, [acc, tpr, tnr, fpr, fnr, f1, auc, tp, tn, fp, fn]])

In [39]:
mean = np.mean(history, axis=0)
history_final = np.vstack([history, mean])

df_cv_scores = pd.DataFrame(history_final, columns=cv_scores_columns, index=["1", "2", "3", "4", "5", "平均"])
df_cv_scores

Unnamed: 0,accuracy,TPR,TNR,FPR,FNR,F1,AUC,TP,TN,FP,FN
1,0.979899,0.98995,0.969849,0.030151,0.01005,0.9801,0.99303,197.0,193.0,6.0,2.0
2,0.987437,0.994975,0.979899,0.020101,0.005025,0.987531,0.995379,198.0,195.0,4.0,1.0
3,0.987437,0.984925,0.98995,0.01005,0.015075,0.987406,0.998384,196.0,197.0,2.0,3.0
4,0.967337,0.969849,0.964824,0.035176,0.030151,0.967419,0.992601,193.0,192.0,7.0,6.0
5,0.982412,0.98995,0.974874,0.025126,0.01005,0.982544,0.999217,197.0,194.0,5.0,2.0
平均,0.980905,0.98593,0.975879,0.024121,0.01407,0.981,0.995722,196.2,194.2,4.8,2.8


In [42]:
# 訓練
x_train, x_valid, y_train, y_valid = train_test_split(features, labels, test_size=0.2, stratify=labels, random_state=42)

dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=features.columns.tolist())
dvalid = xgb.DMatrix(x_valid, label=y_valid, feature_names=features.columns.tolist())

# 訓練
model = xgb.train(
    params=params,
    dtrain=dtrain,
    evals=[(dvalid, 'valid'), (dtrain, 'train')],
    early_stopping_rounds=100,
    verbose_eval=False,
    num_boost_round=10000,
)

importance = pd.DataFrame(
    model.get_fscore().values(),
    index=model.get_fscore().keys(),
    columns=['importance']
).sort_values(by='importance', ascending=False)

importance

Unnamed: 0,importance
externalLinkPercentage,7424.0
TagCountInHeadTag,6717.0
scriptTagCount,2950.0
noDomainInInternalLink,372.0
samePageLinkCount,177.0
iframeTagCount,36.0
googleAnalytics,23.0
noTitle,2.0


In [43]:
# ONNXへの変換
initial_type = [
    ("X", FloatTensorType([None, 11])),
]

model_copied = model.copy()
model_copied.feature_names = [f"{num}" for num in range(len(model.feature_names))]

onx = convert_xgboost(model=model_copied, initial_types=initial_type)
with open("../onnx_model/xgb.onnx", "wb") as f:
    f.write(onx.SerializeToString())

In [44]:
import onnx

onnx_model = onnx.load("../onnx_model/xgb.onnx")
onnx.checker.check_model(onnx_model)