In [1]:
import lightgbm as lgb
import numpy as np
import optuna.integration.lightgbm as olgb
import optuna.logging
import pandas as pd
import psutil

from onnxconverter_common import FloatTensorType
from onnxmltools.convert import convert_lightgbm
from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold, train_test_split

optuna.logging.set_verbosity(optuna.logging.WARNING)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 特徴量データの読み込み
features_and_label: pd.DataFrame = pd.read_csv("../feature.csv").drop(columns=["domain"])

# featuresとlabelを分割
features = features_and_label.drop("label", axis=1)
labels = features_and_label[["label"]]

# LightGBMの形式に変換
train_data = lgb.Dataset(
    data=features,
    label=labels,
    feature_name=features.columns.tolist(),
)

In [3]:
params = {
    "num_threads": psutil.cpu_count() - 1,  # スレッド数
    "objective": "binary",  # 二値分類
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "verbosity": -1,  # 学習途中の情報を表示しない
    "is_unbalance": True,
    "seed": 42,
}

In [4]:
# Stratified 5-fold CV

cv_scores_columns = ["accuracy", "TPR", "TNR", "FPR", "FNR", "F1", "AUC", "TP", "TN", "FP", "FN"]
history = np.empty((0, 11))

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, test_idx in kf.split(features, labels):
    x_train = features.iloc[train_idx, :]  # 学習用特徴量
    y_train = labels.iloc[train_idx, :]  # 学習用ラベル
    x_test = features.iloc[test_idx, :]  # テスト用特徴量
    y_test = labels.iloc[test_idx, :]  # テスト用ラベル

    dtrain = lgb.Dataset(x_train, label=y_train, feature_name=features.columns.tolist())
    dtest = lgb.Dataset(x_test, label=y_test, feature_name=features.columns.tolist())

    # 訓練
    model = olgb.train(
        params=params,
        train_set=dtrain,
        valid_sets=[dtrain, dtest],
        callbacks=[lgb.early_stopping(100, verbose=False)],
        num_boost_round=10000,
    )

    # 予測
    label_pred_prob = model.predict(x_test)
    label_pred = (label_pred_prob > 0.5).astype(int)

    # 評価指標の計算
    tn, fp, fn, tp = confusion_matrix(y_test, label_pred).flatten()
    acc = accuracy_score(y_test, label_pred)
    tpr = tp / (tp + fn)  # recall
    tnr = tn / (tn + fp)  # specificity
    fpr = fp / (tn + fp)
    fnr = fn / (tp + fn)
    f1 = f1_score(y_test, label_pred)
    auc = roc_auc_score(y_test, label_pred_prob)

    # 評価結果の格納
    history = np.vstack([history, [acc, tpr, tnr, fpr, fnr, f1, auc, tp, tn, fp, fn]])

feature_fraction, val_score: 0.027720: 100%|##########| 7/7 [00:00<00:00, 13.19it/s]
num_leaves, val_score: 0.026732: 100%|##########| 20/20 [00:02<00:00,  8.77it/s]
bagging, val_score: 0.023424: 100%|##########| 10/10 [00:01<00:00,  9.25it/s]
feature_fraction_stage2, val_score: 0.023424: 100%|##########| 3/3 [00:00<00:00, 11.79it/s]
regularization_factors, val_score: 0.023422: 100%|##########| 20/20 [00:01<00:00, 10.83it/s]
min_child_samples, val_score: 0.023265: 100%|##########| 5/5 [00:00<00:00, 11.06it/s]
feature_fraction, val_score: 0.083877: 100%|##########| 7/7 [00:00<00:00, 16.88it/s]
num_leaves, val_score: 0.083875: 100%|##########| 20/20 [00:01<00:00, 12.93it/s]
bagging, val_score: 0.076872: 100%|##########| 10/10 [00:00<00:00, 13.02it/s]
feature_fraction_stage2, val_score: 0.076872: 100%|##########| 3/3 [00:00<00:00, 13.57it/s]
regularization_factors, val_score: 0.075247: 100%|##########| 20/20 [00:01<00:00, 16.87it/s]
min_child_samples, val_score: 0.075247: 100%|##########|

In [9]:
mean = np.mean(history, axis=0)
history_final = np.vstack([history, mean])

df_cv_scores = pd.DataFrame(history_final, columns=cv_scores_columns, index=["1", "2", "3", "4", "5", "平均"])
df_cv_scores

Unnamed: 0,accuracy,TPR,TNR,FPR,FNR,F1,AUC,TP,TN,FP,FN
1,0.993958,0.996403,0.981132,0.018868,0.003597,0.996403,0.999186,277.0,52.0,1.0,1.0
2,0.975831,0.985612,0.924528,0.075472,0.014388,0.985612,0.977942,274.0,49.0,4.0,4.0
3,0.996979,1.0,0.981132,0.018868,0.0,0.998205,0.999932,278.0,52.0,1.0,0.0
4,0.984894,0.992806,0.943396,0.056604,0.007194,0.991023,0.998507,276.0,50.0,3.0,2.0
5,0.996979,0.996403,1.0,0.0,0.003597,0.998198,0.999932,277.0,53.0,0.0,1.0
平均,0.989728,0.994245,0.966038,0.033962,0.005755,0.993888,0.9951,276.4,51.2,1.8,1.6


In [6]:
# 訓練
x_train, x_valid, y_train, y_valid = train_test_split(features, labels, test_size=0.2, stratify=labels)

dtrain = lgb.Dataset(x_train, label=y_train, feature_name=features.columns.tolist())
dvalid = lgb.Dataset(x_valid, label=y_valid, feature_name=features.columns.tolist())

# 訓練
model = olgb.train(
    params=params,
    train_set=dtrain,
    valid_sets=[dtrain, dvalid],
    callbacks=[lgb.early_stopping(100, verbose=False)],
    num_boost_round=10000,
)

importance = pd.DataFrame(
    model.feature_importance(),
    index=features.columns.tolist(),
    columns=['importance']
).sort_values(by='importance', ascending=False)

importance

feature_fraction, val_score: 0.021545: 100%|##########| 7/7 [00:00<00:00,  9.41it/s]
num_leaves, val_score: 0.021545: 100%|##########| 20/20 [00:02<00:00,  6.78it/s]
bagging, val_score: 0.021545: 100%|##########| 10/10 [00:01<00:00,  8.34it/s]
feature_fraction_stage2, val_score: 0.021545: 100%|##########| 3/3 [00:00<00:00,  9.56it/s]
regularization_factors, val_score: 0.021545: 100%|##########| 20/20 [00:02<00:00,  7.40it/s]
min_child_samples, val_score: 0.021545: 100%|##########| 5/5 [00:00<00:00,  8.96it/s]


Unnamed: 0,importance
TagCountInHeadTag,2034
externalLinkPercentage,1287
scriptTagCount,431
samePageLinkCount,418
noDomainInInternalLink,293
googleAnalytics,89
iframeTagCount,84
invalidKiyaku,61
noTitle,19
copy,9


In [7]:
# ONNXへの変換
initial_type = [
    ("X", FloatTensorType([None, 11])),
]
onx = convert_lightgbm(model=model, initial_types=initial_type, zipmap=False)
with open("../onnx_model/lightgbm.onnx", "wb") as f:
    f.write(onx.SerializeToString())

The maximum opset needed by this model is only 9.


In [8]:
import onnx
onnx_model = onnx.load("../onnx_model/lightgbm.onnx")
onnx.checker.check_model(onnx_model)