In [6]:
from pprint import pprint

import lightgbm as lgb
import numpy as np
import optuna.integration.lightgbm as olgb
import optuna.logging
import pandas as pd
import psutil

from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold

optuna.logging.set_verbosity(optuna.logging.WARNING)

In [7]:
# 特徴量データの読み込み
features_and_label: pd.DataFrame = pd.read_csv("./feature.csv").drop(columns=["domain"])

# featuresとlabelを分割
features = features_and_label.drop("label", axis=1)
labels = features_and_label[["label"]]

# LightGBMの形式に変換
train_data = lgb.Dataset(
    data=features,
    label=labels,
    feature_name=features.columns.tolist(),
)

In [8]:
# Stratified 5-fold CV

params = {
    "num_threads": psutil.cpu_count() - 1,  # スレッド数
    "objective": "binary",  # 二値分類
    "metric": "binary_logloss",
    "boosting_type": "gbdt",
    "verbosity": -1,  # 学習途中の情報を表示しない
    "is_unbalance": True,
    "seed": 42,
}

cv_scores_columns = ["accuracy", "TPR", "TNR", "FPR", "FNR", "F1", "AUC"]
history = np.empty((0, 7))

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, valid_idx in kf.split(features, labels):
    x_train_fold = features.iloc[train_idx, :]  # 学習用特徴量
    y_train_fold = labels.iloc[train_idx, :]  # 学習用ラベル
    x_valid_fold = features.iloc[valid_idx, :]  # テスト用特徴量
    y_valid_fold = labels.iloc[valid_idx, :]  # テスト用ラベル

    train_weight = np.where(y_train_fold == 1, 10, 1)

    dtrain = lgb.Dataset(
        x_train_fold,
        label=y_train_fold,
        feature_name=features.columns.tolist(),
    )
    dvalid = lgb.Dataset(
        x_valid_fold,
        label=y_valid_fold,
        feature_name=features.columns.tolist(),
    )

    # 訓練
    model = olgb.train(
        params=params,
        train_set=dtrain,
        valid_sets=[dtrain, dvalid],
        callbacks=[lgb.early_stopping(100, verbose=False)],
        num_boost_round=10000,
    )

    # 予測
    label_pred_prob = model.predict(x_valid_fold)
    label_pred = (label_pred_prob > 0.5).astype(int)

    # 評価指標の計算
    tn, fp, fn, tp = confusion_matrix(y_valid_fold, label_pred).flatten()
    acc = accuracy_score(y_valid_fold, label_pred)
    tpr = tp / (tp + fn)  # recall
    tnr = tn / (tn + fp)  # specificity
    fpr = fp / (tn + fp)
    fnr = fn / (tp + fn)
    f1 = f1_score(y_valid_fold, label_pred)
    auc = roc_auc_score(y_valid_fold, label_pred)

    # 評価結果の格納
    history = np.vstack([history, [acc, tpr, tnr, fpr, fnr, f1, auc]])

feature_fraction, val_score: 0.035828: 100%|##########| 7/7 [00:00<00:00, 11.38it/s]
num_leaves, val_score: 0.035828: 100%|##########| 20/20 [00:02<00:00,  9.44it/s]
bagging, val_score: 0.031045: 100%|##########| 10/10 [00:01<00:00,  9.36it/s]
feature_fraction_stage2, val_score: 0.031045: 100%|##########| 6/6 [00:00<00:00,  9.84it/s]
regularization_factors, val_score: 0.029306: 100%|##########| 20/20 [00:01<00:00, 11.20it/s]
min_child_samples, val_score: 0.029306: 100%|##########| 5/5 [00:00<00:00, 14.04it/s]
feature_fraction, val_score: 0.044075: 100%|##########| 7/7 [00:00<00:00, 14.93it/s]
num_leaves, val_score: 0.044075: 100%|##########| 20/20 [00:01<00:00, 12.76it/s]
bagging, val_score: 0.044075: 100%|##########| 10/10 [00:00<00:00, 13.81it/s]
feature_fraction_stage2, val_score: 0.044075: 100%|##########| 6/6 [00:00<00:00, 14.93it/s]
regularization_factors, val_score: 0.042486: 100%|##########| 20/20 [00:01<00:00, 14.95it/s]
min_child_samples, val_score: 0.040421: 100%|##########|

In [11]:
mean = np.mean(history, axis=0)
history = np.vstack([history, mean])

df_cv_scores = pd.DataFrame(history, columns=cv_scores_columns)
df_cv_scores

Unnamed: 0,accuracy,TPR,TNR,FPR,FNR,F1,AUC
0,0.990909,0.996403,0.961538,0.038462,0.003597,0.994614,0.978971
1,0.984848,0.992806,0.942308,0.057692,0.007194,0.991023,0.967557
2,0.990909,0.996403,0.961538,0.038462,0.003597,0.994614,0.978971
3,0.987842,0.992806,0.960784,0.039216,0.007194,0.992806,0.976795
4,0.99696,0.996403,1.0,0.0,0.003597,0.998198,0.998201
5,0.990294,0.994964,0.965234,0.034766,0.005036,0.994251,0.980099
6,0.990294,0.994964,0.965234,0.034766,0.005036,0.994251,0.980099


In [10]:
# 訓練
model = olgb.train(
    params=params,
    train_set=train_data,
    valid_sets=[train_data],
    callbacks=[lgb.early_stopping(100, verbose=False)],
    num_boost_round=10000,
)

importance = pd.DataFrame(model.feature_importance(), index=features.columns.tolist(), columns=['importance']).sort_values(by='importance', ascending=False)
importance

feature_fraction, val_score: 0.000001: 100%|##########| 7/7 [00:09<00:00,  1.39s/it]
num_leaves, val_score: 0.000001: 100%|##########| 20/20 [00:27<00:00,  1.36s/it]
bagging, val_score: 0.000001: 100%|##########| 10/10 [00:02<00:00,  3.49it/s]
feature_fraction_stage2, val_score: 0.000001: 100%|##########| 3/3 [00:03<00:00,  1.28s/it]
regularization_factors, val_score: 0.000001: 100%|##########| 20/20 [00:21<00:00,  1.08s/it]
min_child_samples, val_score: 0.000001: 100%|##########| 5/5 [00:06<00:00,  1.22s/it]


Unnamed: 0,importance
tag_count_in_head_tag,6697
external_link_percentage,6271
script_tag_count,3232
same_page_link_count,1627
no_domain_in_internal_link,865
iframe_tag_count,274
invalid_kiyaku,99
google_analytics,93
no_title,1
copied,0
