In [1]:
import os
import time
from types import SimpleNamespace

import imblearn
import lightgbm as lgb
import numpy as np
import pandas as pd
import sklearn
import sklearn.model_selection
import sklearn.utils.class_weight
from sklearn import metrics



In [2]:
# Ordered list of the column names used as model inputs (`load_data_v3` selects
# `df[BasicFeatures]`). The group comments below are inferred from the column
# names only -- TODO confirm against the feature extractor.

# Statistics that exist once per candidate sub-block.
_SUB_BLOCK_STATS = ('mean', 'stddev', 'Gx', 'Gy', 'ratioGxGy', 'normGradient')

# Sub-block prefixes; presumably B*/T* = binary/ternary split and H/V =
# horizontal/vertical -- TODO confirm.
_SUB_BLOCKS = (
    'BH_Above', 'BH_Below',
    'BV_Left', 'BV_Right',
    'TH_Above', 'TH_Middle', 'TH_Below',
    'TV_Left', 'TV_Middle', 'TV_Right',
)

BasicFeatures = [
    # presumably partition-depth counters
    'QTD', 'BTD', 'TTD', 'MTTD', 'QTMTD',
    # presumably intra-coding state of the current block
    'currIntraMode',
    'mrlIdx', 'ispMode', 'mtsFlag', 'lfnstIdx', 'mipFlag', 'mipTransposedFlag',
    'isModeVer', 'intraPredAngleMode',
    'currIntraFracBits', 'currIntraDistortion', 'currIntraCost',
    'bestPredModeDCT2',
    # whole-block statistics
    'mean', 'stddev', 'diffStdDevVer', 'diffStdDevHor', 'Gx', 'Gy', 'ratioGxGy', 'normGradient',
    'entropy', 'skewness', 'kurtosis', 'pixelSum',
    # one '<sub-block>_<statistic>' column per combination, sub-block major
    *[f'{block}_{stat}' for block in _SUB_BLOCKS for stat in _SUB_BLOCK_STATS],
    # presumably aggregated from neighbouring blocks
    'neighAvgQT', 'neighHigherQT', 'neighAvgMTT', 'neighHigherMTT', 'neighAvgHorNum', 'neighAvgVerNum',
]

In [3]:
def convert_class_weight(class_weights):
    """Convert an array-like of class weights into a dict keyed by position.

    Args:
        class_weights: Sized, integer-indexable collection of weights.

    Returns:
        dict mapping each position ``0 .. len(class_weights) - 1`` to the
        weight stored at that position.
    """
    return {position: class_weights[position] for position in range(len(class_weights))}

def convert_label(df: pd.DataFrame, shape: str, use_down_sample=False):
    """Add a ``label`` column: ``splitMode`` re-indexed to contiguous class ids.

    For example split modes ``[0, 2, 3, 4, 5]`` become labels ``[0, 1, 2, 3, 4]``.
    The remapping depends on the block shape:

    * ``'32x8'`` / ``'16x8'``: modes 2 and 3 shift down by one, mode 5 by two
      (these shapes are expected to have no modes 1 and 4).
    * ``'32x16'``, ``'16x32'``, ``'8x32'``, ``'8x16'``: every non-zero mode
      shifts down by one.
    * ``'16x16'`` / ``'8x8'``: rows with ``splitMode == 1`` are removed, then
      every non-zero mode shifts down by one.
    * any other shape (e.g. ``'32x32'``): ``label`` equals ``splitMode``.

    Args:
        df: Frame with an integer ``splitMode`` column. A ``label`` column is
            written onto this frame in place.
        shape: Block shape string such as ``'32x16'``.
        use_down_sample: Accepted for call compatibility; not used here.

    Returns:
        The same frame object with ``label`` filled in, except for ``'16x16'``
        and ``'8x8'`` where a new, filtered frame is returned (the input frame
        then keeps the un-shifted ``label`` column).
    """
    # Written on the caller's frame first, exactly as before, so callers that
    # rely on the in-place column keep working.
    df['label'] = df['splitMode']

    if shape in ('16x16', '8x8'):
        # Take an explicit copy of the filtered rows: assigning through .loc on
        # a bare boolean-filtered slice triggers SettingWithCopyWarning on
        # pandas versions without copy-on-write.
        df = df.loc[df['splitMode'] != 1].copy()

    if shape in ('32x8', '16x8'):
        df.loc[df['splitMode'].isin([2, 3]), 'label'] -= 1
        df.loc[df['splitMode'] == 5, 'label'] -= 2
    elif shape in ('32x16', '16x32', '8x32', '8x16', '16x16', '8x8'):
        df.loc[df['splitMode'] != 0, 'label'] -= 1

    return df

def load_data_v3(store_dir: str, qp: str, shape: str, val_ratio=0.1, random_state=None):
    """Load the parquet dataset for one (QP, shape) pair and split it.

    The file is read from ``<store_dir>/<shape>/QP<qp>_<shape>.parquet.gzip``
    and must contain a ``label`` column plus every column in ``BasicFeatures``.

    Args:
        store_dir: Root directory of the stored parquet files.
        qp: QP value as it appears in the file name (e.g. ``'37'``).
        shape: Block shape as it appears in the path (e.g. ``'32x16'``).
        val_ratio: Fraction of the rows held out as the validation part.
        random_state: Optional seed forwarded to ``train_test_split`` so the
            shuffle can be reproduced. The default ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` from a shuffled split stratified
        on ``label``.
    """
    store_path = os.path.join(store_dir, shape, f'QP{qp}_{shape}.parquet.gzip')
    # drop=True gives the same fresh RangeIndex without inserting the old index
    # as an extra column that is never selected below.
    samples = pd.read_parquet(store_path).reset_index(drop=True)

    y = samples['label']
    X = samples[BasicFeatures]
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, test_size=val_ratio, shuffle=True, stratify=y, random_state=random_state,
    )
    return X_train, X_test, y_train, y_test
    

def train(pkl_dir: str, qp: str, shape: str, params: SimpleNamespace, class_weight: dict, save_model: bool, save_dir: str, early_stop_rounds=20, use_down_sample=False):
    """Train, evaluate and optionally save one LightGBM multiclass model.

    Args:
        pkl_dir: Root directory of the stored datasets (passed to ``load_data_v3``).
        qp: QP value used in the dataset and model file names.
        shape: Block shape used in the dataset and model file names.
        params: Namespace providing ``num_leaves``, ``max_depth``,
            ``learning_rate``, ``n_estimators``, ``subsample`` and
            ``subsample_freq``; an optional ``num_threads`` overrides the
            default of 36 threads.
        class_weight: Accepted for call compatibility. NOTE(review): it is
            ignored -- 'balanced' weights are always recomputed from the
            training labels; confirm whether it should take precedence.
        save_model: When true, the fitted booster is written to
            ``<save_dir>/QP<qp>_<shape>.txt``.
        save_dir: Output directory; created if missing.
        early_stop_rounds: Rounds without validation improvement before
            training stops.
        use_down_sample: Accepted for call compatibility; not used here.

    Returns:
        None. The class weights, top-2 accuracy and classification report are
        printed.
    """
    # Load data
    X_train, X_test, y_train, y_test = load_data_v3(pkl_dir, qp, shape)

    # Balanced class weights, keyed by the actual class labels. Keying by
    # position (0..n-1) is only right when the labels are exactly 0..n-1 and
    # would attach weights to the wrong classes otherwise.
    classes = np.unique(y_train)
    balanced = sklearn.utils.class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    train_class_weights = dict(zip(classes.tolist(), balanced.tolist()))
    print(train_class_weights)

    # Build the model
    lgb_classifier = lgb.LGBMClassifier(
        boosting_type='gbdt',
        num_leaves=params.num_leaves,
        max_depth=params.max_depth,
        learning_rate=params.learning_rate,
        n_estimators=params.n_estimators,
        objective='multiclass',
        class_weight=train_class_weights,
        subsample=params.subsample,
        subsample_freq=params.subsample_freq,
        num_threads=getattr(params, 'num_threads', 36),
    )
    # Fit with early stopping.
    # NOTE(review): the same held-out split drives early stopping and the
    # metrics reported below, so those metrics may be optimistic.
    lgb_classifier.fit(
        X=X_train,
        y=y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='multi_error',
        callbacks=[lgb.early_stopping(early_stop_rounds)],
    )

    # Metrics. predict_proba columns follow classes_, so decode the argmax
    # through classes_ instead of treating the column index as the label.
    y_score = lgb_classifier.predict_proba(X_test)
    model_classes = lgb_classifier.classes_
    y_pred = model_classes[y_score.argmax(axis=1)]
    report = metrics.classification_report(y_pred=y_pred, y_true=y_test, digits=4)
    # labels= ties the score columns to their classes even when the validation
    # split does not contain every class.
    top2_accuracy = metrics.top_k_accuracy_score(y_score=y_score, y_true=y_test, k=2, labels=model_classes)
    print(f"Top2 Accuracy: {top2_accuracy}\nClassification_repost:\n{report}")

    # Save the model
    if save_model:
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, f"QP{qp}_{shape}.txt")
        lgb_classifier.booster_.save_model(save_path)

In [None]:
# Hyper-parameters shared by every (shape, QP) model trained below.
params = SimpleNamespace(
    num_leaves=95,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=10000,
    subsample=0.9,
    subsample_freq=50,
    early_stop_rounds=5,
)

pkl_dir = "parquets_lgbm"
# Models are written to a per-run sub-directory of the scripts folder.
save_dir = os.path.join("scripts/lgbm_scripts", '0107-0')

QPs = ['37', '32', '27', '22']
# Shapes trained in this run; '8x16', '16x8' and '8x8' are left out.
Shapes = ['32x32', '32x16', '16x32', '8x32', '32x8', '16x16']

# One model per (shape, QP) pair, shape-major so all QPs of a shape run together.
for shape, qp in ((s, q) for s in Shapes for q in QPs):
    print(f"Starting training model: QP{qp}_{shape} ....")
    class_weight = {}

    train(
        pkl_dir=pkl_dir,
        qp=qp,
        shape=shape,
        params=params,
        class_weight=class_weight,
        save_model=True,
        save_dir=save_dir,
        early_stop_rounds=params.early_stop_rounds,
        use_down_sample=False,
    )

Starting training model: QP37_32x32 ....
{0: 0.46459204719751657, 1: 0.8777857451937937, 2: 0.8214249353269748, 3: 1.3019441225226283, 4: 2.1126084576107784, 5: 4.007750231434442}
