In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,f1_score   

from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import GroupKFold

In [3]:
import numpy as np
import pandas as pd

def add_bp_class(df, sbp_col="sbp", dbp_col="dbp", new_col="bp_class"):
    """
    Add blood pressure class column based on SBP and DBP values.
    
    Categories:
        0: sbp < 120 and dbp < 80
        1: 120 <= sbp < 140 or 80 <= dbp < 90
        2: sbp >= 140 or dbp >= 90
    """
    
    conditions = [
        (df[sbp_col] < 120) & (df[dbp_col] < 80),
        ((df[sbp_col] >= 120) & (df[sbp_col] < 140)) | ((df[dbp_col] >= 80) & (df[dbp_col] < 90)),
        (df[sbp_col] >= 140) | (df[dbp_col] >= 90)
    ]
    
    values = [0, 1, 2]
    
    df[new_col] = np.select(conditions, values, default=np.nan).astype(int)
    return df


def add_sbp_class(df, col_name='sbp', new_col='sbp_class'):
    """
    SBP値に応じてクラスを付与する関数
      - sbp < 100 → 0
      - 100 <= sbp < 140 → 1
      - sbp >= 140 → 2

    Parameters
    ----------
    df : pd.DataFrame
        SBP列を含むデータフレーム
    col_name : str, default 'sbp'
        SBPの列名
    new_col : str, default 'sbp_class'
        新しい列名

    Returns
    -------
    df : pd.DataFrame
        新しい列が追加されたDataFrame
    """
    conditions = [
        df[col_name] < 100,
        (df[col_name] >= 100) & (df[col_name] < 140),
        df[col_name] >= 140
    ]
    values = [0, 1, 2]
    df[new_col] = np.select(conditions, values).astype('int')
    return df

def add_sbp_class_2(df, col_name='sbp',boundary=120, new_col='sbp_class'):
    """
    SBP値に応じてクラスを付与する関数
      - sbp < boundary → 0
      - sbp >= boundary → 2

    Parameters
    ----------
    df : pd.DataFrame
        SBP列を含むデータフレーム
    col_name : str, default 'sbp'
        SBPの列名
    new_col : str, default 'sbp_class'
        新しい列名

    Returns
    -------
    df : pd.DataFrame
        新しい列が追加されたDataFrame
    """
    conditions = [
        df[col_name] < boundary,
        df[col_name] >= boundary
    ]
    values = [0, 2]
    df[new_col] = np.select(conditions, values).astype('int')
    return df
from sklearn.model_selection import StratifiedGroupKFold
def get_groupkf(df,k=5, group_col='subject',y_col='sbp_class', random_state=42,get_holdout=False):
    """
    StratifiedGroupKFoldを用いてデータを分割する関数
    Parameters
    ----------
    df : pd.DataFrame
        分割対象のデータフレーム
    k : int, default 5
        分割数
    group_col : str, default 'subject'
        グループ化に使用する列名
    y_col : str, default 'sbp_class'
        ラベル列名 
    random_state : int, default 42
        乱数シード
    get_holdout : bool, default False
        ホールドアウトセットを取得するかどうか
    Returns
    -------
    folds : list of tuples
        各フォールドのデータセットのリスト
        get_holdoutがTrueの場合、各タプルは(train_data, val_data, hold_data)となる
        get_holdoutがFalseの場合、各タプルは(train_data, hold_data)となる
    """
    gkf = StratifiedGroupKFold(n_splits=k, shuffle=True, random_state=random_state)
    gkf2 = StratifiedGroupKFold(n_splits=k, shuffle=True, random_state=random_state)
    folds = []
    for use_idx, hold_idx in gkf.split(df,df[y_col], groups=df[group_col]):
        u = df.iloc[use_idx]
        h = df.iloc[hold_idx]
        if get_holdout:
            for train_idx, val_idx in gkf2.split(u,u[y_col], groups=u[group_col]):
                t = u.iloc[train_idx]
                v = u.iloc[val_idx]
                break
            print(set(t[group_col]) & set(v[group_col])& set(h[group_col]))
            train_data = get_dataset(t, weight='balanced',y_col=y_col)
            val_data = get_dataset(v,return_raw=True)
        else:
            print(set(u[group_col]) & set(h[group_col]))
            train_data = get_dataset(u, weight='balanced',y_col=y_col)
        hold_data = get_dataset(h,return_raw=True,y_col=y_col)
        if get_holdout:
            folds.append((train_data, val_data, hold_data))
        else:
            folds.append((train_data, hold_data))
    return folds
def get_dataset(df,weight=None,return_raw=False,y_col='sbp_class'):
    """
    Daframeから不要な列を除いてLightGBM用のデータセットを作成する関数
    Parameters
    ----------
    df : pd.DataFrame
        特徴量とラベルを含むデータフレーム

    weight : str or None, default None
        クラス重みの指定。Noneの場合は重みなし、'balanced'の場合はバランス調整された重みを使用。
    Returns
    -------
    dataset : lgb.Dataset
        LightGBM用のデータセット
    """ 
    X = df.drop(['signal_index', 'sbp', 'subject', 'dbp','sbp_class','sbp_bin',"bp_class"],axis=1, errors='ignore')
    Y = df[y_col]
    if weight is not None:
        weights = compute_sample_weight(class_weight=weight, y=Y)
        dataset = lgb.Dataset(X, label=Y, weight=weights)
        return dataset
    dataset = lgb.Dataset(X, label=Y)
    if return_raw:
        return [X, Y] 
    return dataset
    
def add_class(classes=3, boundary=120):
    train_df = pd.read_parquet(r'F:\minowa\BloodPressureEstimation\data\processed\PulseDB\Downsampled\train_features.parquet')
    test_df = pd.read_parquet(r'F:\minowa\BloodPressureEstimation\data\processed\PulseDB\Downsampled\test_features.parquet' ).drop('Unnamed: 0', axis=1, errors='ignore')  
    train_df=train_df.drop('Unnamed: 0', axis=1, errors='ignore')
    test_df=test_df.drop('Unnamed: 0', axis=1, errors='ignore')  
    if boundary is None:
        # print("1",boundary)
        train_df = add_sbp_class(train_df)
        test_df = add_sbp_class(test_df)
        print(test_df["sbp_class"].nunique())
    else:
        # print("2",boundary,classes)
        train_df = add_sbp_class_2(train_df, boundary=boundary)
        test_df = add_sbp_class_2(test_df, boundary=boundary)
    if classes==2:
        # print("2",boundary,classes)
        train_df = train_df[train_df['sbp_class'] != 1]
        test_df = test_df[test_df['sbp_class'] != 1]
        train_df['sbp_class'] = train_df['sbp_class'] // 2 
        test_df['sbp_class'] = test_df['sbp_class'] // 2
    return train_df, test_df

In [4]:
train_df = pd.read_parquet(r'F:\minowa\BloodPressureEstimation\data\processed\PulseDB\Downsampled\train_features.parquet')
test_df = pd.read_parquet(r'F:\minowa\BloodPressureEstimation\data\processed\PulseDB\Downsampled\test_features.parquet' ).drop('Unnamed: 0', axis=1, errors='ignore')  
train_df=train_df.drop('Unnamed: 0', axis=1, errors='ignore')
test_df=test_df.drop('Unnamed: 0', axis=1, errors='ignore')  
train_df.shape, test_df.shape

((448377, 103), (54892, 103))

In [6]:
train_df, test_df = add_class(classes=2, boundary=120)

In [9]:

train_Y = train_df['sbp_class']
train_X = train_df.drop(['signal_index', 'sbp', 'subject', 'dbp','sbp_class','sbp_bin'],axis=1)
test_Y = test_df['sbp_class']
test_X = test_df.drop(['signal_index', 'sbp', 'subject', 'dbp','sbp_class','sbp_bin'],axis=1)
train_X.shape, train_Y.shape

((448377, 98), (448377,))

In [43]:

train_df, test_df = add_class(classes=2, boundary=None)
folds = get_groupkf(train_df,k=5, group_col='subject',y_col='sbp_class', random_state=42,get_holdout=True)
test_X, test_Y = get_dataset(test_df,return_raw=True)
params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    "min_data_in_leaf": 200,
    'num_leaves': 31,
    'max_depth': -1,
    'verbose': -1,
    'random_state': 42,
    'device':'gpu'
}
importance_df = pd.DataFrame({"feature": test_X.columns})
results = []
if test_Y.ndim>1:
    # softlabel
    params['objective'] = 'cross_entropy'
    params['metric'] = 'cross_entropy'
    params['num_class'] =1
    params['objective'] = 'multiclass'
    params['metric'] = 'multi_error'
    params['num_class'] = test_X.get_label().nunique()
# --- モデル学習 ---
for f in range(5):
    lgb_train,_val,_hold = folds[f]
    val_X, val_Y = _val
    hold_X, hold_Y = _hold
    lgb_val = lgb.Dataset(val_X, label=val_Y) 
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_val],
        valid_names=['train', 'val'],
        num_boost_round=2000,
        callbacks=[lgb.early_stopping(stopping_rounds=50),lgb.log_evaluation(period=200)],
    )
    # --- 予測 ---
    pred_Y_prob = model.predict(test_X, num_iteration=model.best_iteration)
    pred_Y_prob_val = model.predict(val_X, num_iteration=model.best_iteration)
    pred_Y_prob_hold = model.predict(hold_X, num_iteration=model.best_iteration)
    pred_Y_test = np.argmax(pred_Y_prob, axis=1) if test_Y.nunique() > 2 else (pred_Y_prob >= 0.5).astype(int)
    pred_Y_val = np.argmax(pred_Y_prob_val, axis=1) if val_Y.nunique() > 2 else (pred_Y_prob_val >= 0.5).astype(int)
    pred_Y_hold = np.argmax(pred_Y_prob_hold, axis=1) if hold_Y.nunique() > 2 else (pred_Y_prob_hold >= 0.5).astype(int)
    # --- 評価 ---
    acc = accuracy_score(test_Y, pred_Y_test)
    cm = confusion_matrix(test_Y, pred_Y_test)
    f1 = f1_score(test_Y, pred_Y_test,average='macro')
    # --- Feature Importance ---
    importance_df[f'importance_fold{f}'] = model.feature_importance(importance_type='gain')
    # --- 結果保存 ---
    res = {"accuracy" :acc,
           "f1":f1,
           "cm":cm,
           "fold":f,
           "pred_test_prob":pred_Y_prob,
           "pred_val_prob":pred_Y_prob_val,
            "pred_hold_prob":pred_Y_prob_hold,
           "pred_test":pred_Y_test,
           "pred_val":pred_Y_val,
            "pred_hold":pred_Y_hold,
              "true_test":test_Y,
                "true_val":val_Y,
                "true_hold":hold_Y
           } 
    results.append(res)
    print(f"Fold {f} - F1: {f1:.4f}, Accuracy: {acc:.4f}")

3
set()
set()
set()
set()
set()
Training until validation scores don't improve for 50 rounds
[200]	train's binary_error: 0.13992	val's binary_error: 0.234407
[400]	train's binary_error: 0.113273	val's binary_error: 0.22499
[600]	train's binary_error: 0.0956751	val's binary_error: 0.222058
Early stopping, best iteration is:
[569]	train's binary_error: 0.0978836	val's binary_error: 0.22191
Fold 0 - F1: 0.7874, Accuracy: 0.7885
Training until validation scores don't improve for 50 rounds
[200]	train's binary_error: 0.141374	val's binary_error: 0.231667
Early stopping, best iteration is:
[268]	train's binary_error: 0.129889	val's binary_error: 0.229855
Fold 1 - F1: 0.7798, Accuracy: 0.7803
Training until validation scores don't improve for 50 rounds
[200]	train's binary_error: 0.140455	val's binary_error: 0.200548
Early stopping, best iteration is:
[152]	train's binary_error: 0.151047	val's binary_error: 0.200115
Fold 2 - F1: 0.7852, Accuracy: 0.7860
Training until validation scores don't 

In [72]:
oof_wrong_idx = []
for res in results:
    val_true = res["true_hold"].reset_index(drop=True)
    val_pred = pd.Series(res["pred_hold"]).reset_index(drop=True)
    wrong_idx = val_true[val_true != val_pred].index.tolist()
    oof_wrong_idx.append(wrong_idx)
print(f"Total OOF wrong predictions: {len(oof_wrong_idx)}")
for i, idx in enumerate(oof_wrong_idx):
    print(f"Fold {i} - Wrong indices: {idx}")

Total OOF wrong predictions: 5
Fold 0 - Wrong indices: [2, 23, 43, 44, 45, 48, 50, 52, 55, 57, 60, 61, 63, 65, 66, 68, 69, 73, 74, 78, 79, 82, 83, 85, 86, 96, 105, 106, 108, 109, 110, 111, 112, 115, 116, 117, 118, 119, 120, 122, 123, 126, 128, 129, 130, 132, 133, 181, 182, 183, 184, 185, 186, 188, 198, 241, 290, 291, 292, 295, 296, 297, 298, 299, 300, 301, 302, 304, 305, 306, 307, 308, 309, 312, 313, 314, 315, 321, 327, 328, 329, 330, 339, 340, 347, 356, 357, 359, 400, 414, 416, 430, 433, 546, 547, 549, 720, 722, 723, 726, 727, 729, 730, 733, 734, 735, 736, 737, 738, 740, 741, 747, 748, 749, 750, 751, 752, 755, 756, 757, 760, 770, 780, 782, 783, 784, 785, 807, 808, 809, 810, 811, 812, 813, 815, 816, 817, 819, 821, 823, 825, 826, 827, 828, 829, 830, 831, 832, 879, 885, 895, 898, 904, 924, 926, 928, 930, 939, 940, 941, 945, 946, 947, 948, 949, 951, 953, 957, 1024, 1027, 1029, 1030, 1031, 1039, 1055, 1056, 1062, 1065, 1067, 1072, 1086, 1087, 1109, 1114, 1127, 1140, 1141, 1145, 1147, 1149,

In [73]:
data_2_X = []
data_2_Y = []
for f in range(5):
    hold_X, hold_Y = folds[f][2]
    wrong_idx = oof_wrong_idx[f]
    # data_2.append(val_X.iloc[wrong_idx])
    data_2_X.append(hold_X.iloc[wrong_idx])
    data_2_Y.append(hold_Y.iloc[wrong_idx])
# data_2_X = pd.concat(data_2_X).reset_index(drop=True)
# data_2_Y = pd.concat(data_2_Y).reset_index(drop=True)

In [74]:
len(data_2_X), len(data_2_Y)

(5, 5)

In [75]:
train_idx = [[0,1,2,3],[0,1,2,4],[0,1,3,4],[0,2,3,4],[1,2,3,4]]
val_idx = [4,3,2,1,0]
for f in range(5):
    data_2_train_X = pd.concat([data_2_X[i] for i in train_idx[f]]).reset_index(drop=True)
    data_2_train_Y = pd.concat([data_2_Y[i] for i in train_idx[f]]).reset_index(drop=True)
    data_2_val_X = data_2_X[val_idx[f]]
    data_2_val_Y = data_2_Y[val_idx[f]]
    lgb_train = lgb.Dataset(data_2_train_X, label=data_2_train_Y,weight=compute_sample_weight(class_weight='balanced', y=data_2_train_Y))
    lgb_hold = lgb.Dataset(data_2_val_X , label=data_2_val_Y) 
    model_2 = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_hold],
        valid_names=['train', 'hold'],
        num_boost_round=2000,
        callbacks=[lgb.early_stopping(stopping_rounds=50),lgb.log_evaluation(period=200)],
    )
    # --- 予測 ---
    pred_Y_prob = model_2.predict(test_X, num_iteration=model_2.best_iteration)
    pred_Y_test = np.argmax(pred_Y_prob, axis=1) if test_Y.nunique() > 2 else (pred_Y_prob >= 0.5).astype(int)
    pred_Y_hold = model_2.predict(data_2_val_X, num_iteration=model_2.best_iteration)
    pred_Y_hold = np.argmax(pred_Y_hold, axis=1) if data_2_val_Y.nunique() > 2 else (pred_Y_hold >= 0.5).astype(int)
    # --- 評価 ---
    acc = accuracy_score(test_Y, pred_Y_test)
    cm = confusion_matrix(test_Y, pred_Y_test)
    f1 = f1_score(test_Y, pred_Y_test,average='macro')
    acc_hold = accuracy_score(data_2_val_Y, pred_Y_hold)
    f1_hold = f1_score(data_2_val_Y, pred_Y_hold,average='macro')
    # --- Feature Importance ---
    # --- 結果保存 ---
    print(f"Fold {f} - F1: {f1:.4f}, Accuracy: {acc:.4f}, Hold F1: {f1_hold:.4f}, Hold Accuracy: {acc_hold:.4f}")

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[69]	train's binary_error: 0.187332	hold's binary_error: 0.402514
Fold 0 - F1: 0.2431, Accuracy: 0.2447, Hold F1: 0.5972, Hold Accuracy: 0.5975
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[8]	train's binary_error: 0.247031	hold's binary_error: 0.398049
Fold 1 - F1: 0.2596, Accuracy: 0.2600, Hold F1: 0.6003, Hold Accuracy: 0.6020
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[46]	train's binary_error: 0.204396	hold's binary_error: 0.36534
Fold 2 - F1: 0.2494, Accuracy: 0.2510, Hold F1: 0.6345, Hold Accuracy: 0.6347
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[60]	train's binary_error: 0.196239	hold's binary_error: 0.339885
Fold 3 - F1: 0.2346, Accuracy: 0.2347, Hold F1: 0.6601, Hold Accuracy: 0.6601
Training until validation scores don't imp

In [79]:
distance_1 = np.abs(results[0]['pred_test_prob'] - 0.5)
distance_2 = np.abs(pred_Y_prob - 0.5)
final_result = np.stack([results[0]['pred_test'], pred_Y_test], axis=1)
print( final_result[:5]  )
final_pred =[final_result[i,int(distance_1[i] < distance_2[i])] for i in range(final_result.shape[0])]
# final_pred.shape

[[0 1]
 [0 1]
 [1 0]
 [0 1]
 [0 1]]


In [70]:
[int(distance_1[i] < distance_2[i]) for i in range(10)]

[0, 0, 1, 1, 0, 1, 0, 0, 0, 1]

In [67]:
final_pred[:5]

[0, 0, 1, 0, 0]

In [45]:
(distance_1 < distance_2)

array([False, False,  True, ..., False, False,  True])

In [89]:
final_pred_ = (np.array(final_pred) >= 0.5).astype(int)
final_acc = accuracy_score(test_Y, final_pred)
final_f1 = f1_score(test_Y, final_pred,average='macro')
print(f"Final F1: {final_f1:.4f}, Final Accuracy: {final_acc:.4f}")

Final F1: 0.7823, Final Accuracy: 0.7834


In [80]:
np.array(final_pred).mean()

0.534302415013075

In [None]:
np.abs(results[0]['pred_test_prob'] - 0.5)

In [93]:
final_pred_2 =[results[0]['pred_test_prob'][i]*(distance_1[i]*2) + pred_Y_prob[i]*(1-distance_1[i]*2) for i in range(final_result.shape[0])]
np.array(final_pred_2).mean()

0.5029223981844888

In [86]:
np.array(final_pred_2).shape

(26004,)

In [94]:
final_pred_2_ = (np.array(final_pred_2) >= 0.5).astype(int)
final_acc = accuracy_score(test_Y, final_pred_2_)
final_f1 = f1_score(test_Y, final_pred_2_,average='macro')
final_acc, final_f1

(0.7684586986617443, 0.767394380269367)