In [29]:
import warnings 
warnings.filterwarnings("ignore")

In [30]:
import pandas as pd
import os
import random
import numpy as np

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np
from datetime import datetime, timezone, timedelta
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from wandb.lightgbm import wandb_callback, log_summary

In [31]:
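# Modified copy of wandb's LightGBM `wandb_callback`; the definition below shadows the one imported from `wandb.lightgbm` above.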
from typing import TYPE_CHECKING, Callable
import wandb
from wandb.sdk.lib import telemetry as wb_telemetry

# Lower-case metric names for which `_define_metric` keeps the minimum in the W&B summary.
MINIMIZE_METRICS = [
    "l1", "l2", "rmse", "mape", "huber",
    "fair", "poisson", "gamma", "binary_logloss",
]

# Lower-case metric names for which `_define_metric` keeps the maximum in the W&B summary.
MAXIMIZE_METRICS = [
    "map",
    "auc",
    "average_precision",
]

def _define_metric(data: str, metric_name: str) -> None:
    """Tell W&B which extreme of `<data>_<metric_name>` to keep in `wandb.summary`.

    This makes the summary report the best step of training instead of the last one.

    Arguments:
        data: name of the evaluated dataset; used as the prefix of the W&B metric key.
        metric_name: name of the metric; matched case-insensitively.

    A metric whose lower-cased name contains "loss" or is listed in MINIMIZE_METRICS is
    summarised by its minimum; one listed in MAXIMIZE_METRICS by its maximum. Any other
    metric is left untouched.
    """
    lowered = str.lower(metric_name)

    if "loss" in lowered or lowered in MINIMIZE_METRICS:
        summary_mode = "min"
    elif lowered in MAXIMIZE_METRICS:
        summary_mode = "max"
    else:
        # Unknown direction: do not register anything.
        return

    wandb.define_metric(f"{data}_{metric_name}", summary=summary_mode)
        
def wandb_callback(log_params: bool = True, define_metric: bool = True) -> Callable:
    """Automatically integrates LightGBM with wandb.

    Arguments:
        log_params: (boolean) if True (default) logs params passed to lightgbm.train as W&B config
        define_metric: (boolean) if True (default) capture model performance at the best step, instead of the last step, of training in your `wandb.summary`

    Returns:
        A callable that takes LightGBM's per-iteration callback environment and logs to
        the active W&B run.

    Passing `wandb_callback` to LightGBM will:
      - log params passed to lightgbm.train as W&B config (default).
      - log every entry of `env.evaluation_result_list` as `<dataset>_<metric>`, followed
        by the iteration number, once per boosting round.
      - Capture the best metric in `wandb.summary` when `define_metric=True` (default).

    Use `log_summary` as an extension of this callback.

    Example:
        ```python
        params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
        }
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=10,
                        valid_sets=[lgb_eval],
                        valid_names=['validation'],
                        callbacks=[wandb_callback()])
        ```
    """
    # One-element lists act as mutable cells that the nested functions can update.
    log_params_list: "List[bool]" = [log_params]
    define_metric_list: "List[bool]" = [define_metric]
    # Tracked separately from `log_params` so that `define_metric=True` is still honoured
    # when the caller passes `log_params=False`.
    initialized: "List[bool]" = [False]

    def _init(env: "CallbackEnv") -> None:
        """One-time setup performed on the first iteration the callback sees."""
        with wb_telemetry.context() as tel:
            tel.feature.lightgbm_wandb_callback = True

        if log_params_list[0]:
            wandb.config.update(env.params)
            log_params_list[0] = False

        if define_metric_list[0]:
            # Each entry starts with (dataset name, metric name, value, ...).
            for item in env.evaluation_result_list or []:
                _define_metric(item[0], item[1])

        initialized[0] = True

    def _callback(env: "CallbackEnv") -> None:
        """Log all evaluation results of the current iteration to W&B."""
        if not initialized[0]:
            _init(env)

        # Iterate the raw result list: grouping by dataset name would keep only one metric
        # per dataset when several metrics are evaluated.
        for item in env.evaluation_result_list or []:
            wandb.log({f"{item[0]}_{item[1]}": item[2]}, commit=False)

        # Previous log statements use commit=False. This commits them.
        wandb.log({"iteration": env.iteration}, commit=True)

    return _callback

## Training

In [32]:
#경로에 맞게 수정
#X=pd.read_parquet('/data/ephemeral/level2-dkt-recsys-06/data/train_ppd_final_sfcv.parquet')
#test=pd.read_parquet('/data/ephemeral/level2-dkt-recsys-06/data/test_ppd_final.parquet')

In [33]:
# Directory that holds the input CSVs. Change this single value to point the notebook at
# another location; every path below is derived from it.
data_dir = '/data/ephemeral/level2-dkt-recsys-06/data/'
# Path to 'combined_train.csv' under data_dir. Not read in this cell.
csv_file_path = os.path.join(data_dir, 'combined_train.csv')

# Build both paths from data_dir instead of repeating the absolute directory, so that
# editing data_dir actually changes which files are loaded.
X = pd.read_csv(os.path.join(data_dir, 'FE_v4_1.csv'))
test = pd.read_csv(os.path.join(data_dir, 'FE_Test_v4_1.csv'))



In [34]:
# y: the `answerCode` column of the training frame (passed as the label to the fold splitter).
# g: the `userID` column (passed as the group key to the fold splitter).
y, g = X["answerCode"], X["userID"]

In [35]:
# Column names selected from the training frame for LightGBM, in the order they are passed.
# NOTE(review): `userID` is included as a feature while the CV folds are grouped by userID,
# so validation users are never seen in training — confirm this is intended.
feat = [
    'userID',
    'assessmentItemID',
    'testId',
    'KnowledgeTag',
    'SolvingTime',
    'CumulativeTime',
    'Month',
    'DayOfWeek',
    'TimeOfDay',
    'problems_cumulative',
    'problems_last7days',
    'problems_last30days',
    'CumulativeUserProblemAnswerRate',
    'CumulativeProblemCount',
    'ProblemAnswerRate',
    'TagAnswerRate',
    'CumulativeUserTagAnswerRate',
    'TestAnswerRate',
    'categorize_solvingTime',
    'categorize_ProblemAnswerRate',
    'categorize_TagAnswerRate',
    'categorize_TestAnswerRate',
]

In [36]:
# LightGBM parameters shared by every fold; also sent to W&B as the run config.
params = {
    'objective': 'binary', 
    'metric': ['auc'],
    'device': 'cpu'
}

# Integer-encode the listed columns. Each encoder is fitted on the training frame only
# and then applied to the test frame, so a value that appears only in `test` raises
# ValueError instead of being encoded inconsistently.
label_encoders = {}
for column in ['DayOfWeek', 'TimeOfDay', 'categorize_ProblemAnswerRate', 
               'categorize_TagAnswerRate', 'categorize_TestAnswerRate']:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    # Only transform (never fit) on the test data.
    test[column] = le.transform(test[column])
    label_encoders[column] = le


n_fold=5
sfcv=StratifiedGroupKFold(n_splits=n_fold)
oof_auc = np.zeros(n_fold)   # validation AUC of each fold
oof_acc = np.zeros(n_fold)   # validation accuracy (threshold 0.5) of each fold
# One slot per test row; only the rows listed in `last_indices` are ever filled, with the
# fold-averaged prediction. All other slots stay 0.
test_preds = np.zeros(len(test))

# ---- Test-side preparation: identical for every fold, so it is done once here. ----
if 'Timestamp' in test.columns:
    test = test.drop(columns=['Timestamp'])

# Predict with the columns in exactly the order used for training. LightGBM matches
# DataFrame columns by position, so taking the order from `test.columns` would silently
# feed misaligned features whenever the test CSV is ordered differently from `feat`.
missing_features = [col for col in feat if col not in test.columns]
if missing_features:
    raise KeyError(f"test data is missing feature columns: {missing_features}")
test_features = list(feat)

# Index label of the last row with answerCode == -1 for every user. Users without such a
# row are skipped (the previous per-user loop raised IndexError for them).
last_indices = (
    test.loc[test['answerCode'] == -1]
    .groupby('userID', sort=False)
    .tail(1)
    .index
    .tolist()
)
# Frame holding only the rows to predict.
test_last = test.loc[last_indices]

for i, (train_idx, val_idx) in enumerate(sfcv.split(X, y, g)):
    print(f"Fold {i}:")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid = X.iloc[val_idx]
    # Keep only the last row of each consecutive run of a userID, so validation scores
    # one row per user (assumes a user's rows are contiguous — TODO confirm).
    X_valid = X_valid[X_valid['userID'] != X_valid['userID'].shift(-1)]
    y_valid = X_valid["answerCode"]

    lgb_train = lgb.Dataset(X_train[feat], y_train, categorical_feature=["KnowledgeTag"])
    lgb_valid = lgb.Dataset(X_valid[feat], y_valid, categorical_feature=["KnowledgeTag"])

    # One W&B run per fold, named at creation time.
    wandb.init(project="LGBM", config=params, name=f"fold{i}lgbm")
    model = lgb.train(
        params, 
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        num_boost_round=500,
        callbacks=[wandb_callback(), lgb.log_evaluation(), lgb.early_stopping(20)],
        categorical_feature=["KnowledgeTag"]
    )

    preds = model.predict(X_valid[feat])
    oof_acc[i] = accuracy_score(y_valid, np.where(preds >= 0.5, 1, 0))
    oof_auc[i] = roc_auc_score(y_valid, preds)

    # Accumulate this fold's share of the averaged test prediction. Index labels are used
    # as array positions, which holds because `test` keeps the default RangeIndex that
    # read_csv gave it.
    last_preds = model.predict(test_last[test_features])
    test_preds[last_indices] += last_preds / n_fold

    print(f'Fold {i} VALID AUC : {oof_auc[i]} ACC : {oof_acc[i]}\n')

    # Close the run explicitly so the last fold's run is finalised as well.
    wandb.finish()

Fold 0:




0,1
iteration,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
training_auc,▁▃▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
valid_1_auc,▁▃▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████████

0,1
iteration,228


[LightGBM] [Info] Number of positive: 1322873, number of negative: 697897
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.114743 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3596
[LightGBM] [Info] Number of data points in the train set: 2020770, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654638 -> initscore=0.639490
[LightGBM] [Info] Start training from score 0.639490
[1]	training's auc: 0.815758	valid_1's auc: 0.779584
Training until validation scores don't improve for 20 rounds
[2]	training's auc: 0.818951	valid_1's auc: 0.783728
[3]	training's auc: 0.823073	valid_1's auc: 0.785622
[4]	training's auc: 0.824396	valid_1's auc: 0.786497
[5]	training's auc: 0.825577	valid_1's auc: 0.788166
[6]	training's auc: 0.826326	valid_1's auc: 0.788244
[7]	training's auc: 0.82718	valid_1's auc: 0.788858
[8]	training's auc: 0.827549	valid_1's auc: 0.789922
[9]	training's auc:



0,1
iteration,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
training_auc,▁▃▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████████
valid_1_auc,▁▃▅▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████████████

0,1
iteration,228


[LightGBM] [Info] Number of positive: 1322899, number of negative: 697890
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023895 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3600
[LightGBM] [Info] Number of data points in the train set: 2020789, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654645 -> initscore=0.639519
[LightGBM] [Info] Start training from score 0.639519
[1]	training's auc: 0.815787	valid_1's auc: 0.777028
Training until validation scores don't improve for 20 rounds
[2]	training's auc: 0.820279	valid_1's auc: 0.785055
[3]	training's auc: 0.8234	valid_1's auc: 0.787929
[4]	training's auc: 0.824691	valid_1's auc: 0.790848
[5]	training's auc: 0.82617	valid_1's auc: 0.792507
[6]	training's auc: 0.826795	valid_1's auc: 0.793412
[7]	training's auc: 0.8277	valid_1's auc: 0.794527
[8]	training



0,1
iteration,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
training_auc,▁▃▃▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████
valid_1_auc,▁▃▄▅▅▆▆▆▇▇▇▇▇▇▇▇████████████████████████

0,1
iteration,187


[LightGBM] [Info] Number of positive: 1322856, number of negative: 697893
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027924 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3602
[LightGBM] [Info] Number of data points in the train set: 2020749, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654636 -> initscore=0.639483
[LightGBM] [Info] Start training from score 0.639483
[1]	training's auc: 0.816293	valid_1's auc: 0.782047
Training until validation scores don't improve for 20 rounds
[2]	training's auc: 0.820871	valid_1's auc: 0.789558
[3]	training's auc: 0.823669	valid_1's auc: 0.793849
[4]	training's auc: 0.824753	valid_1's auc: 0.795022
[5]	training's auc: 0.826014	valid_1's auc: 0.796207
[6]	training's auc: 0.827253	valid_1's auc: 0.796942
[7]	training's auc: 0.827971	valid_1's auc: 0.797555
[8]	tra



0,1
iteration,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
training_auc,▁▂▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇███████████
valid_1_auc,▁▃▅▆▆▆▆▇▇▇▇▇▇▇███▇██████████████████████

0,1
iteration,76


[LightGBM] [Info] Number of positive: 1322858, number of negative: 697894
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023574 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3598
[LightGBM] [Info] Number of data points in the train set: 2020752, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654636 -> initscore=0.639483
[LightGBM] [Info] Start training from score 0.639483
[1]	training's auc: 0.815499	valid_1's auc: 0.77696
Training until validation scores don't improve for 20 rounds
[2]	training's auc: 0.820063	valid_1's auc: 0.784062
[3]	training's auc: 0.823078	valid_1's auc: 0.787882
[4]	training's auc: 0.824626	valid_1's auc: 0.791972
[5]	training's auc: 0.825682	valid_1's auc: 0.792895
[6]	training's auc: 0.826469	valid_1's auc: 0.79535
[7]	training's auc: 0.827045	valid_1's auc: 0.795121
[8]	train



0,1
iteration,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
training_auc,▁▃▃▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇████████████
valid_1_auc,▁▄▅▅▆▆▇▇▇▇▇▇▇▇▇█████████████████████████

0,1
iteration,95


[LightGBM] [Info] Number of positive: 1322866, number of negative: 697898
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.118069 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3599
[LightGBM] [Info] Number of data points in the train set: 2020764, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654637 -> initscore=0.639483
[LightGBM] [Info] Start training from score 0.639483
[1]	training's auc: 0.816828	valid_1's auc: 0.785699
Training until validation scores don't improve for 20 rounds
[2]	training's auc: 0.820004	valid_1's auc: 0.792052
[3]	training's auc: 0.822402	valid_1's auc: 0.792518
[4]	training's auc: 0.824578	valid_1's auc: 0.795087
[5]	training's auc: 0.825708	valid_1's auc: 0.794851
[6]	training's auc: 0.826849	valid_1's auc: 0.795204
[7]	training's auc: 0.82776	valid_1's auc: 0.795136
[8]	training's auc: 0.828289	valid_1's auc: 0.795257
[9]	training's auc:

In [39]:
# Mean validation AUC and accuracy across the folds (displayed as a tuple).
oof_auc.mean(), oof_acc.mean()

(0.805614173171351, 0.7264321254674632)

In [41]:
# Write the submission file: one `id,prediction` line per predicted test row.
output_dir = 'output/'
# File name carries the current time in UTC+9.
write_path = os.path.join(output_dir, datetime.now(timezone(timedelta(hours=9))).strftime("%Y-%m-%d %H:%M:%S")+" lgbm submission.csv")
os.makedirs(output_dir, exist_ok=True)

# `test_preds` has one slot per test row, but the training loop only fills the last row
# with answerCode == -1 of each user; every other slot is a 0 placeholder. Select exactly
# those filled rows so the file does not contain a 0.0 line for every non-target row.
target_rows = (
    test.loc[test['answerCode'] == -1]
    .groupby('userID', sort=False)
    .tail(1)
    .index
    .to_numpy()
)
# Index labels double as array positions, matching how the training loop stored the values.
submission_preds = test_preds[target_rows]

with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    # `row_id` avoids shadowing the builtin `id`.
    for row_id, p in enumerate(submission_preds):
        w.write('{},{}\n'.format(row_id,p))

writing prediction : output/2024-01-20 06:42:44 lgbm submission.csv


In [None]:
# Intended step: keep only the rows whose 'prediction' value is non-zero.
# NOTE(review): `non_zero_predictions_df` is not defined in any cell visible here, and this
# line does no filtering itself — it only adds an `id` column copied from the frame's index.
# Confirm where that frame is built; otherwise this cell raises NameError on a fresh kernel.
final_df = non_zero_predictions_df.assign(id=non_zero_predictions_df.index)
