In [13]:
import warnings 
warnings.filterwarnings("ignore")

In [14]:
import pandas as pd
import os
import random
import numpy as np

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np
from datetime import datetime, timezone, timedelta
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from wandb.lightgbm import wandb_callback, log_summary

In [15]:
#wandb_callback 수정 
from typing import TYPE_CHECKING, Callable
import wandb
from wandb.sdk.lib import telemetry as wb_telemetry

MINIMIZE_METRICS = [
    "l1",
    "l2",
    "rmse",
    "mape",
    "huber",
    "fair",
    "poisson",
    "gamma",
    "binary_logloss",
]

MAXIMIZE_METRICS = ["map", "auc", "average_precision"]

def _define_metric(data: str, metric_name: str) -> None:
    
    """Capture model performance at the best step.

    instead of the last step, of training in your `wandb.summary`
    """
    if "loss" in str.lower(metric_name):
        wandb.define_metric(f"{data}_{metric_name}", summary="min")
    elif str.lower(metric_name) in MINIMIZE_METRICS:
        wandb.define_metric(f"{data}_{metric_name}", summary="min")
    elif str.lower(metric_name) in MAXIMIZE_METRICS:
        wandb.define_metric(f"{data}_{metric_name}", summary="max")
        
def wandb_callback(log_params: bool = True, define_metric: bool = True) -> Callable:
    """Automatically integrates LightGBM with wandb.

    Arguments:
        log_params: (boolean) if True (default) logs params passed to lightgbm.train as W&B config
        define_metric: (boolean) if True (default) capture model performance at the best step, instead of the last step, of training in your `wandb.summary`

    Passing `wandb_callback` to LightGBM will:
      - log params passed to lightgbm.train as W&B config (default).
      - log evaluation metrics collected by LightGBM, such as rmse, accuracy etc to Weights & Biases
      - Capture the best metric in `wandb.summary` when `define_metric=True` (default).

    Use `log_summary` as an extension of this callback.

    Example:
        ```python
        params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            .
        }
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=10,
                        valid_sets=lgb_eval,
                        valid_names=('validation'),
                        callbacks=[wandb_callback()])
        ```
    """
    log_params_list: "List[bool]" = [log_params]
    define_metric_list: "List[bool]" = [define_metric]

    def _init(env: "CallbackEnv") -> None:
        with wb_telemetry.context() as tel:
            tel.feature.lightgbm_wandb_callback = True

        wandb.config.update(env.params)
        log_params_list[0] = False

        if define_metric_list[0]:
            for i in range(len(env.evaluation_result_list)):
                data_type = env.evaluation_result_list[i][0]
                metric_name = env.evaluation_result_list[i][1]
                _define_metric(data_type, metric_name)

    def _callback(env: "CallbackEnv") -> None:
        if log_params_list[0]:
            _init(env)
        # eval_results: "Dict[str, Dict[str, List[Any]]]" = {}
        # recorder = lightgbm.record_evaluation(eval_results)
        # recorder(env)
        eval_results = {x[0]:{x[1:][0]:x[1:][1:]} for x in env.evaluation_result_list}

        for validation_key in eval_results.keys():
            for key in eval_results[validation_key].keys():
                 wandb.log(
                     {validation_key + "_" + key: eval_results[validation_key][key][0]},
                     commit=False,
                 )
        for item in eval_results:
            if len(item) == 4:
                wandb.log({f"{item[0]}_{item[1]}": item[2]}, commit=False)

        # Previous log statements use commit=False. This commits them.
        wandb.log({"iteration": env.iteration}, commit=True)

    return _callback

## Training

In [16]:
#경로에 맞게 수정
#X=pd.read_parquet('/data/ephemeral/level2-dkt-recsys-06/data/train_ppd_final_sfcv.parquet')
#test=pd.read_parquet('/data/ephemeral/level2-dkt-recsys-06/data/test_ppd_final.parquet')

In [17]:
data_dir = '/data/ephemeral/level2-dkt-recsys-06/data/' # 경로는 상황에 맞춰서 수정해주세요!
csv_file_path = os.path.join(data_dir, 'combined_train.csv') 

X = pd.read_csv('/data/ephemeral/level2-dkt-recsys-06/data/FE_v4_1.csv')
test =  pd.read_csv('/data/ephemeral/level2-dkt-recsys-06/data/FE_Test_v4_1.csv')



In [18]:
y=X["answerCode"]
g=X["userID"]

In [19]:
feat=[ 'userID','assessmentItemID','testId','KnowledgeTag',
       'SolvingTime','CumulativeTime',
       'Month','DayOfWeek','TimeOfDay',
       'problems_cumulative','problems_last7days','problems_last30days',
       'CumulativeUserProblemAnswerRate','CumulativeProblemCount',
       'ProblemAnswerRate','TagAnswerRate','CumulativeUserTagAnswerRate','TestAnswerRate',
       'categorize_solvingTime','categorize_ProblemAnswerRate','categorize_TagAnswerRate','categorize_TestAnswerRate'
]

In [29]:
params = {
    'objective': 'binary', 
    'metric': ['auc'],
    'device': 'cpu'
}

# LabelEncoder 적용
label_encoders = {}
for column in ['DayOfWeek', 'TimeOfDay', 'categorize_ProblemAnswerRate', 
               'categorize_TagAnswerRate', 'categorize_TestAnswerRate']:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    # 테스트 데이터에 대해서는 transform만 적용
    test[column] = le.fit_transform(test[column])
    label_encoders[column] = le


    
n_fold=5
sfcv=StratifiedGroupKFold(n_splits=n_fold)
oof_auc = np.zeros(n_fold)
oof_acc = np.zeros(n_fold)
test_preds = np.zeros(len(test))

for i, (train_idx, val_idx) in enumerate(sfcv.split(X, y, g)):
    print(f"Fold {i}:")
    print(X[X["answerCode"] == -1.0])
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid = X.iloc[val_idx]
    X_valid = X_valid[X_valid['userID'] != X_valid['userID'].shift(-1)]
    y_valid = X_valid["answerCode"]
    print(X_valid)
    print(y_valid)
    break
    lgb_train = lgb.Dataset(X_train[feat], y_train, categorical_feature=["KnowledgeTag"])
    lgb_valid = lgb.Dataset(X_valid[feat], y_valid, categorical_feature=["KnowledgeTag"])
    wandb.init(project="LGBM", config=params)
    wandb.run.name = f"fold{i}lgbm"
    model = lgb.train(
        params, 
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        num_boost_round=500,
        callbacks=[wandb_callback(), lgb.log_evaluation(), lgb.early_stopping(20)],
        categorical_feature=["KnowledgeTag"]
    )
    preds = model.predict(X_valid[feat])
    oof_acc[i] = accuracy_score(y_valid, np.where(preds >= 0.5, 1, 0))
    
    oof_auc[i] = roc_auc_score(y_valid, preds)
    
    test_preds += model.predict(test) / n_fold
    
    print(f'Fold {i} VALID AUC : {oof_auc[i]} ACC : {oof_acc[i]}\n')

Fold 0:
         userID  assessmentItemID  testId  answerCode            Timestamp  \
2989          3          50133008   50133        -1.0  2020-10-26 13:13:57   
3660          4          70146008   70146        -1.0  2020-12-27 02:47:54   
10860        13          70111008   70111        -1.0  2020-12-27 04:35:09   
15278        17          90064006   90064        -1.0  2020-10-30 05:48:37   
23531        26          60135007   60135        -1.0  2020-10-23 11:44:18   
...         ...               ...     ...         ...                  ...   
2525938    7395          40122005   40122        -1.0  2020-09-08 02:05:20   
2526081    7404          30111005   30111        -1.0  2020-10-13 09:49:18   
2526282    7416          50193004   50193        -1.0  2020-10-04 02:44:41   
2526297    7417          50193004   50193        -1.0  2020-09-06 13:09:15   
2526675    7439          40130005   40130        -1.0  2020-10-14 23:10:03   

         KnowledgeTag  SolvingTime  CumulativeTime  Mon

In [21]:
np.mean(oof_auc), np.mean(oof_acc)

(0.0, 0.1286573146292585)

In [None]:
output_dir = 'output/'
write_path = os.path.join(output_dir, datetime.now(timezone(timedelta(hours=9))).strftime("%Y-%m-%d %H:%M:%S")+" lgbm submission.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(test_preds):
        w.write('{},{}\n'.format(id,p))

writing prediction : output/2024-01-18 23:16:09 lgbm submission.csv
