In [14]:
import warnings 
warnings.filterwarnings("ignore")

In [15]:
import pandas as pd
import os
import random
import numpy as np
import yaml
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np
from datetime import datetime, timezone, timedelta
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
from wandb.lightgbm import wandb_callback, log_summary

In [16]:
#wandb_callback 수정 
from typing import TYPE_CHECKING, Callable
import wandb
from wandb.sdk.lib import telemetry as wb_telemetry

MINIMIZE_METRICS = [
    "l1",
    "l2",
    "rmse",
    "mape",
    "huber",
    "fair",
    "poisson",
    "gamma",
    "binary_logloss",
]

MAXIMIZE_METRICS = ["map", "mean_auc", "average_precision"]

# def _define_metric(data: str, metric_name: str) -> None:
    
#     """Capture model performance at the best step.

#     instead of the last step, of training in your `wandb.summary`
#     """
#     if "loss" in str.lower(metric_name):
#         wandb.define_metric(f"{data}_{metric_name}", summary="min")
#     elif str.lower(metric_name) in MINIMIZE_METRICS:
#         wandb.define_metric(f"{data}_{metric_name}", summary="min")
#     elif str.lower(metric_name) in MAXIMIZE_METRICS:
#         wandb.define_metric(f"{data}_{metric_name}", summary="max")
        
def wandb_callback(log_params=True, define_metric=True) -> Callable:
    """Automatically integrates LightGBM with wandb.

    Arguments:
        log_params: (boolean) if True (default) logs params passed to lightgbm.train as W&B config
        define_metric: (boolean) if True (default) capture model performance at the best step, instead of the last step, of training in your `wandb.summary`

    Passing `wandb_callback` to LightGBM will:
      - log params passed to lightgbm.train as W&B config (default).
      - log evaluation metrics collected by LightGBM, such as rmse, accuracy etc to Weights & Biases
      - Capture the best metric in `wandb.summary` when `define_metric=True` (default).

    Use `log_summary` as an extension of this callback.

    Example:
        ```python
        params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            .
        }
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=10,
                        valid_sets=lgb_eval,
                        valid_names=('validation'),
                        callbacks=[wandb_callback()])
        ```
    """
    def _define_metric(data: str, metric_name: str) -> None:
    
        """Capture model performance at the best step.
        instead of the last step, of training in your `wandb.summary`
        """
        if "loss" in str.lower(metric_name):
            wandb.define_metric(f"{data}_{metric_name}", summary="min")
        elif str.lower(metric_name) in MINIMIZE_METRICS:
            wandb.define_metric(f"{data}_{metric_name}", summary="min")
        elif str.lower(metric_name) in MAXIMIZE_METRICS:
            wandb.define_metric(f"{data}_{metric_name}", summary="max")
            
    log_params_list: "List[bool]" = [log_params]
    define_metric_list: "List[bool]" = [define_metric]

    def _init(env: "CallbackEnv") -> None:
        with wb_telemetry.context() as tel:
            tel.feature.lightgbm_wandb_callback = True

        wandb.config.update(env.params)
        log_params_list[0] = False

        if define_metric_list[0]:
            for i in range(len(env.evaluation_result_list)):
                data_type = env.evaluation_result_list[i][0]
                metric_name = env.evaluation_result_list[i][1]
                _define_metric(data_type, metric_name)

    def _callback(env: "CallbackEnv") -> None:
        if log_params_list[0]:
            _init(env)
        # eval_results: "Dict[str, Dict[str, List[Any]]]" = {}
        # recorder = lightgbm.record_evaluation(eval_results)
        # recorder(env)
        eval_results = {x[0]:{x[1:][0]:x[1:][1:]} for x in env.evaluation_result_list}

        for validation_key in eval_results.keys():
            for key in eval_results[validation_key].keys():
                 wandb.log(
                     {validation_key + "_" + key: eval_results[validation_key][key][0]},
                     commit=False,
                 )
        for item in eval_results:
            if len(item) == 4:
                wandb.log({f"{item[0]}_{item[1]}": item[2]}, commit=False)

        # Previous log statements use commit=False. This commits them.
        wandb.log({"iteration": env.iteration}, commit=True)

    return _callback

## Training

In [18]:
sweep_config_path = '/data/ephemeral/level2-dkt-recsys-06/code/boost/lgbmsweepconfigv2.yaml'

# YAML 파일 로드
with open(sweep_config_path, 'r') as file:
    sweep_config = yaml.safe_load(file)

# W&B 스위프트 설정
sweep_id = wandb.sweep(sweep=sweep_config, project="lightgbm-sweep")


Create sweep with ID: n0no616h
Sweep URL: https://wandb.ai/boostcamp6-recsys6/lightgbm-sweep/sweeps/n0no616h


In [19]:

# 노트북의 이름 설정
os.environ['WANDB_NOTEBOOK_NAME'] = 'LGBM_Train.ipynb'


X = pd.read_csv('/data/ephemeral/level2-dkt-recsys-06/data/FE_v4_2.csv')
test =  pd.read_csv('/data/ephemeral/level2-dkt-recsys-06/data/FE_Test_v4_2.csv')
# 유저 최근데이터만 사용
#X = X.groupby('userID').tail(10)


#X = X[X['answerCode'] != -1]


test = test[test["userID"] != test["userID"].shift(-1)]
test = test.drop(["answerCode"], axis=1)

# X.shape
print(X.shape)
print(test.shape)

X.head()
test.head()


(2525956, 23)
(744, 22)


Unnamed: 0,userID,assessmentItemID,testId,KnowledgeTag,SolvingTime,CumulativeTime,Month,DayOfWeek,TimeOfDay,problems_cumulative,...,CumulativeUserProblemAnswerRate,CumulativeProblemCount,ProblemAnswerRate,TagAnswerRate,CumulativeUserTagAnswerRate,TestAnswerRate,categorize_solvingTime,categorize_ProblemAnswerRate,categorize_TagAnswerRate,categorize_TestAnswerRate
1035,3,50133008,50133,5289,45,361,10,Monday,Afternoon,1035,...,69,290,52,54,81,66,6,Difficult,Very Difficult,Medium
1706,4,70146008,70146,9080,24,196,12,Sunday,Dawn,670,...,69,28,53,56,66,74,4,Difficult,Difficult,Easy
3023,13,70111008,70111,9660,14,118,12,Sunday,Dawn,1316,...,69,34,31,44,33,41,2,Extremely Difficult,Extremely Difficult,Extremely Difficult
4283,17,90064006,90064,2611,76,456,10,Friday,Dawn,1259,...,81,624,37,51,100,62,7,Extremely Difficult,Very Difficult,Difficult
4670,26,60135007,60135,1422,45,320,10,Friday,Morning,386,...,75,178,35,60,66,67,6,Extremely Difficult,Difficult,Medium


In [20]:
label = X["answerCode"]
g=X["userID"]

In [21]:
feat=[ 'userID','assessmentItemID','testId','KnowledgeTag',
       'SolvingTime','CumulativeTime',
       'Month','DayOfWeek','TimeOfDay',
       'problems_cumulative','problems_last7days','problems_last30days',
       'CumulativeUserProblemAnswerRate','CumulativeProblemCount',
       'ProblemAnswerRate','TagAnswerRate','CumulativeUserTagAnswerRate','TestAnswerRate',
       'categorize_solvingTime','categorize_ProblemAnswerRate','categorize_TagAnswerRate','categorize_TestAnswerRate'
]

In [22]:
default_config = {
    "num_leaves": 10,  # 최소값 10
    "learning_rate": 0.0001,  # 최소값 0.0001
    "max_depth": -1,  # 최소값 -1 (깊이 제한 없음)
    "min_data_in_leaf": 20,  # 최소값 20
    "feature_fraction": 0.6,  # 최소값 0.6
    "bagging_fraction": 0.6,  # 최소값 0.6
    "bagging_freq": 0,  # 최소값 0
    "lambda_l1": 0.0,  # 최소값 0.0
    "lambda_l2": 0.0,  # 최소값 0.0
    "cat_smooth": 10,  # 최소값 10
}

# LabelEncoder 적용


label_encoders = {}
for column in [
    "DayOfWeek",
    "TimeOfDay",
    "categorize_ProblemAnswerRate",
    "categorize_TagAnswerRate",
    "categorize_TestAnswerRate",
]:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    # 테스트 데이터에 대해서는 transform만 적용
    test[column] = le.transform(test[column])


# n_fold = 10

# sfcv = StratifiedGroupKFold(n_splits=n_fold)


def train():
    # HPPJ = 1
    auc = 0
    acc = 0
    test_preds = np.zeros(len(test))
    # X = X.drop(columns=['Timestamp'])
    # test = test.drop(columns=['Timestamp'])

    # userID별 마지막 인덱스 찾기
    last_indices = X.groupby("userID").tail(1).index

    # 검증 데이터셋 생성
    X_valid = X.loc[last_indices]
    y_valid = X_valid["answerCode"]

    # 학습 데이터셋 생성
    X_train = X.drop(last_indices)
    y_train = X_train["answerCode"]

    # 학습 및 검증 데이터셋 크기 확인
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_valid shape: {X_valid.shape}, y_valid shape: {y_valid.shape}")

    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_valid shape: {X_valid.shape}, y_valid shape: {y_valid.shape}")
    # X_valid = X_valid[X_valid["userID"] != X_valid["userID"].shift(-1)]
    # y_valid = X_valid["answerCode"]
    # print(X_valid)
    # print(y_valid)
    # break ["KnowledgeTag",'userID', 'assessmentItemID', 'Month', 'testId']

    lgb_train = lgb.Dataset(
        X_train[feat], y_train, categorical_feature=["KnowledgeTag", "Month"]
    )
    lgb_valid = lgb.Dataset(
        X_valid[feat], y_valid, categorical_feature=["KnowledgeTag", "Month"]
    )

    wandb.init(project=f"lightgbm-sweep", config=default_config)
    wandb.run.name = f"nofoldlgbm"
    current_params = {
        "objective": "binary",
        "metric": ["auc"],
        "device": "cpu",
        "num_leaves": wandb.config.num_leaves,
        "learning_rate": wandb.config.learning_rate,
        "max_depth": wandb.config.max_depth,
        "min_data_in_leaf": wandb.config.min_data_in_leaf,
        "feature_fraction": wandb.config.feature_fraction,
        "bagging_fraction": wandb.config.bagging_fraction,
        "bagging_freq": wandb.config.bagging_freq,
        "lambda_l1": wandb.config.lambda_l1,
        "lambda_l2": wandb.config.lambda_l2,
        "cat_smooth": wandb.config.cat_smooth,
    }
    model = lgb.train(
        current_params,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        num_boost_round=50000,
        callbacks=[
            wandb_callback(log_params=True, define_metric=True),
            lgb.early_stopping(100),
        ],
        categorical_feature=["KnowledgeTag"],
    )
    preds = model.predict(X_valid[feat])
    acc = accuracy_score(y_valid, np.where(preds >= 0.5, 1, 0))
    auc = roc_auc_score(y_valid, preds)
    test_preds += model.predict(test)
    print(f"VALID AUC : {auc} ACC : {acc}\n")
    wandb.log({"auc": auc, "accuracy": acc})
    output_dir = "output/"
    write_path = os.path.join(
        output_dir,
        # datetime.now(timezone(timedelta(hours=9))).strftime("%Y-%m-%d %H:%M:%S")
        f"auc:{auc} acc:{acc}"
        + "sweep"
        + " lgbm.csv",
    )
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    with open(write_path, "w", encoding="utf8") as w:
        print("writing prediction : {}".format(write_path))
        w.write("id,prediction\n")
        for id, p in enumerate(test_preds):
            w.write("{},{}\n".format(id, p))

In [23]:

wandb.agent(sweep_id, train)



<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>


Exception in thread Thread-114 (_run_job):
Traceback (most recent call last):
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/agents/pyagent.py", line 298, in _run_job
    self._function()
  File "/tmp/ipykernel_389612/1374679570.py", line 91, in train
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/lightgbm/engine.py", line 286, in train
    cb(callback.CallbackEnv(model=booster,
  File "/tmp/ipykernel_389612/4041246805.py", line 100, in _callback
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 420, in wrapper
    return func(self, *args, **kwargs)
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 371, in wrapper_fn
    return func(self, *args, **kwargs)
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 361, in wrapper
    return func(self, *args, **kwargs)
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 1820, i

X_train shape: (2518514, 23), y_train shape: (2518514,)
X_valid shape: (7442, 23), y_valid shape: (7442,)
X_train shape: (2518514, 23), y_train shape: (2518514,)
X_valid shape: (7442, 23), y_valid shape: (7442,)


Traceback (most recent call last):
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1172, in init
    wi.setup(kwargs)
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 225, in setup
    with telemetry.context(obj=self._init_telemetry_obj) as tel:
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/lib/telemetry.py", line 42, in __exit__
    self._run._telemetry_callback(self._obj)
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 758, in _telemetry_callback
    self._telemetry_flush()
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 769, in _telemetry_flush
    self._backend.interface._publish_telemetry(self._telemetry_obj)
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/interface/interface_shared.py", line 101, in _publish_telemetry
    self._publish(rec)
  File "/opt/conda/envs/dkt/lib/python3.10/sit

X_train shape: (2518514, 23), y_train shape: (2518514,)
X_valid shape: (7442, 23), y_valid shape: (7442,)
X_train shape: (2518514, 23), y_train shape: (2518514,)
X_valid shape: (7442, 23), y_valid shape: (7442,)


Traceback (most recent call last):
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1172, in init
    wi.setup(kwargs)
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 225, in setup
    with telemetry.context(obj=self._init_telemetry_obj) as tel:
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/lib/telemetry.py", line 42, in __exit__
    self._run._telemetry_callback(self._obj)
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 758, in _telemetry_callback
    self._telemetry_flush()
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 769, in _telemetry_flush
    self._backend.interface._publish_telemetry(self._telemetry_obj)
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/interface/interface_shared.py", line 101, in _publish_telemetry
    self._publish(rec)
  File "/opt/conda/envs/dkt/lib/python3.10/sit

X_train shape: (2518514, 23), y_train shape: (2518514,)
X_valid shape: (7442, 23), y_valid shape: (7442,)
X_train shape: (2518514, 23), y_train shape: (2518514,)
X_valid shape: (7442, 23), y_valid shape: (7442,)


Traceback (most recent call last):
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1172, in init
    wi.setup(kwargs)
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 225, in setup
    with telemetry.context(obj=self._init_telemetry_obj) as tel:
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/lib/telemetry.py", line 42, in __exit__
    self._run._telemetry_callback(self._obj)
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 758, in _telemetry_callback
    self._telemetry_flush()
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 769, in _telemetry_flush
    self._backend.interface._publish_telemetry(self._telemetry_obj)
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/interface/interface_shared.py", line 101, in _publish_telemetry
    self._publish(rec)
  File "/opt/conda/envs/dkt/lib/python3.10/sit

X_train shape: (2518514, 23), y_train shape: (2518514,)
X_valid shape: (7442, 23), y_valid shape: (7442,)
X_train shape: (2518514, 23), y_train shape: (2518514,)
X_valid shape: (7442, 23), y_valid shape: (7442,)


Traceback (most recent call last):
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1172, in init
    wi.setup(kwargs)
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 225, in setup
    with telemetry.context(obj=self._init_telemetry_obj) as tel:
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/lib/telemetry.py", line 42, in __exit__
    self._run._telemetry_callback(self._obj)
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 758, in _telemetry_callback
    self._telemetry_flush()
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/wandb_run.py", line 769, in _telemetry_flush
    self._backend.interface._publish_telemetry(self._telemetry_obj)
  File "/opt/conda/envs/dkt/lib/python3.10/site-packages/wandb/sdk/interface/interface_shared.py", line 101, in _publish_telemetry
    self._publish(rec)
  File "/opt/conda/envs/dkt/lib/python3.10/sit