In [1]:
import warnings 
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import os
import random
import numpy as np
import yaml
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np
import torch
from datetime import datetime, timezone, timedelta
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
from wandb.lightgbm import wandb_callback, log_summary

In [3]:
# Modified wandb_callback
from typing import TYPE_CHECKING, Callable
import wandb
from wandb.sdk.lib import telemetry as wb_telemetry

# Lower-case metric names whose W&B summary should track the minimum value.
MINIMIZE_METRICS = "l1 l2 rmse mape huber fair poisson gamma binary_logloss".split()

# Lower-case metric names whose W&B summary should track the maximum value.
MAXIMIZE_METRICS = "map auc average_precision".split()

def set_seeds(seed: int = 42):
    """Seed the random number generators so repeated runs give the same results.

    Args:
        seed: Value written to the ``PYTHONHASHSEED`` environment variable and
            passed to the ``random``, NumPy and PyTorch (CPU and CUDA) seeders.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Apply the same seed to each library's generator, in a fixed order.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    ):
        seed_fn(seed)

    # Ask cuDNN to use deterministic algorithms.
    torch.backends.cudnn.deterministic = True
        
def wandb_callback(log_params=True, define_metric=True) -> Callable:
    """Build a LightGBM callback that streams training metrics to Weights & Biases.

    Arguments:
        log_params: (boolean) if True (default), the params passed to
            lightgbm.train are written to `wandb.config` on the first iteration.
        define_metric: (boolean) if True (default), each evaluated metric is
            registered with `wandb.define_metric` on the first iteration so that
            `wandb.summary` keeps its best value (min or max) instead of the
            last one. This works independently of `log_params`.

    Returns:
        A callable taking LightGBM's per-iteration callback environment. On every
        iteration it logs one `<dataset>_<metric>` value per entry of
        `env.evaluation_result_list`, then commits them with the iteration index.

    Example:
        ```python
        params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            ...
        }
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=10,
                        valid_sets=lgb_eval,
                        valid_names=['validation'],
                        callbacks=[wandb_callback()])
        ```
    """
    # Flipped to True after the first iteration so the setup runs only once.
    initialized = False

    def _define_metric(data: str, metric_name: str) -> None:
        """Tell W&B whether the summary of `<data>_<metric_name>` is its min or max."""
        lowered = metric_name.lower()
        if "loss" in lowered or lowered in MINIMIZE_METRICS:
            wandb.define_metric(f"{data}_{metric_name}", summary="min")
        elif lowered in MAXIMIZE_METRICS:
            wandb.define_metric(f"{data}_{metric_name}", summary="max")
        # Metrics in neither list keep W&B's default summary behaviour.

    def _init(env: "CallbackEnv") -> None:
        """One-time setup performed on the first callback invocation."""
        with wb_telemetry.context() as tel:
            tel.feature.lightgbm_wandb_callback = True

        if log_params:
            wandb.config.update(env.params)

        if define_metric:
            for item in env.evaluation_result_list:
                # item[0] is the dataset name, item[1] the metric name.
                _define_metric(item[0], item[1])

    def _callback(env: "CallbackEnv") -> None:
        nonlocal initialized
        if not initialized:
            _init(env)
            initialized = True

        # Log every (dataset, metric) entry directly. Grouping them in a dict
        # keyed only by dataset name would keep just one metric per dataset.
        for item in env.evaluation_result_list:
            wandb.log({f"{item[0]}_{item[1]}": item[2]}, commit=False)

        # Previous log statements use commit=False. This commits them.
        wandb.log({"iteration": env.iteration}, commit=True)

    return _callback

## Training

In [4]:
# Set the notebook name reported to W&B.
os.environ["WANDB_NOTEBOOK_NAME"] = "LGBM_Train.ipynb"

# Load the sweep definition from its YAML file.
sweep_config_path = "/data/ephemeral/level2-dkt-recsys-06/code/boost/lgbmsweepconfigv2.yaml"
with open(sweep_config_path) as config_file:
    sweep_config = yaml.safe_load(config_file)

# Register the W&B sweep and keep its ID.
sweep_id = wandb.sweep(project="lightgbm-sweep", sweep=sweep_config)

# Fix the random seeds.
set_seeds()


Create sweep with ID: r2twjdzb
Sweep URL: https://wandb.ai/boostcamp6-recsys6/lightgbm-sweep/sweeps/r2twjdzb


In [5]:
# Load the train and test feature tables, each sorted by user, timestamp and
# item, with a fresh 0..n-1 index.
X = (
    pd.read_csv("/data/ephemeral/level2-dkt-recsys-06/data/FE_v7.csv")
    .sort_values(by=["userID", "Timestamp", "assessmentItemID"])
    .reset_index(drop=True)
)
test = (
    pd.read_csv("/data/ephemeral/level2-dkt-recsys-06/data/FE_test_v7.csv")
    .sort_values(by=["userID", "Timestamp", "assessmentItemID"])
    .reset_index(drop=True)
)

# test = test[test["userID"] != test["userID"].shift(-1)]
# test = test.drop(["answerCode"], axis=1)

# Count and report the distinct user IDs in the training table.
unique_user_count = X["userID"].nunique()
print(f"userIDnum {unique_user_count}")

# Last row of every user in the test table.
last_rows = test.groupby("userID").tail(1)
print(last_rows)

# Check whether every one of those last rows has answerCode == -1.
are_last_answers_minus_one = last_rows["answerCode"].eq(-1).all()
print(are_last_answers_minus_one)

# Shape of the test table.
print(test.shape)



userIDnum 7442
        userID  assessmentItemID  testId  answerCode            Timestamp  \
1035         3          50133008   50133          -1  2020-10-26 13:13:57   
1706         4          70146008   70146          -1  2020-12-27 02:47:54   
3023        13          70111008   70111          -1  2020-12-27 04:35:09   
4283        17          90064006   90064          -1  2020-10-30 05:48:37   
4670        26          60135007   60135          -1  2020-10-23 11:44:18   
...        ...               ...     ...         ...                  ...   
260052    7395          40122005   40122          -1  2020-09-08 02:05:20   
260067    7404          30111005   30111          -1  2020-10-13 09:49:18   
260082    7416          50193004   50193          -1  2020-10-04 02:44:41   
260097    7417          50193004   50193          -1  2020-09-06 13:09:15   
260113    7439          40130005   40130          -1  2020-10-14 23:10:03   

        KnowledgeTag  Itemseq  SolvingTime  CumulativeTime  

In [6]:
# Keep only the rows with answerCode == -1 in `test`, and remove such rows from `X`.
# .copy() makes each result an independent DataFrame, so the column assignments
# done in later cells write to these frames directly rather than to the result
# of a filtered selection (the situation pandas' SettingWithCopyWarning is about).
test = test[test["answerCode"] == -1].copy()
X = X[X["answerCode"] != -1].copy()

In [7]:

# Apply LabelEncoder to the categorical feature columns.
# Every column is listed exactly once: encoding a column a second time would
# refit the encoder on the already-encoded integers and overwrite the entry in
# `label_encoders`, losing the mapping back to the original labels.
label_encoders = {}
for column in [
    "categorize_ItemAnswerRate",
    "categorize_TagAnswerRate",
    "categorize_TestAnswerRate",
    "categorize_CumulativeUserItemAnswerRate",
    "categorize_CumulativeUserTagAverageAnswerRate",
    "categorize_CumulativeUserTagExponentialAverage",
    "DayOfWeek",
    "TimeOfDay",
    "UserRecentTagAnswer",
    "PreviousItemAnswer",
]:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    # The test data is only transformed, using the encoder fitted on X.
    test[column] = le.transform(test[column])
    label_encoders[column] = le


In [8]:
# print(X.shape)

# # Select the columns to one-hot encode
# columns_to_encode = [
#     "UserRecentTagAnswer",
#     "PreviousItemAnswer",
#     # Add any other columns that should be one-hot encoded here
# ]
# for column in columns_to_encode:
#     if column in X.columns:
#         X = pd.get_dummies(X, columns=[column])
#     if column in test.columns:
#         test = pd.get_dummies(test, columns=[column])

#         # Apply the same to the other fields
# print(X.shape)

# Moved to label encoding


In [9]:
# Every column of the training table.
feat = list(X.columns)

# Columns left out of the feature list built below.
exclude_columns = [
    "Timestamp",
    "answerCode",
    "DayOfWeek",
    "WeekOfYear",
    "UserAvgSolvingTime",
    "PastItemCount",
    "user_tag_total_answer",
    "categorize_CumulativeUserTagExponentialAverage",
    "categorize_CumulativeUserTagAverageAnswerRate",
    "categorize_TestAnswerRate",
    "categorize_TagAnswerRate",
]

# Remaining columns, in their original order.
filtered_feat = list(filter(lambda name: name not in exclude_columns, feat))

print(X[feat].shape)
print(X[filtered_feat].shape)
print(feat)

(2525956, 54)
(2525956, 43)
['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'Itemseq', 'SolvingTime', 'CumulativeTime', 'Month', 'DayOfWeek', 'TimeOfDay', 'WeekOfYear', 'UserAvgSolvingTime', 'CumulativeItemCount', 'Item_last7days', 'Item_last30days', 'PastItemCount', 'CumulativeUserItemAnswerRate', 'ItemAnswerRate', 'AverageItemSolvingTime_Correct', 'AverageItemSolvingTime_Incorrect', 'AverageItemSolvingTime', 'Difference_SolvingTime_AvgItemSolvingTime', 'UserTagAvgSolvingTime', 'TagAnswerRate', 'CumulativeUserTagAverageAnswerRate', 'CumulativeUserTagExponentialAverage', 'UserCumulativeTagCount', 'UserRecentTagAnswer', 'PreviousItemAnswer', 'TestAnswerRate', 'categorize_solvingTime', 'categorize_ItemAnswerRate', 'categorize_TagAnswerRate', 'categorize_TestAnswerRate', 'categorize_CumulativeUserItemAnswerRate', 'categorize_CumulativeUserTagAverageAnswerRate', 'categorize_CumulativeUserTagExponentialAverage', 'user_test_correct_answer', 'user_test_tota

In [10]:
# Default hyperparameter values (each comment keeps the original "minimum value" note).
default_config = dict(
    num_leaves=10,  # minimum value 10
    learning_rate=0.0001,  # minimum value 0.0001
    max_depth=-1,  # -1 (no depth limit)
    min_data_in_leaf=20,  # minimum value 20
    feature_fraction=0.6,  # minimum value 0.6
    bagging_fraction=0.6,  # minimum value 0.6
    bagging_freq=0,  # minimum value 0
    lambda_l1=0.0,  # minimum value 0.0
    lambda_l2=0.0,  # minimum value 0.0
    cat_smooth=10,  # minimum value 10
)



In [11]:

def train(seed: int = 42):
    """Run one sweep trial: fit LightGBM, score it, and write test predictions.

    Reads the notebook-level globals `X`, `test`, `filtered_feat` and
    `default_config`. The hyperparameters come from `wandb.config` after
    `wandb.init`, which is started with `default_config` as its config.

    Steps:
        1. Sample 20% of each user's rows as the validation set; the rest is
           the training set.
        2. Train for up to 500 rounds with early stopping (patience 30),
           logging metrics through `wandb_callback`.
        3. Compute validation AUC / accuracy (threshold 0.5), then print and
           log them.
        4. Write test predictions to a CSV under ``output/``.
        5. Print the feature importances.

    Args:
        seed: `random_state` of the per-user validation sample. Keeping it
            fixed gives every sweep run the same split, so validation scores
            of different hyperparameter sets are directly comparable.

    Returns:
        None. Results are logged, printed and written to disk.
    """
    # Hold out 20% of every user's rows for validation (same rows on every call).
    # NOTE(review): the split is random within each user's rows, not
    # chronological -- confirm that this is the intended validation scheme.
    sampled_indices = X.groupby("userID").sample(frac=0.2, random_state=seed).index

    # Training set: everything that was not sampled.
    X_train = X.drop(sampled_indices)
    y_train = X_train["answerCode"]

    # Validation set: the sampled rows.
    X_valid = X.loc[sampled_indices]
    y_valid = X_valid["answerCode"]

    # Declare the categorical columns on the Dataset itself; the validation
    # Dataset references the training one.
    categorical_features = [
        "userID",
        "assessmentItemID",
        "testId",
        "KnowledgeTag",
        "Month",
    ]
    lgb_train = lgb.Dataset(
        X_train[filtered_feat], y_train, categorical_feature=categorical_features
    )
    lgb_valid = lgb.Dataset(X_valid[filtered_feat], y_valid, reference=lgb_train)

    wandb.init(project="lightgbm-sweep", config=default_config)
    wandb.run.name = "nofoldlgbm"
    current_params = {
        "objective": "binary",
        "metric": ["auc"],
        "device": "cpu",
        "num_leaves": wandb.config.num_leaves,
        "learning_rate": wandb.config.learning_rate,
        "max_depth": wandb.config.max_depth,
        "min_data_in_leaf": wandb.config.min_data_in_leaf,
        "feature_fraction": wandb.config.feature_fraction,
        "bagging_fraction": wandb.config.bagging_fraction,
        "bagging_freq": wandb.config.bagging_freq,
        "lambda_l1": wandb.config.lambda_l1,
        "lambda_l2": wandb.config.lambda_l2,
        "cat_smooth": wandb.config.cat_smooth,
    }
    model = lgb.train(
        current_params,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        num_boost_round=500,
        callbacks=[
            wandb_callback(log_params=True, define_metric=True),
            lgb.early_stopping(30),
        ],
    )

    # Validation metrics; accuracy uses a 0.5 decision threshold.
    preds = model.predict(X_valid[filtered_feat])
    acc = accuracy_score(y_valid, np.where(preds >= 0.5, 1, 0))
    auc = roc_auc_score(y_valid, preds)
    test_preds = model.predict(test[filtered_feat])
    print(f"VALID AUC : {auc} ACC : {acc}\n")
    wandb.log({"auc": auc, "accuracy": acc})

    # Write the predictions; the file name embeds the validation scores.
    output_dir = "output/"
    write_path = os.path.join(
        output_dir,
        f"auc:{auc} acc:{acc}" + "sweep" + " lgbm.csv",
    )
    os.makedirs(output_dir, exist_ok=True)
    with open(write_path, "w", encoding="utf8") as w:
        print("writing prediction : {}".format(write_path))
        w.write("id,prediction\n")
        for row_id, p in enumerate(test_preds):
            w.write("{},{}\n".format(row_id, p))

    # Feature importances, most important first.
    feature_importances = model.feature_importance()
    feature_names = model.feature_name()
    importance_df = pd.DataFrame(
        {"Feature": feature_names, "Importance": feature_importances}
    ).sort_values(by="Importance", ascending=False)

    print(importance_df)

In [12]:

# Start a sweep agent that runs train() for the sweep registered as sweep_id.
wandb.agent(sweep_id, function=train)

[34m[1mwandb[0m: Agent Starting Run: loy20q50 with config:
[34m[1mwandb[0m: 	bagging_fraction: 0.807402458327157
[34m[1mwandb[0m: 	bagging_freq: 22
[34m[1mwandb[0m: 	feature_fraction: 0.7646322674960475
[34m[1mwandb[0m: 	lambda_l1: 13.65236495769416
[34m[1mwandb[0m: 	lambda_l2: 8.223160188953639
[34m[1mwandb[0m: 	learning_rate: 0.34516061244560814
[34m[1mwandb[0m: 	max_depth: 35
[34m[1mwandb[0m: 	min_data_in_leaf: 39
[34m[1mwandb[0m: 	num_leaves: 18
[34m[1mwandb[0m: Currently logged in as: [33mwooksbaby[0m ([33mboostcamp6-recsys6[0m). Use [1m`wandb login --relogin`[0m to force relogin


[LightGBM] [Info] Number of positive: 1649973, number of negative: 868541
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22915
[LightGBM] [Info] Number of data points in the train set: 2518514, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655138 -> initscore=0.641699
[LightGBM] [Info] Start training from score 0.641699




[1]	training's auc: 0.813461	valid_1's auc: 0.786163
Training until validation scores don't improve for 30 rounds
[2]	training's auc: 0.827477	valid_1's auc: 0.799917
[3]	training's auc: 0.832455	valid_1's auc: 0.804357
[4]	training's auc: 0.834932	valid_1's auc: 0.805964
[5]	training's auc: 0.837546	valid_1's auc: 0.808406
[6]	training's auc: 0.839662	valid_1's auc: 0.811545
[7]	training's auc: 0.841548	valid_1's auc: 0.813097
[8]	training's auc: 0.843117	valid_1's auc: 0.814675
[9]	training's auc: 0.844323	valid_1's auc: 0.816249
[10]	training's auc: 0.84576	valid_1's auc: 0.818031
[11]	training's auc: 0.846818	valid_1's auc: 0.819486
[12]	training's auc: 0.847811	valid_1's auc: 0.819863
[13]	training's auc: 0.848853	valid_1's auc: 0.820578
[14]	training's auc: 0.84951	valid_1's auc: 0.821385
[15]	training's auc: 0.850209	valid_1's auc: 0.822012
[16]	training's auc: 0.851032	valid_1's auc: 0.823234
[17]	training's auc: 0.851589	valid_1's auc: 0.823445
[18]	training's auc: 0.852036	va

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
auc,▁
iteration,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
training_auc,▁▂▃▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
valid_1_auc,▁▂▄▅▆▇▇▇▇▇▇█▇▇██████████████████████████

0,1
accuracy,0.75343
auc,0.8317
iteration,155.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: pvjx05z3 with config:
[34m[1mwandb[0m: 	bagging_fraction: 0.883457554593565
[34m[1mwandb[0m: 	bagging_freq: 19
[34m[1mwandb[0m: 	feature_fraction: 0.7186418128523652
[34m[1mwandb[0m: 	lambda_l1: 0.7503680184599953
[34m[1mwandb[0m: 	lambda_l2: 13.979583178643816
[34m[1mwandb[0m: 	learning_rate: 0.6881418116240974
[34m[1mwandb[0m: 	max_depth: 25
[34m[1mwandb[0m: 	min_data_in_leaf: 114
[34m[1mwandb[0m: 	num_leaves: 17


[LightGBM] [Info] Number of positive: 1649973, number of negative: 868541
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22915
[LightGBM] [Info] Number of data points in the train set: 2518514, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655138 -> initscore=0.641699
[LightGBM] [Info] Start training from score 0.641699




[1]	training's auc: 0.813055	valid_1's auc: 0.785457
Training until validation scores don't improve for 30 rounds
[2]	training's auc: 0.828391	valid_1's auc: 0.79949
[3]	training's auc: 0.834867	valid_1's auc: 0.806275
[4]	training's auc: 0.838102	valid_1's auc: 0.809981
[5]	training's auc: 0.841222	valid_1's auc: 0.813533
[6]	training's auc: 0.843423	valid_1's auc: 0.817168
[7]	training's auc: 0.845063	valid_1's auc: 0.818856
[8]	training's auc: 0.84631	valid_1's auc: 0.820293
[9]	training's auc: 0.847176	valid_1's auc: 0.820744
[10]	training's auc: 0.848341	valid_1's auc: 0.82145
[11]	training's auc: 0.849122	valid_1's auc: 0.821552
[12]	training's auc: 0.850056	valid_1's auc: 0.823827
[13]	training's auc: 0.8506	valid_1's auc: 0.82361
[14]	training's auc: 0.851451	valid_1's auc: 0.824759
[15]	training's auc: 0.852113	valid_1's auc: 0.825514
[16]	training's auc: 0.852649	valid_1's auc: 0.825692
[17]	training's auc: 0.853165	valid_1's auc: 0.825805
[18]	training's auc: 0.85373	valid_1

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
auc,▁
iteration,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
training_auc,▁▂▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇█████████
valid_1_auc,▁▃▆▆▇▇▇▇██▇▇▇███████████████████████████

0,1
accuracy,0.75195
auc,0.83056
iteration,143.0


[34m[1mwandb[0m: Agent Starting Run: qt2htesv with config:
[34m[1mwandb[0m: 	bagging_fraction: 0.5661185198222595
[34m[1mwandb[0m: 	bagging_freq: 25
[34m[1mwandb[0m: 	feature_fraction: 0.5234489682650816
[34m[1mwandb[0m: 	lambda_l1: 12.234416177398662
[34m[1mwandb[0m: 	lambda_l2: 5.535617794968749
[34m[1mwandb[0m: 	learning_rate: 0.3015246483438214
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	min_data_in_leaf: 103
[34m[1mwandb[0m: 	num_leaves: 11


[LightGBM] [Info] Number of positive: 1649973, number of negative: 868541
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22915
[LightGBM] [Info] Number of data points in the train set: 2518514, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655138 -> initscore=0.641699
[LightGBM] [Info] Start training from score 0.641699




[1]	training's auc: 0.806573	valid_1's auc: 0.780369
Training until validation scores don't improve for 30 rounds
[2]	training's auc: 0.818366	valid_1's auc: 0.789881
[3]	training's auc: 0.824292	valid_1's auc: 0.795286
[4]	training's auc: 0.828024	valid_1's auc: 0.8008
[5]	training's auc: 0.829806	valid_1's auc: 0.8012
[6]	training's auc: 0.832001	valid_1's auc: 0.802942
[7]	training's auc: 0.834935	valid_1's auc: 0.805881
[8]	training's auc: 0.836594	valid_1's auc: 0.807451
[9]	training's auc: 0.838011	valid_1's auc: 0.809664
[10]	training's auc: 0.839586	valid_1's auc: 0.811088
[11]	training's auc: 0.840902	valid_1's auc: 0.812352
[12]	training's auc: 0.841777	valid_1's auc: 0.81331
[13]	training's auc: 0.842799	valid_1's auc: 0.81428
[14]	training's auc: 0.843521	valid_1's auc: 0.815229
[15]	training's auc: 0.844178	valid_1's auc: 0.816168
[16]	training's auc: 0.844921	valid_1's auc: 0.816527
[17]	training's auc: 0.845588	valid_1's auc: 0.81683
[18]	training's auc: 0.846035	valid_1

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
auc,▁
iteration,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
training_auc,▁▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████████████
valid_1_auc,▁▃▄▅▆▆▇▇▇▇▇▇████████████████████████████

0,1
accuracy,0.74966
auc,0.82702
iteration,154.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: va55dbye with config:
[34m[1mwandb[0m: 	bagging_fraction: 0.77420843971161
[34m[1mwandb[0m: 	bagging_freq: 14
[34m[1mwandb[0m: 	feature_fraction: 0.7547317599963481
[34m[1mwandb[0m: 	lambda_l1: 4.724867891176504
[34m[1mwandb[0m: 	lambda_l2: 10.2520904186045
[34m[1mwandb[0m: 	learning_rate: 0.2284576852032626
[34m[1mwandb[0m: 	max_depth: 34
[34m[1mwandb[0m: 	min_data_in_leaf: 96
[34m[1mwandb[0m: 	num_leaves: 20


[LightGBM] [Info] Number of positive: 1649973, number of negative: 868541
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22915
[LightGBM] [Info] Number of data points in the train set: 2518514, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655138 -> initscore=0.641699
[LightGBM] [Info] Start training from score 0.641699




[1]	training's auc: 0.814722	valid_1's auc: 0.787957
Training until validation scores don't improve for 30 rounds
[2]	training's auc: 0.825002	valid_1's auc: 0.798518
[3]	training's auc: 0.830336	valid_1's auc: 0.804095
[4]	training's auc: 0.833041	valid_1's auc: 0.806398
[5]	training's auc: 0.834823	valid_1's auc: 0.807946
[6]	training's auc: 0.836846	valid_1's auc: 0.808785
[7]	training's auc: 0.838431	valid_1's auc: 0.811297
[8]	training's auc: 0.839971	valid_1's auc: 0.812761
[9]	training's auc: 0.841098	valid_1's auc: 0.81383
[10]	training's auc: 0.842084	valid_1's auc: 0.814985
[11]	training's auc: 0.843365	valid_1's auc: 0.816304
[12]	training's auc: 0.844358	valid_1's auc: 0.816681
[13]	training's auc: 0.845132	valid_1's auc: 0.817477
[14]	training's auc: 0.845787	valid_1's auc: 0.818153
[15]	training's auc: 0.8465	valid_1's auc: 0.819279
[16]	training's auc: 0.847333	valid_1's auc: 0.820094
[17]	training's auc: 0.848138	valid_1's auc: 0.820446
[18]	training's auc: 0.848695	val

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
auc,▁
iteration,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
training_auc,▁▂▃▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇██████████
valid_1_auc,▁▃▄▅▅▆▆▇▇▇▇▇▇███████████████████████████

0,1
accuracy,0.74966
auc,0.83072
iteration,127.0


[34m[1mwandb[0m: Agent Starting Run: zspcodcb with config:
[34m[1mwandb[0m: 	bagging_fraction: 0.8295399869384589
[34m[1mwandb[0m: 	bagging_freq: 21
[34m[1mwandb[0m: 	feature_fraction: 0.70535551115946
[34m[1mwandb[0m: 	lambda_l1: 12.531194153489814
[34m[1mwandb[0m: 	lambda_l2: 8.658321331563497
[34m[1mwandb[0m: 	learning_rate: 0.3039082768650463
[34m[1mwandb[0m: 	max_depth: 29
[34m[1mwandb[0m: 	min_data_in_leaf: 59
[34m[1mwandb[0m: 	num_leaves: 32


[LightGBM] [Info] Number of positive: 1649973, number of negative: 868541
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22915
[LightGBM] [Info] Number of data points in the train set: 2518514, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655138 -> initscore=0.641699
[LightGBM] [Info] Start training from score 0.641699




[1]	training's auc: 0.821851	valid_1's auc: 0.796783
Training until validation scores don't improve for 30 rounds
[2]	training's auc: 0.832469	valid_1's auc: 0.805882
[3]	training's auc: 0.836345	valid_1's auc: 0.810517
[4]	training's auc: 0.839232	valid_1's auc: 0.813473
[5]	training's auc: 0.841465	valid_1's auc: 0.815367
[6]	training's auc: 0.843428	valid_1's auc: 0.816829
[7]	training's auc: 0.845033	valid_1's auc: 0.817848
[8]	training's auc: 0.846351	valid_1's auc: 0.819192
[9]	training's auc: 0.847654	valid_1's auc: 0.821466
[10]	training's auc: 0.848839	valid_1's auc: 0.822122
[11]	training's auc: 0.849878	valid_1's auc: 0.822942
[12]	training's auc: 0.85081	valid_1's auc: 0.823985
[13]	training's auc: 0.851852	valid_1's auc: 0.824551
[14]	training's auc: 0.852582	valid_1's auc: 0.825315
[15]	training's auc: 0.853347	valid_1's auc: 0.825742
[16]	training's auc: 0.854251	valid_1's auc: 0.826232
[17]	training's auc: 0.854874	valid_1's auc: 0.826433
[18]	training's auc: 0.855428	v

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
auc,▁
iteration,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
training_auc,▁▃▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█████████
valid_1_auc,▁▄▅▅▆▆▆▇▇▇▇▇▇███████████████████████████

0,1
accuracy,0.75544
auc,0.83369
iteration,87.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: eyikrwei with config:
[34m[1mwandb[0m: 	bagging_fraction: 0.7775070139382719
[34m[1mwandb[0m: 	bagging_freq: 26
[34m[1mwandb[0m: 	feature_fraction: 0.7682180828927625
[34m[1mwandb[0m: 	lambda_l1: 9.63824499679918
[34m[1mwandb[0m: 	lambda_l2: 7.885569128871165
[34m[1mwandb[0m: 	learning_rate: 0.25445130707270025
[34m[1mwandb[0m: 	max_depth: 32
[34m[1mwandb[0m: 	min_data_in_leaf: 55
[34m[1mwandb[0m: 	num_leaves: 31


[LightGBM] [Info] Number of positive: 1649973, number of negative: 868541
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22915
[LightGBM] [Info] Number of data points in the train set: 2518514, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655138 -> initscore=0.641699
[LightGBM] [Info] Start training from score 0.641699




[1]	training's auc: 0.822877	valid_1's auc: 0.799112
Training until validation scores don't improve for 30 rounds
[2]	training's auc: 0.831549	valid_1's auc: 0.805722
[3]	training's auc: 0.835549	valid_1's auc: 0.809371
[4]	training's auc: 0.83766	valid_1's auc: 0.811941
[5]	training's auc: 0.839952	valid_1's auc: 0.814244
[6]	training's auc: 0.841753	valid_1's auc: 0.815416
[7]	training's auc: 0.843371	valid_1's auc: 0.816684
[8]	training's auc: 0.844696	valid_1's auc: 0.818588
[9]	training's auc: 0.845711	valid_1's auc: 0.8204
[10]	training's auc: 0.846893	valid_1's auc: 0.820941
[11]	training's auc: 0.848034	valid_1's auc: 0.82162
[12]	training's auc: 0.849112	valid_1's auc: 0.82217
[13]	training's auc: 0.849931	valid_1's auc: 0.822708
[14]	training's auc: 0.85077	valid_1's auc: 0.823421
[15]	training's auc: 0.85151	valid_1's auc: 0.824276
[16]	training's auc: 0.852308	valid_1's auc: 0.82457
[17]	training's auc: 0.853006	valid_1's auc: 0.825141
[18]	training's auc: 0.853554	valid_1'