In [1]:
import warnings 
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and pandas assignment warnings.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import os
import random
import numpy as np
import yaml
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np
import torch
import pytz
from datetime import datetime, timezone, timedelta
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
from wandb.lightgbm import wandb_callback, log_summary

In [3]:
# Modified wandb_callback: the definition below replaces the one imported from wandb.lightgbm above.
from typing import TYPE_CHECKING, Callable
import wandb
from wandb.sdk.lib import telemetry as wb_telemetry

# Metric names for which lower is better; their W&B summary keeps the minimum.
MINIMIZE_METRICS = [
    "l1", "l2", "rmse", "mape", "huber",
    "fair", "poisson", "gamma", "binary_logloss",
]

# Metric names for which higher is better; their W&B summary keeps the maximum.
MAXIMIZE_METRICS = [
    "map",
    "auc",
    "average_precision",
]

def set_seeds(seed: int = 42):
    """Seed the random number generators used in this notebook.

    Args:
        seed: Value used to seed Python's `random`, NumPy and PyTorch
            (CPU and every CUDA device). Defaults to 42.
    """
    # Exported for any child process; the hash seed of the interpreter that is
    # already running is fixed at start-up and is not changed by this line.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not only the current one (safe without a GPU).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning can select different kernels from run to run.
    torch.backends.cudnn.benchmark = False
        
def wandb_callback(log_params=True, define_metric=True) -> Callable:
    """Build a LightGBM callback that streams evaluation results to W&B.

    Arguments:
        log_params: (boolean) if True (default), the params passed to
            `lightgbm.train` are written to `wandb.config` on the first call.
        define_metric: (boolean) if True (default), every evaluated metric is
            registered with `wandb.define_metric` on the first call so that
            `wandb.summary` keeps its best value ("min" or "max") instead of
            the last one. This no longer depends on `log_params`.

    Returns:
        A callable taking LightGBM's callback environment. On every boosting
        round it logs one `<dataset>_<metric>` value per entry of
        `env.evaluation_result_list`, then commits the step together with
        `iteration`.

    Example:
        ```python
        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=10,
                        valid_sets=lgb_eval,
                        callbacks=[wandb_callback()])
        ```
    """

    def _define_metric(data: str, metric_name: str) -> None:
        """Tell W&B whether the best value of a metric is its min or its max."""
        lowered = metric_name.lower()
        if "loss" in lowered or lowered in MINIMIZE_METRICS:
            wandb.define_metric(f"{data}_{metric_name}", summary="min")
        elif lowered in MAXIMIZE_METRICS:
            wandb.define_metric(f"{data}_{metric_name}", summary="max")
        # Metrics of unknown direction keep W&B's default summary behaviour.

    # One-element list so the nested functions can flip the flag in place.
    initialized = [False]

    def _init(env: "CallbackEnv") -> None:
        """One-time setup executed on the first callback invocation."""
        with wb_telemetry.context() as tel:
            tel.feature.lightgbm_wandb_callback = True

        if log_params:
            wandb.config.update(env.params)

        if define_metric:
            for item in env.evaluation_result_list or []:
                _define_metric(item[0], item[1])

        initialized[0] = True

    def _callback(env: "CallbackEnv") -> None:
        if not initialized[0]:
            _init(env)

        # Each entry starts with (dataset_name, metric_name, value, ...).
        # Logging entry by entry keeps every metric of every dataset; grouping
        # by dataset name alone would drop all but the last metric.
        for item in env.evaluation_result_list or []:
            wandb.log({f"{item[0]}_{item[1]}": item[2]}, commit=False)

        # Previous log statements use commit=False. This commits them.
        wandb.log({"iteration": env.iteration}, commit=True)

    return _callback

## Training

In [4]:
sweep_config_path = '/data/ephemeral/level2-dkt-recsys-06/code/boost/lgbmsweepconfigv2.yaml'

# Set the notebook name through the WANDB_NOTEBOOK_NAME environment variable.

os.environ['WANDB_NOTEBOOK_NAME'] = 'LGBM_Train.ipynb'
# Load the sweep definition from the YAML file.
with open(sweep_config_path, 'r') as file:
    sweep_config = yaml.safe_load(file)

# Register the W&B sweep; the returned id is passed to `wandb.agent` later on.
sweep_id = wandb.sweep(sweep=sweep_config, project="lightgbm-sweep")

# Fix the random seeds (default seed of `set_seeds`).
set_seeds()


Create sweep with ID: 4ay6thrs
Sweep URL: https://wandb.ai/boostcamp6-recsys6/lightgbm-sweep/sweeps/4ay6thrs


In [5]:
# Read the engineered feature tables for training and for inference.
X = pd.read_csv('/data/ephemeral/level2-dkt-recsys-06/data/FE_v7.csv')
test = pd.read_csv('/data/ephemeral/level2-dkt-recsys-06/data/FE_test_v7.csv')

# Order both tables by user, then timestamp, then item.
sort_keys = ["userID", "Timestamp", "assessmentItemID"]
X = X.sort_values(by=sort_keys).reset_index(drop=True)
test = test.sort_values(by=sort_keys).reset_index(drop=True)

# Number of distinct users in the training table.
unique_user_count = X['userID'].nunique()
print(f'userIDnum {unique_user_count}')

# Last row of every user in the test table.
last_rows = test.groupby('userID').tail(1)
print(last_rows)

# Check that each of those last rows has answerCode == -1.
are_last_answers_minus_one = last_rows['answerCode'].eq(-1).all()
print(are_last_answers_minus_one)

# Shape of the test table.
print(test.shape)



userIDnum 7442
        userID  assessmentItemID  testId  answerCode            Timestamp  \
1035         3          50133008   50133          -1  2020-10-26 13:13:57   
1706         4          70146008   70146          -1  2020-12-27 02:47:54   
3023        13          70111008   70111          -1  2020-12-27 04:35:09   
4283        17          90064006   90064          -1  2020-10-30 05:48:37   
4670        26          60135007   60135          -1  2020-10-23 11:44:18   
...        ...               ...     ...         ...                  ...   
260052    7395          40122005   40122          -1  2020-09-08 02:05:20   
260067    7404          30111005   30111          -1  2020-10-13 09:49:18   
260082    7416          50193004   50193          -1  2020-10-04 02:44:41   
260097    7417          50193004   50193          -1  2020-09-06 13:09:15   
260113    7439          40130005   40130          -1  2020-10-14 23:10:03   

        KnowledgeTag  Itemseq  SolvingTime  CumulativeTime  

In [6]:
# Keep only the rows to predict (answerCode == -1) in `test` and only the
# labelled rows in `X`. `.copy()` makes each result an independent frame, so
# the column assignments performed on them later are plain writes rather than
# writes into a filtered selection (pandas chained-assignment pitfall).
test = test[test["answerCode"] == -1].copy()
X = X[X["answerCode"] != -1].copy()

In [7]:

# Label-encode the categorical feature columns.
# Every column must appear exactly once: encoding a column a second time
# re-fits on its already-encoded integers and overwrites the stored encoder,
# so `label_encoders` would no longer hold the original class labels.
label_encode_columns = [
    "categorize_ItemAnswerRate",
    "categorize_TagAnswerRate",
    "categorize_TestAnswerRate",
    "categorize_CumulativeUserItemAnswerRate",
    "categorize_CumulativeUserTagAverageAnswerRate",
    "categorize_CumulativeUserTagExponentialAverage",
    "DayOfWeek",
    "TimeOfDay",
    "UserRecentTagAnswer",
    "PreviousItemAnswer",
]

label_encoders = {}
for column in label_encode_columns:
    le = LabelEncoder()
    # Learn the mapping from the training table only.
    X[column] = le.fit_transform(X[column])
    # The test table only gets `transform`; LabelEncoder raises ValueError
    # for a label that was not present in the training table.
    test[column] = le.transform(test[column])
    label_encoders[column] = le


In [8]:
# print(X.shape)

# # Select the columns to one-hot encode
# columns_to_encode = [
#     "UserRecentTagAnswer",
#     "PreviousItemAnswer",
#     # Add any other columns that should be one-hot encoded here
# ]
# for column in columns_to_encode:
#     if column in X.columns:
#         X = pd.get_dummies(X, columns=[column])
#     if column in test.columns:
#         test = pd.get_dummies(test, columns=[column])

#         # Apply the same to the other fields
# print(X.shape)

# Disabled: these columns are now handled by label encoding instead.


In [9]:
# Every column of the training table, in table order.
feat = list(X.columns)

# Columns that are not passed to the model.
exclude_columns = [
    "Timestamp",
    "answerCode",
    "DayOfWeek",
    'WeekOfYear',
    'UserAvgSolvingTime',
    'PastItemCount',
    "user_tag_total_answer",
    "categorize_CumulativeUserTagExponentialAverage",
    'categorize_CumulativeUserTagAverageAnswerRate',
    "categorize_TestAnswerRate",
    "categorize_TagAnswerRate"
]

# Walk the columns in order and keep those that are not excluded.
filtered_feat = []
for column in feat:
    if column in exclude_columns:
        continue
    filtered_feat.append(column)

# Shapes before and after the exclusion, then the full column list.
print(X[feat].shape)
print(X[filtered_feat].shape)
print(feat)

(2525956, 54)
(2525956, 43)
['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'Itemseq', 'SolvingTime', 'CumulativeTime', 'Month', 'DayOfWeek', 'TimeOfDay', 'WeekOfYear', 'UserAvgSolvingTime', 'CumulativeItemCount', 'Item_last7days', 'Item_last30days', 'PastItemCount', 'CumulativeUserItemAnswerRate', 'ItemAnswerRate', 'AverageItemSolvingTime_Correct', 'AverageItemSolvingTime_Incorrect', 'AverageItemSolvingTime', 'Difference_SolvingTime_AvgItemSolvingTime', 'UserTagAvgSolvingTime', 'TagAnswerRate', 'CumulativeUserTagAverageAnswerRate', 'CumulativeUserTagExponentialAverage', 'UserCumulativeTagCount', 'UserRecentTagAnswer', 'PreviousItemAnswer', 'TestAnswerRate', 'categorize_solvingTime', 'categorize_ItemAnswerRate', 'categorize_TagAnswerRate', 'categorize_TestAnswerRate', 'categorize_CumulativeUserItemAnswerRate', 'categorize_CumulativeUserTagAverageAnswerRate', 'categorize_CumulativeUserTagExponentialAverage', 'user_test_correct_answer', 'user_test_tota

In [10]:
# Default hyperparameters handed to `wandb.init(config=...)`.
# The original notes mark each value as the minimum of its range.
default_config = dict(
    num_leaves=10,
    learning_rate=0.0001,
    max_depth=-1,  # -1 means no depth limit
    min_data_in_leaf=20,
    feature_fraction=0.6,
    bagging_fraction=0.6,
    bagging_freq=0,
    lambda_l1=0.0,
    lambda_l2=0.0,
    cat_smooth=10,
)



In [11]:

def train():
    """Run one sweep trial: split the data, fit LightGBM, log and save results.

    Takes no arguments so it can be handed to `wandb.agent`; all
    hyperparameters are read from `wandb.config` after `wandb.init`.
    Reads the module-level `X`, `test`, `filtered_feat` and `default_config`.

    Side effects:
        - starts and finishes a W&B run (metrics are logged per boosting
          round through `wandb_callback`, plus final `auc` / `accuracy`);
        - writes the test predictions as a CSV file under ``output/``;
        - prints the validation scores and the feature-importance table.
    """
    wandb.init(project="lightgbm-sweep", config=default_config)

    # `ratio` is the fraction of each user's rows held out for validation.
    # It is not part of `default_config`, so the sweep must provide it.
    ratio = wandb.config.ratio

    # Hold-out rows: a random `ratio` share of every user's rows.
    sampled_indices = X.groupby('userID').sample(frac=ratio).index

    # Training set: everything that was not sampled.
    X_train = X.drop(sampled_indices)
    y_train = X_train["answerCode"]

    # Validation set: the sampled rows.
    X_valid = X.loc[sampled_indices]
    y_valid = X_valid["answerCode"]

    # Categorical columns are declared on the Dataset; passing
    # `categorical_feature` to `lgb.train` is deprecated in LightGBM 4.x.
    categorical_features = [
        "userID",
        "assessmentItemID",
        "testId",
        "KnowledgeTag",
        "Month",
    ]
    lgb_train = lgb.Dataset(
        X_train[filtered_feat], y_train, categorical_feature=categorical_features
    )
    # `reference` makes the validation set reuse the training set's binning.
    lgb_valid = lgb.Dataset(X_valid[filtered_feat], y_valid, reference=lgb_train)

    # W&B run name: user prefix, current time in Asia/Seoul, and the ratio.
    korea = pytz.timezone("Asia/Seoul")
    current_time = datetime.now(korea).strftime("%m-%d %H:%M")
    wandb.run.name = f"wooksbaby-{current_time},ratio-{ratio}"

    current_params = {
        "objective": "binary",
        "metric": ["auc"],
        "device": "cpu",
        "num_leaves": wandb.config.num_leaves,
        "learning_rate": wandb.config.learning_rate,
        "max_depth": wandb.config.max_depth,
        "min_data_in_leaf": wandb.config.min_data_in_leaf,
        "feature_fraction": wandb.config.feature_fraction,
        "bagging_fraction": wandb.config.bagging_fraction,
        "bagging_freq": wandb.config.bagging_freq,
        "lambda_l1": wandb.config.lambda_l1,
        "lambda_l2": wandb.config.lambda_l2,
        "cat_smooth": wandb.config.cat_smooth,
    }
    model = lgb.train(
        current_params,
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        num_boost_round=500,
        callbacks=[
            wandb_callback(log_params=True, define_metric=True),
            lgb.early_stopping(30),
        ],
    )

    # Score the hold-out rows and predict the test rows with the fitted model.
    preds = model.predict(X_valid[filtered_feat])
    acc = accuracy_score(y_valid, np.where(preds >= 0.5, 1, 0))
    auc = roc_auc_score(y_valid, preds)
    test_preds = model.predict(test[filtered_feat])
    print(f"VALID AUC : {auc} ACC : {acc}\n")
    wandb.log({"auc": auc, "accuracy": acc})
    wandb.finish()

    # Write the predictions; the file name carries the validation scores.
    output_dir = "output/"
    write_path = os.path.join(
        output_dir,
        f"auc:{auc} acc:{acc}" + "sweep" + " lgbm.csv",
    )
    os.makedirs(output_dir, exist_ok=True)
    with open(write_path, "w", encoding="utf8") as w:
        print("writing prediction : {}".format(write_path))
        w.write("id,prediction\n")
        for row_id, p in enumerate(test_preds):
            w.write("{},{}\n".format(row_id, p))

    # Feature importances as reported by the model, largest first.
    feature_importances = model.feature_importance()
    feature_names = model.feature_name()
    importance_df = pd.DataFrame(
        {'Feature': feature_names, 'Importance': feature_importances}
    ).sort_values(by='Importance', ascending=False)

    print(importance_df)

In [12]:

# Start a sweep agent that calls `train` once per run of the sweep `sweep_id`.
# No `count` argument is passed, so this call does not cap the number of runs.
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: pysm9n60 with config:
[34m[1mwandb[0m: 	bagging_fraction: 0.8590183440130919
[34m[1mwandb[0m: 	bagging_freq: 14
[34m[1mwandb[0m: 	feature_fraction: 0.9091790175585632
[34m[1mwandb[0m: 	lambda_l1: 14.388011892553417
[34m[1mwandb[0m: 	lambda_l2: 2.295435591582504
[34m[1mwandb[0m: 	learning_rate: 0.2753057279659792
[34m[1mwandb[0m: 	max_depth: 29
[34m[1mwandb[0m: 	min_data_in_leaf: 110
[34m[1mwandb[0m: 	num_leaves: 18
[34m[1mwandb[0m: 	ratio: 0.11367092928384603
[34m[1mwandb[0m: Currently logged in as: [33mwooksbaby[0m ([33mboostcamp6-recsys6[0m). Use [1m`wandb login --relogin`[0m to force relogin


[LightGBM] [Info] Number of positive: 1465755, number of negative: 773000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22953
[LightGBM] [Info] Number of data points in the train set: 2238755, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654719 -> initscore=0.639847
[LightGBM] [Info] Start training from score 0.639847




[1]	training's auc: 0.813195	valid_1's auc: 0.813006
Training until validation scores don't improve for 30 rounds
[2]	training's auc: 0.826235	valid_1's auc: 0.826034
[3]	training's auc: 0.830708	valid_1's auc: 0.830488
[4]	training's auc: 0.833194	valid_1's auc: 0.833024
[5]	training's auc: 0.835826	valid_1's auc: 0.835683
[6]	training's auc: 0.837904	valid_1's auc: 0.837707
[7]	training's auc: 0.839474	valid_1's auc: 0.839212
[8]	training's auc: 0.840966	valid_1's auc: 0.840536
[9]	training's auc: 0.842415	valid_1's auc: 0.841935
[10]	training's auc: 0.843598	valid_1's auc: 0.843104
[11]	training's auc: 0.844473	valid_1's auc: 0.843879
[12]	training's auc: 0.845587	valid_1's auc: 0.844908
[13]	training's auc: 0.846559	valid_1's auc: 0.845702
[14]	training's auc: 0.847436	valid_1's auc: 0.846451
[15]	training's auc: 0.848293	valid_1's auc: 0.84719
[16]	training's auc: 0.84901	valid_1's auc: 0.84781
[17]	training's auc: 0.84967	valid_1's auc: 0.848369
[18]	training's auc: 0.850173	vali

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
auc,▁
iteration,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
training_auc,▁▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████
valid_1_auc,▁▄▆▆▇▇▇▇▇▇▇█████████████████████████████

0,1
accuracy,0.80387
auc,0.85908
iteration,398.0


writing prediction : output/auc:0.8590763725985315 acc:0.8038725491902884sweep lgbm.csv
                                      Feature  Importance
0                                      userID        2319
1                            assessmentItemID        1858
3                                KnowledgeTag         772
2                                      testId         750
35                                problem_acc          60
40                          recent_solve_time          59
42                                   user_acc          47
13                             ItemAnswerRate          40
18                      UserTagAvgSolvingTime          31
5                                 SolvingTime          30
31                              user_test_acc          30
15           AverageItemSolvingTime_Incorrect          30
6                              CumulativeTime          27
17  Difference_SolvingTime_AvgItemSolvingTime          23
21        CumulativeUserTagExponentialAver

[34m[1mwandb[0m: Agent Starting Run: uqkovvi5 with config:
[34m[1mwandb[0m: 	bagging_fraction: 0.8649380801329217
[34m[1mwandb[0m: 	bagging_freq: 29
[34m[1mwandb[0m: 	feature_fraction: 0.9596953000128304
[34m[1mwandb[0m: 	lambda_l1: 14.540770938261469
[34m[1mwandb[0m: 	lambda_l2: 13.571888509427456
[34m[1mwandb[0m: 	learning_rate: 0.5471107244520856
[34m[1mwandb[0m: 	max_depth: 41
[34m[1mwandb[0m: 	min_data_in_leaf: 23
[34m[1mwandb[0m: 	num_leaves: 41
[34m[1mwandb[0m: 	ratio: 0.139808153679805


[LightGBM] [Info] Number of positive: 1421857, number of negative: 750968
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22991
[LightGBM] [Info] Number of data points in the train set: 2172825, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654382 -> initscore=0.638356
[LightGBM] [Info] Start training from score 0.638356




[1]	training's auc: 0.826186	valid_1's auc: 0.826109
Training until validation scores don't improve for 30 rounds
[2]	training's auc: 0.836537	valid_1's auc: 0.836113
[3]	training's auc: 0.841397	valid_1's auc: 0.840667
[4]	training's auc: 0.844553	valid_1's auc: 0.843236
[5]	training's auc: 0.847245	valid_1's auc: 0.845331
[6]	training's auc: 0.849578	valid_1's auc: 0.847085
[7]	training's auc: 0.851857	valid_1's auc: 0.848685
[8]	training's auc: 0.853664	valid_1's auc: 0.849529
[9]	training's auc: 0.855342	valid_1's auc: 0.850459
[10]	training's auc: 0.85687	valid_1's auc: 0.851282
[11]	training's auc: 0.858035	valid_1's auc: 0.851812
[12]	training's auc: 0.859102	valid_1's auc: 0.852215
[13]	training's auc: 0.859991	valid_1's auc: 0.85249
[14]	training's auc: 0.861314	valid_1's auc: 0.853105
[15]	training's auc: 0.862033	valid_1's auc: 0.853257
[16]	training's auc: 0.863023	valid_1's auc: 0.853664
[17]	training's auc: 0.86363	valid_1's auc: 0.853888
[18]	training's auc: 0.864382	val

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
auc,▁
iteration,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
training_auc,▁▃▃▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇████████
valid_1_auc,▁▄▅▆▇▇▇▇████████████████████████████████

0,1
accuracy,0.80257
auc,0.85623
iteration,84.0


writing prediction : output/auc:0.8562330592449632 acc:0.8025718501066177sweep lgbm.csv
                                      Feature  Importance
0                                      userID         849
1                            assessmentItemID         702
2                                      testId         201
3                                KnowledgeTag         148
35                                problem_acc          38
40                          recent_solve_time          36
13                             ItemAnswerRate          21
15           AverageItemSolvingTime_Incorrect          19
42                                   user_acc          19
18                      UserTagAvgSolvingTime          16
5                                 SolvingTime          15
25                             TestAnswerRate          14
6                              CumulativeTime          13
31                              user_test_acc          13
37                                    elap

[34m[1mwandb[0m: Agent Starting Run: hf70ei5b with config:
[34m[1mwandb[0m: 	bagging_fraction: 0.9006621858279473
[34m[1mwandb[0m: 	bagging_freq: 28
[34m[1mwandb[0m: 	feature_fraction: 0.9669314197414324
[34m[1mwandb[0m: 	lambda_l1: 2.170503584635073
[34m[1mwandb[0m: 	lambda_l2: 12.42916777156044
[34m[1mwandb[0m: 	learning_rate: 0.35664769709991456
[34m[1mwandb[0m: 	max_depth: 30
[34m[1mwandb[0m: 	min_data_in_leaf: 120
[34m[1mwandb[0m: 	num_leaves: 41
[34m[1mwandb[0m: 	ratio: 0.06320421546377779


[LightGBM] [Info] Number of positive: 1549180, number of negative: 817165
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22951
[LightGBM] [Info] Number of data points in the train set: 2366345, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.654672 -> initscore=0.639640
[LightGBM] [Info] Start training from score 0.639640




[1]	training's auc: 0.826356	valid_1's auc: 0.826065
Training until validation scores don't improve for 30 rounds
[2]	training's auc: 0.835137	valid_1's auc: 0.834804
[3]	training's auc: 0.839561	valid_1's auc: 0.839304
[4]	training's auc: 0.842181	valid_1's auc: 0.841464
[5]	training's auc: 0.844401	valid_1's auc: 0.843389
[6]	training's auc: 0.846676	valid_1's auc: 0.845254
[7]	training's auc: 0.848516	valid_1's auc: 0.846638
[8]	training's auc: 0.850096	valid_1's auc: 0.847785
[9]	training's auc: 0.851727	valid_1's auc: 0.848964
[10]	training's auc: 0.853133	valid_1's auc: 0.849813
[11]	training's auc: 0.854559	valid_1's auc: 0.850727
[12]	training's auc: 0.855704	valid_1's auc: 0.851415
[13]	training's auc: 0.856899	valid_1's auc: 0.85197
[14]	training's auc: 0.857943	valid_1's auc: 0.852371
[15]	training's auc: 0.858868	valid_1's auc: 0.852799
[16]	training's auc: 0.859763	valid_1's auc: 0.853246
[17]	training's auc: 0.860754	valid_1's auc: 0.853819
[18]	training's auc: 0.861404	v

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
auc,▁
iteration,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
training_auc,▁▂▃▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████
valid_1_auc,▁▄▅▆▇▇▇▇▇▇▇█████████████████████████████

0,1
accuracy,0.80426
auc,0.85822
iteration,113.0


writing prediction : output/auc:0.8582179561242793 acc:0.8042616110418455sweep lgbm.csv
                                      Feature  Importance
0                                      userID        1314
1                            assessmentItemID        1153
2                                      testId         256
3                                KnowledgeTag         213
35                                problem_acc          65
40                          recent_solve_time          46
5                                 SolvingTime          29
42                                   user_acc          26
13                             ItemAnswerRate          25
15           AverageItemSolvingTime_Incorrect          25
25                             TestAnswerRate          24
37                                    elapsed          22
18                      UserTagAvgSolvingTime          22
31                              user_test_acc          17
6                              CumulativeT

[34m[1mwandb[0m: Agent Starting Run: 8izdcczn with config:
[34m[1mwandb[0m: 	bagging_fraction: 0.7573033971584426
[34m[1mwandb[0m: 	bagging_freq: 7
[34m[1mwandb[0m: 	feature_fraction: 0.7154429036676966
[34m[1mwandb[0m: 	lambda_l1: 11.6360436223156
[34m[1mwandb[0m: 	lambda_l2: 14.459760395540512
[34m[1mwandb[0m: 	learning_rate: 0.18472126555391316
[34m[1mwandb[0m: 	max_depth: 23
[34m[1mwandb[0m: 	min_data_in_leaf: 84
[34m[1mwandb[0m: 	num_leaves: 22
[34m[1mwandb[0m: 	ratio: 0.3816168125241584


[LightGBM] [Info] Number of positive: 1022151, number of negative: 539827
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 22982
[LightGBM] [Info] Number of data points in the train set: 1561978, number of used features: 43
