In [1]:
import os

import pandas as pd
import polars as pl


import lightgbm as lgb
import matplotlib.pyplot as plt
import random
import seaborn as sns

from tqdm import tqdm

import warnings

warnings.filterwarnings('ignore')

#Logging
import logging

# ロガーの取得
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
file_handler = logging.FileHandler('logs.log')

# # ログのフォーマットを設定
formatter = logging.Formatter('[%(asctime)s][%(levelname)s] %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Notebook上にもログを表示するためのハンドラ
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

pd.set_option('display.max_columns', 1000)

# Data Load

In [None]:
chunks_to_train_on = 20


chunk_files = [os.path.join("./trn_large_chunks/", path) for path in os.listdir("./trn_large_chunks/")] + \
                            [os.path.join("./val_large_chunks/", path) for path in os.listdir("./val_large_chunks/")]

random.seed(749812)
random_files = random.sample(chunk_files, chunks_to_train_on)
print(random_files)

trn_dfs = []
for file in tqdm(random_files):
    trn_dfs.append(pl.read_parquet(file))

trn_df = pl.concat(trn_dfs)

del trn_dfs
import gc; gc.collect()

# Train

In [35]:
target_col = 'is_clicked'
drop_cols = [
    'impression_id', 'impression_time','user_id','session_id','article_id',
]

In [36]:
#drop_colsを落としたものを特徴量として使う。pandasにする
X_train = trn_df.drop(drop_cols + [target_col]).to_pandas()
y_train = trn_df[target_col].to_pandas()

imp_train = trn_df['impression_id'].to_pandas()

logger.info(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')

X_train["category"] = X_train["category"].astype("category")

[2024-06-17 15:18:11,930][INFO] X_train shape: (27954646, 436), y_train shape: (27954646,)


In [37]:
# スコア計算用
from sklearn.metrics import roc_auc_score
def get_mean_auc(y_pred,y_true,impression):
    #rocをimpressionごとに計算して平均する
    pred_df = pd.DataFrame({
        'impression_id': impression,
        'y_true': y_true,
        'y_pred': y_pred
    })

    #save as valid_pred.parquet
    pred_df.to_parquet('valid_pred.parquet')

    group_auc = pred_df.groupby('impression_id').apply(lambda x: roc_auc_score(x['y_true'],x['y_pred'])).mean()

    return group_auc

In [38]:
del trn_df
import gc; gc.collect()

42

## LightGBM

In [39]:
lgb_params ={
    'boosting_type': 'gbdt',
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at':5,
    'learning_rate': 0.01,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'random_state': 758392,
    'n_jobs': 20,
}

In [None]:

callbacks = [
    lgb.log_evaluation(period=100)
]

group_train = imp_train.groupby(imp_train).count().to_numpy()

lgb_train = lgb.Dataset(X_train, label=y_train,group=group_train)

lgb_model = lgb.train(
    lgb_params,
    lgb_train,
    num_boost_round=18000,
    callbacks=callbacks
)


#feature importance
feature_importance = lgb_model.feature_importance(importance_type='gain')
feature_name = lgb_model.feature_name()

feature_importance_df = pd.DataFrame({
    'feature_name': feature_name,
    'importance': feature_importance
})

feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)

#plot
plt.figure(figsize=(16, 16))
sns.barplot(data=feature_importance_df.head(50), x='importance', y='feature_name')
plt.show()

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 102738
[LightGBM] [Info] Number of data points in the train set: 27954646, number of used features: 436


In [None]:
lgb_model.save_model("lgb_model_20p_1.txt")