In [1]:
import os,sys,gc

from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
from glob import glob
import pickle

from datetime import timedelta

import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn.functional as F

import warnings
warnings.filterwarnings('ignore')

# Logging: send DEBUG-and-above records both to 'logs.log' and to the console.
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('[%(asctime)s][%(levelname)s] %(message)s')

# logging.getLogger returns the same logger object every time this cell runs,
# so handlers attached by a previous run are still present. Drop them first;
# otherwise each re-run adds another pair and every message appears twice.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

file_handler = logging.FileHandler('logs.log')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)



# Item popularity from history

In [3]:
# Dataset roots: train_large holds the train and validation splits,
# ebnerd_testset holds the test split.
TRAIN_DIR = Path('/home/data/train_large')
TEST_DIR = Path('/home/data/ebnerd_testset')

# Train split
trn_history = pl.read_parquet(TRAIN_DIR / 'train' / 'history.parquet')
trn_behaviors = pl.read_parquet(TRAIN_DIR / 'train' / 'behaviors.parquet')

# Validation split
val_history = pl.read_parquet(TRAIN_DIR / 'validation' / 'history.parquet')
val_behaviors = pl.read_parquet(TRAIN_DIR / 'validation' / 'behaviors.parquet')

# Test split
test_history = pl.read_parquet(TEST_DIR / 'test' / 'history.parquet')
test_behaviors = pl.read_parquet(TEST_DIR / 'test' / 'behaviors.parquet')

In [4]:
# Work on clones of the train frames so the loaded originals stay untouched.
df_history, df_behaviors = trn_history.clone(), trn_behaviors.clone()

In [5]:
# One row per distinct combination of the user attribute columns.
df_user = df_behaviors.select(
    'user_id',
    'is_sso_user',
    'gender',
    'postcode',
    'age',
    'is_subscriber',
).unique()

In [6]:
# Flatten the per-user history lists into one row per (user, history entry),
# then give the columns their "history" names and replace nulls with 0.
# Columns are renamed by name rather than by position, so the result does not
# depend on the column order of the parquet file.
# NOTE: this cell consumes the freshly cloned df_history; re-run the clone
# cell before re-running this one, since the *_fixed columns are gone afterwards.
df_history = (
    df_history
    .explode(['impression_time_fixed', 'scroll_percentage_fixed', 'article_id_fixed', 'read_time_fixed'])
    .rename({
        'impression_time_fixed': 'impression_time_history',
        'scroll_percentage_fixed': 'scroll_percentage_history',
        'article_id_fixed': 'article_id',
        'read_time_fixed': 'read_time_history',
    })
    .fill_null(0)
)

### Built from `article_ids_inview` in df_behaviors

In [9]:
def get_article_ids_inview_pop(
        df_behaviors,
        time_interval,
        ):
    """Count how often each article appears in ``article_ids_inview``.

    Parameters
    ----------
    df_behaviors : pl.DataFrame
        Frame providing ``impression_id``, ``impression_time`` (a temporal
        column, since ``.dt.truncate`` is applied to it) and
        ``article_ids_inview`` (exploded to one row per article).
    time_interval : str or None
        Duration string handed to ``dt.truncate`` (e.g. ``'5m'``, ``'1h'``).
        When falsy, no time bucketing is done and counts cover the whole frame.

    Returns
    -------
    pl.DataFrame
        With a time interval: columns ``article_id``,
        ``rounded_{time_interval}_datetime`` and
        ``rounded_{time_interval}_inview_count``.
        Without one: columns ``article_id`` and ``inview_count``.
    """
    df = df_behaviors.explode("article_ids_inview").select(
        ['impression_id', 'impression_time', 'article_ids_inview'])

    if time_interval:
        # Bucket every impression into the start of its time window.
        df = df.with_columns(
            pl.col("impression_time").dt.truncate(time_interval).alias("rounded_datetime")
        )
        group_keys = ['article_ids_inview', 'rounded_datetime']
        renames = {
            'article_ids_inview': 'article_id',
            'rounded_datetime': f'rounded_{time_interval}_datetime',
            'count': f'rounded_{time_interval}_inview_count',
        }
    else:
        # Previously this path reached `return df_pop` with df_pop unbound.
        group_keys = ['article_ids_inview']
        renames = {
            'article_ids_inview': 'article_id',
            'count': 'inview_count',
        }

    # `groupby` is the older spelling of `group_by`; prefer the new name and
    # fall back so the function works on either polars generation.
    group_by = getattr(df, "group_by", None) or df.groupby
    df_pop = group_by(group_keys).agg(
        pl.count("impression_id").alias("count")
    )

    # Rename by name instead of assigning `.columns` positionally.
    return df_pop.rename(renames)

In [10]:
# Behaviors frame for each split, keyed by the name used in output file names.
df_dict = dict(
    train=trn_behaviors,
    valid=val_behaviors,
    test=test_behaviors,
)

In [11]:
# Save df_article: one parquet file per (split, time interval).
dir_path = '/home/data/article_pop_inview'
# write_parquet does not create missing directories, so make sure it exists.
Path(dir_path).mkdir(parents=True, exist_ok=True)

time_interval_list = ['1m','2m','3m','5m','10m','15m','20m','30m','1h','2h','3h','6h','12h','24h']

# The loop variable is deliberately NOT called df_history: that name belongs to
# the exploded history frame built earlier and must not be overwritten here.
for df_name, behaviors in df_dict.items():
    for time_interval in time_interval_list:
        df_article = get_article_ids_inview_pop(behaviors, time_interval)
        df_article.write_parquet(f'{dir_path}/{df_name}_article_pop_inview_{time_interval}.parquet')

        # Include the split name so log lines for train/valid/test can be told apart.
        logger.info(f'df_name:{df_name} / time_interval:{time_interval} / shape:{df_article.shape}')

[2024-06-07 08:55:18,148][INFO] time_interval:15s / shape:(10759132, 3)
[2024-06-07 08:55:34,138][INFO] time_interval:30s / shape:(7154899, 3)
[2024-06-07 08:55:54,671][INFO] time_interval:15s / shape:(13196740, 3)
[2024-06-07 08:56:13,575][INFO] time_interval:30s / shape:(8740934, 3)
[2024-06-07 08:56:33,910][INFO] time_interval:15s / shape:(13533909, 3)
[2024-06-07 08:56:52,293][INFO] time_interval:30s / shape:(8948062, 3)


In [12]:
# Show the last rows of df_article; at this point it holds the frame produced
# by the final iteration of the export loop above.
df_article.tail()

article_id,rounded_30s_datetime,rounded_30s_inview_count
i32,datetime[μs],u32
9792856,2023-06-01 07:00:00,200000
9791766,2023-06-01 07:00:00,200000
9793106,2023-06-01 07:00:00,200000
9793541,2023-06-01 07:00:00,200000
9791788,2023-06-01 07:00:00,200000
