In [1]:
import os,sys,gc

from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
from glob import glob
import pickle

from datetime import timedelta

import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn.functional as F
from typing import Iterable
from tqdm import tqdm

import warnings
import zipfile

# Suppress every Python warning for the rest of the session. This keeps cell
# output readable, but it also hides deprecation and runtime warnings raised by
# any library, so remove it temporarily when debugging unexpected results.
warnings.filterwarnings('ignore')

#Logging
import logging

# Notebook-wide logger: DEBUG and above go both to 'logs.log' and to the cell
# output, using the same "[time][LEVEL] message" format.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same object every time this cell is executed,
# so handlers from a previous run are still attached. Detach and close them
# first; otherwise each re-run would add another pair of handlers, duplicate
# every log line, and leave the old file handle open.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

formatter = logging.Formatter('[%(asctime)s][%(levelname)s] %(message)s')

# Persistent copy of the log on disk (relative to the working directory).
file_handler = logging.FileHandler('logs.log')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Live copy of the log in the notebook output (StreamHandler defaults to stderr).
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

# Data Load

In [4]:
#PATH
# Root folders of the datasets. The cells below read the train/validation
# behaviors from sub-folders of TRAIN_DIR and the test behaviors from a
# sub-folder of TEST_DIR.
TRAIN_DIR = Path('/home/data/train_large')
TEST_DIR = Path('/home/data/ebnerd_testset')

In [5]:
# Behaviors (one row per impression) for the train and validation splits.
trn_behaviors = pl.read_parquet(TRAIN_DIR/'train'/'behaviors.parquet')
#trn_history = pl.read_parquet(TRAIN_DIR/'train'/'history.parquet')

val_behaviors = pl.read_parquet(TRAIN_DIR/'validation'/'behaviors.parquet')
#val_history = pl.read_parquet(TRAIN_DIR/'validation'/'history.parquet')

# Article table shipped with the test-set bundle. The path is built from
# TEST_DIR (same location as before) so it cannot drift from the constant.
articles = pl.read_parquet(TEST_DIR/'articles.parquet')

test_behaviors = pl.read_parquet(TEST_DIR/'test'/'behaviors.parquet')

# The history tables are intentionally left unloaded in this notebook; the
# commented-out reads show how they would be loaded if needed.
#test_history = pl.read_parquet(TEST_DIR/'test'/'history.parquet')

In [6]:
# Keep only rows whose impression_id is non-zero and rebind the name to the
# filtered frame (re-running this cell is harmless: the filter is idempotent).
# NOTE(review): presumably impression_id == 0 marks placeholder rows that are
# not real impressions - confirm against the dataset documentation.
test_behaviors = test_behaviors.filter(pl.col('impression_id') != 0)

In [7]:
# Show the filtered test frame as the cell's rich output (schema plus a
# truncated selection of rows) as a quick sanity check.
test_behaviors

impression_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,is_beyond_accuracy
u32,datetime[μs],f32,f32,i8,list[i32],u32,bool,i8,i8,i8,bool,u32,bool
6451339,2023-06-05 15:02:49,8.0,,2,"[9796527, 7851321, … 9492777]",35982,false,,,,false,388,false
6451363,2023-06-05 15:03:56,20.0,,2,"[9798532, 9791602, … 9798958]",36012,false,,,,false,804,false
6451382,2023-06-05 15:25:53,9.0,,2,"[9798498, 9793856, … 9798724]",36162,false,,,,false,1528,false
6451383,2023-06-05 15:26:35,14.0,,2,"[9797419, 9798829, … 9798805]",36162,false,,,,false,1528,false
6451385,2023-06-05 15:26:14,8.0,,2,"[9785014, 9798958, … 9486080]",36162,false,,,,false,1528,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…
575652816,2023-06-04 19:16:46,25.0,100.0,2,"[9793567, 9794195, … 9797733]",36195,false,,,,false,41708428,false
575652848,2023-06-04 19:33:20,268.0,100.0,1,"[9797347, 9794206, … 9795808]",36254,false,,,,false,76037742,false
575652849,2023-06-04 19:48:47,16.0,100.0,1,"[9781913, 9763188, … 9523718]",36254,false,,,,false,76037742,false
575652850,2023-06-04 19:38:13,22.0,100.0,1,"[9792714, 9791165, … 9486080]",36254,false,,,,false,76037742,false


# Inviewベースでよくある組み合わせ

In [10]:
# Behaviors frames to process in the next cell, keyed by the split name that
# is also used in the output file name.
dict_df_behaviors = dict(train=trn_behaviors, valid=val_behaviors)

# To process the test split instead, swap in this mapping:
#dict_df_behaviors = {'test': test_behaviors}

In [11]:
# For every split, count how often two articles appear together in the in-view
# list of the same impression and save the pair counts as a parquet file.
output_dir = Path('/home/data/inview_cooccur')
# Create the target folder before the expensive work so that a missing
# directory cannot make the final write fail after everything was computed.
output_dir.mkdir(parents=True, exist_ok=True)

for key, df_behaviors in dict_df_behaviors.items():
    # The exploded self-join is too large to build in one go, so the users are
    # split into chunks that are processed one at a time.
    users = df_behaviors.select('user_id').to_pandas().user_id.unique()
    n_chunks = 100
    user_list = np.array_split(users, n_chunks)

    df_covisit_count_list = []

    for no, chunk_users in enumerate(user_list):
        logger.info(f'no : {no} / num_user: {len(chunk_users)}')

        # Impressions of the users in the current chunk only.
        # NOTE(review): summing per-chunk counts is exact only if an
        # impression_id never spans users in different chunks - confirm that
        # impression_id is unique per row for these splits.
        _df_behaviors = df_behaviors.filter(pl.col('user_id').is_in(chunk_users))

        # One row per (impression, article shown in that impression).
        df_inview = _df_behaviors.select(['impression_id','article_ids_inview']).explode('article_ids_inview')
        # Self-join on impression_id pairs every in-view article with every
        # article of the same impression: both orderings (a, b) and (b, a) as
        # well as the self-pair (a, a) are produced. unique() collapses pairs
        # that are repeated within one impression.
        df_covisit = df_inview.join(df_inview,how='left',on='impression_id').unique()

        logger.info(f'df_covisit: {df_covisit.shape}')
        # Number of impressions per article pair; columns are renamed by name
        # rather than by position so the result does not depend on column order.
        df_covisit_count = (
            df_covisit
            .group_by(['article_ids_inview','article_ids_inview_right'])
            .agg(pl.count('impression_id').alias('cooccur_count'))
            .rename({
                'article_ids_inview': 'article_id',
                'article_ids_inview_right': 'article_id_right',
            })
        )

        df_covisit_count_list.append(df_covisit_count)

    df_covisit_count_all = pl.concat(df_covisit_count_list)

    # The same pair can occur in several chunks, so aggregate once more to get
    # a single total per pair.
    df_covisit_count_all = df_covisit_count_all.group_by(['article_id','article_id_right']).agg(
        pl.sum('cooccur_count').alias('cooccur_count')
    )

    # Report the combined table (previously this logged only the last chunk).
    logger.info(f'shape of {key} covisit_count: {df_covisit_count_all.shape}')

    # Save as parquet to /home/data/inview_cooccur
    df_covisit_count_all.write_parquet(output_dir / f'{key}_covisit_count.parquet')

[2024-06-16 12:19:50,021][INFO] no : 0 / num_user: 7881
[2024-06-16 12:19:53,263][INFO] df_covisit: (38756633, 3)
[2024-06-16 12:19:54,307][INFO] no : 1 / num_user: 7881
[2024-06-16 12:19:57,706][INFO] df_covisit: (42113327, 3)
[2024-06-16 12:19:58,690][INFO] no : 2 / num_user: 7881
[2024-06-16 12:20:02,624][INFO] df_covisit: (44005860, 3)
[2024-06-16 12:20:03,478][INFO] no : 3 / num_user: 7881
[2024-06-16 12:20:07,118][INFO] df_covisit: (44437540, 3)
[2024-06-16 12:20:08,111][INFO] no : 4 / num_user: 7881
[2024-06-16 12:20:12,123][INFO] df_covisit: (45130074, 3)
[2024-06-16 12:20:13,352][INFO] no : 5 / num_user: 7881
[2024-06-16 12:20:17,172][INFO] df_covisit: (39615634, 3)
[2024-06-16 12:20:18,097][INFO] no : 6 / num_user: 7881
[2024-06-16 12:20:21,762][INFO] df_covisit: (40383206, 3)
[2024-06-16 12:20:22,582][INFO] no : 7 / num_user: 7881
[2024-06-16 12:20:26,523][INFO] df_covisit: (42984631, 3)
[2024-06-16 12:20:27,414][INFO] no : 8 / num_user: 7881
[2024-06-16 12:20:30,628][INFO] 