# Import Libraries

In [2]:
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from pathlib import Path
import polars as pl

import torch
import torch.nn.functional as F

import warnings

# Silence library warnings and let pandas print up to 1000 columns.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 1000)

#Logging
import logging

# Get logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Set log format
formatter = logging.Formatter('[%(asctime)s][%(levelname)s] %(message)s')

# logging.getLogger() returns the same logger object every time this cell is
# executed, so handlers must only be attached once; otherwise each re-run of
# the cell adds another pair of handlers and every message is emitted twice,
# three times, ...
if not logger.handlers:
    # Persist the log to logs.log in the working directory
    file_handler = logging.FileHandler('logs.log')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    # Settings to display log on notebook
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

# Data Load

In [6]:
# Root directories of the data on disk. TRAIN_DIR holds the 'train' and
# 'validation' sub-folders plus articles.parquet; TEST_DIR holds the 'test'
# sub-folder (see the read_parquet calls below).
TRAIN_DIR = Path('/home/data/train_large')
TEST_DIR = Path('/home/data/ebnerd_testset')

In [7]:
# Training split: behaviors and per-user history tables.
trn_behaviors = pl.read_parquet(TRAIN_DIR/'train'/'behaviors.parquet')
trn_history = pl.read_parquet(TRAIN_DIR/'train'/'history.parquet')

# Validation split. The history is read from history_extended.parquet, which
# is the file written by the "Extend Validation/Test History Files" cell of
# this notebook, so that cell must have been run at least once beforehand.
val_behaviors = pl.read_parquet(TRAIN_DIR/'validation'/'behaviors.parquet')
val_history = pl.read_parquet(TRAIN_DIR/'validation'/'history_extended.parquet')

# Article table shared by the train and validation splits.
articles = pl.read_parquet(TRAIN_DIR/'articles.parquet')

# Extend Validation/Test History Files

In [None]:
# List columns of the history table; exploding them gives one row per
# (user, history event) instead of one row per user.
_HISTORY_LIST_COLS = ["impression_time_fixed", "scroll_percentage_fixed", "article_id_fixed", "read_time_fixed"]

_trn_history = trn_history.explode(_HISTORY_LIST_COLS)
_val_history = val_history.explode(_HISTORY_LIST_COLS)

# Prepend the training-period events of every user that also appears in the
# validation history. _trn_history only contains training users, so filtering
# on the validation user ids is already the intersection of both user sets.
val_history_extended = pl.concat([
    _trn_history.filter(pl.col("user_id").is_in(val_history["user_id"])),
    _val_history
])

# De-duplicate BEFORE sorting: unique() does not keep the row order unless
# maintain_order=True is passed, so sorting first could be undone and the
# aggregated lists would no longer be in chronological order. Rows inside each
# group keep the order of the sorted frame when they are collected into lists.
val_history = (
    val_history_extended
    .unique()
    .sort(["user_id", "impression_time_fixed"])
    .groupby("user_id")
    .agg(_HISTORY_LIST_COLS)
)

# Persist the extended history so that it can be loaded directly later on.
val_history.write_parquet(TRAIN_DIR/'validation'/"history_extended.parquet")

In [8]:
# (Re)load the raw tables of all three splits. Note that val_history is the
# plain history.parquet here, not the extended file.
trn_behaviors = pl.read_parquet(TRAIN_DIR/'train'/'behaviors.parquet')
trn_history = pl.read_parquet(TRAIN_DIR/'train'/'history.parquet')

val_behaviors = pl.read_parquet(TRAIN_DIR/'validation'/'behaviors.parquet')
val_history = pl.read_parquet(TRAIN_DIR/'validation'/'history.parquet')

test_behaviors = pl.read_parquet(TEST_DIR/'test'/'behaviors.parquet')
test_history = pl.read_parquet(TEST_DIR/'test'/'history.parquet')

articles = pl.read_parquet(TRAIN_DIR/'articles.parquet')

# List columns of the history table; exploding them gives one row per
# (user, history event) instead of one row per user.
_HISTORY_LIST_COLS = ["impression_time_fixed", "scroll_percentage_fixed", "article_id_fixed", "read_time_fixed"]

_trn_history = trn_history.explode(_HISTORY_LIST_COLS)
_val_history = val_history.explode(_HISTORY_LIST_COLS)
_test_history = test_history.explode(_HISTORY_LIST_COLS)

# Extend the test history with the train/validation events of the test users.
# Each exploded frame only contains users of its own split, so filtering on the
# test user ids is already the intersection of the two user sets.
test_history_extended = pl.concat([
    _trn_history.filter(pl.col("user_id").is_in(test_history["user_id"])),
    _val_history.filter(pl.col("user_id").is_in(test_history["user_id"])),
    _test_history
])

# De-duplicate BEFORE sorting: unique() does not keep the row order unless
# maintain_order=True is passed, so sorting first could be undone and the
# aggregated lists would no longer be in chronological order.
test_history = (
    test_history_extended
    .unique()
    .sort(["user_id", "impression_time_fixed"])
    .groupby("user_id")
    .agg(_HISTORY_LIST_COLS)
)

# Create Dataset

In [9]:
def get_target_df(
        df: pl.DataFrame
) -> pl.DataFrame:
    '''
    Create target_df. The target column is "is_clicked"

    The behaviors frame (one row per impression) is turned into one row per
    (impression, in-view article).

    df: behaviors frame; must contain article_ids_inview, article_ids_clicked,
        next_read_time, next_scroll_percentage and article_id.

    Returns a frame with the added columns count_article_ids_inview (number of
    in-view articles of the impression), is_clicked (1/0) and article_id (the
    in-view article; the original article_id column is dropped).
    '''

    # Native list-length expression instead of a per-row Python lambda.
    # The cast keeps the Int64 dtype the lambda version produced.
    df = df.with_columns(
        pl.col("article_ids_inview").list.lengths().cast(pl.Int64).alias("count_article_ids_inview"),
    )

    # Explode article_ids_inview
    df = df.explode("article_ids_inview")

    # If article_ids_inview is in article_ids_clicked, then 1, otherwise 0
    df = df.with_columns(
        pl.when(pl.col("article_ids_inview").is_in(pl.col("article_ids_clicked")))
        .then(1)
        .otherwise(0)
        .alias("is_clicked")
    )

    # Delete columns that are not used
    df = df.drop(["article_ids_clicked","next_read_time","next_scroll_percentage","article_id"])

    # Change the name of article_ids_inview to article_id
    # (add + drop rather than rename, so the new column stays last)
    df = df.with_columns(pl.col("article_ids_inview").alias("article_id"))
    df = df.drop("article_ids_inview")

    # Calculate the rate of is_clicked directly on the polars column
    is_clicked_rate = df["is_clicked"].mean()
    logger.info(f'is_clicked_rate: {is_clicked_rate}')

    # Show the shape of df
    logger.info(f'df shape: {df.shape}')

    return df

# Feature Engineering

### Inview Co-occurrence Feature

In [10]:
def feat_inview_cooccur(
        df: pl.DataFrame, mode: str
) -> pl.DataFrame:
    '''
    Add co-visit count features for every displayed article.

    Each article of an impression is paired with every article shown in the
    same impression, the pre-computed co-visit count of the pair is looked up,
    and the counts are summarised per article (sum/mean/max/min/std). Each
    summary is additionally divided by its maximum within the impression.

    mode: prefix of the pre-computed parquet file to load.
    Rows with impression_id == 0 get no features (null after the left join).
    '''

    n_rows_before = len(df)
    key_cols = ['impression_id','user_id','article_id']
    stats = ['sum', 'mean', 'max', 'min', 'std']

    # Pre-computed co-visit counts
    covisit = pl.read_parquet(f'/home/data/inview_cooccur/{mode}_covisit_count.parquet')

    # Candidate rows, without impression_id = 0
    pairs = df.select(key_cols).filter(pl.col('impression_id') != 0)

    # Self-join: all article pairs inside one impression (article_id, article_id_right)
    pairs = pairs.join(pairs, on = ['impression_id','user_id'],how='left')
    pairs = pairs.join(covisit, on = ['article_id','article_id_right'],how='left')

    # Summarise the pair counts per displayed article
    feat_df = pairs.groupby(key_cols).agg(
        [getattr(pl, stat)('cooccur_count').alias(f'cooccur_count_{stat}') for stat in stats]
    )

    # Maximum of each summary within the impression
    impression_max = feat_df.groupby('impression_id').agg(
        [pl.max(f'cooccur_count_{stat}').alias(f'cooccur_count_{stat}_max') for stat in stats]
    )
    feat_df = feat_df.join(impression_max, on='impression_id')

    # Normalize each feature by the maximum value for each impression_id
    feat_df = feat_df.with_columns(
        [
            (pl.col(f'cooccur_count_{stat}')/pl.col(f'cooccur_count_{stat}_max')).alias(f'cooccur_count_{stat}_norm')
            for stat in stats
        ]
    )

    feat_df = feat_df.drop([f'cooccur_count_{stat}_max' for stat in stats])
    df = df.join(feat_df, on = key_cols,how='left')

    # The left join must not change the number of rows
    assert n_rows_before == len(df)

    return df

### History Click Counts

In [11]:
def feat_clicked_history_count(
        df: pl.DataFrame, 
        df_history: pl.DataFrame
) -> pl.DataFrame:
    '''
    Add features that describe how the user interacted with the candidate
    article in the past.

    df: target frame; must contain user_id, article_id and impression_time.
    df_history: one row per user with the list columns impression_time_fixed,
        scroll_percentage_fixed, article_id_fixed and read_time_fixed
        (columns are renamed by position).

    Added columns:
     article_read_count: number of history events of this user for this
        article (0 if there are none).
     article_last_read_time_diff: hours between impression_time and the latest
        history event of this user for this article (null if there is none).
    '''
    _len = len(df)

    # Distinct (user, article) pairs only. df holds one row per
    # (impression, article), so the same pair can occur many times; joining the
    # history against the non-unique pairs would multiply every history event
    # by the number of occurrences and inflate article_read_count.
    user_article = df.select(['user_id','article_id']).unique()

    df_history = df_history.explode(['impression_time_fixed','scroll_percentage_fixed','article_id_fixed','read_time_fixed'])
    df_history.columns = ['user_id','impression_time_history','scroll_percentage_history','article_id','read_time_history']

    # Keep only the history events whose (user, article) pair occurs in df
    df_history = df_history.join(user_article, on=['user_id','article_id'], how='inner')

    feat_df = df_history.groupby(['user_id','article_id']).agg(
        # Count the number of times the article has been read
        pl.count('impression_time_history').alias('article_read_count'),
        # Get the latest time the article was read
        pl.max('impression_time_history').alias('article_last_read_time'),
    )

    df = df.join(feat_df, on=['user_id','article_id'], how='left')

    # Convert the difference between last_read_time and impression_time to X hours
    df = df.with_columns(
        ((pl.col('impression_time') - pl.col('article_last_read_time')) / timedelta(hours=1)).alias('article_last_read_time_diff')
    ).drop('article_last_read_time')

    # Pairs without any history event have never been read: count is 0
    df = df.with_columns([
        pl.col("article_read_count").fill_null(0),
    ])

    assert _len == len(df)
    
    return df

In [12]:
def feat_clicked_history_count_by_article(
        df: pl.DataFrame, 
        df_history: pl.DataFrame
) -> pl.DataFrame:
    '''
    Add the number of history events of an article over ALL users.

    df: target frame; must contain article_id.
    df_history: one row per user with the list columns impression_time_fixed,
        scroll_percentage_fixed, article_id_fixed and read_time_fixed
        (columns are renamed by position).

    Added column:
     article_read_count_v2: number of history events that refer to the
        article, regardless of the user (null if there are none).
    '''
    _len = len(df)

    # Distinct candidate articles only. df contains every article once per
    # impression it was shown in; joining the history against the non-unique
    # ids would multiply each history event by the number of df rows of that
    # article, which both inflates the count and blows up the join size.
    candidate_articles = df.select(['article_id']).unique()

    df_history = df_history.explode(['impression_time_fixed','scroll_percentage_fixed','article_id_fixed','read_time_fixed'])
    df_history.columns = ['user_id','impression_time_history','scroll_percentage_history','article_id','read_time_history']

    # Keep only the history events of articles that occur in df
    df_history = df_history.join(candidate_articles, on=['article_id'], how='inner')

    feat_df = df_history.groupby(['article_id']).agg(
        # Count the number of times the article has been read
        pl.count('impression_time_history').alias('article_read_count_v2'),
    )

    df = df.join(feat_df, on=['article_id'], how='left')
    
    assert _len == len(df)
    
    return df

In [13]:
def load_vector_df(path_str: str) -> pl.DataFrame:
    '''
    Load the vector. The vector is provided by the original data.

    The parquet file is expected to contain an article_id column and, as its
    LAST column, one embedding (sequence of numbers) per row. The embedding is
    spread over the columns vector_0 ... vector_{d-1}.

    Returns a polars frame with the vector_* columns followed by article_id.
    '''
    _vec = pd.read_parquet(path_str)

    logger.info(f'_vec columns : {_vec.columns}')

    # The embedding is stored in the last column
    col_name = _vec.columns[-1]

    # Build the wide frame in one call instead of creating a pd.Series per row
    # with DataFrame.apply(axis=1), which is very slow for large tables.
    # Keeping the original index makes the article_id assignment below align.
    df_vec = pd.DataFrame(_vec[col_name].tolist(), index=_vec.index)
    df_vec.columns = [f'vector_{i}' for i in range(df_vec.shape[1])]

    df_vec['article_id'] = _vec['article_id']

    df_vec = pl.from_pandas(df_vec)

    logger.info(f'{path_str} shape: {df_vec.shape}')

    return df_vec

def load_my_vector_df(path_str: str) -> pl.DataFrame:
    '''
    Read the parquet file at path_str with polars and return it unchanged.
    '''
    return pl.read_parquet(path_str)
    

# Vector tables that are the same for every split, keyed by vector name.
common_vec_dict = {}
common_vec_dict['multilingual-e5-large-instruct'] = load_my_vector_df(
    '/home/data/multilingual-e5-large-instruct/multilingual-e5-large-instruct_vec_df.parquet'
)

# Vector tables that exist once per split: vector name -> split -> frame.
my_vec_dict = {'item2vec': {}}
my_vec_dict['item2vec']['train'] = load_my_vector_df('/home/data/item2vec_1/train_item2vec.parquet')
my_vec_dict['item2vec']['valid'] = load_my_vector_df('/home/data/item2vec_1/valid_item2vec.parquet')
my_vec_dict['item2vec']['test'] = load_my_vector_df('/home/data/item2vec_1/test_item2vec.parquet')


In [None]:
def feat_clicked_cossim(
        df: pl.DataFrame,
        df_history: pl.DataFrame,
        df_vec: pl.DataFrame,
        cossim_name: str,
        type: str='mean',
        window: int=1,
) -> pl.DataFrame:
    '''
    Add the cosine similarity between a user embedding, pooled from the
    articles in the user's history, and the embedding of the candidate article.

    df: target frame; must contain article_id, user_id, impression_time and
        impression_id.
    df_history: one row per user with the list columns impression_time_fixed,
        scroll_percentage_fixed, article_id_fixed and read_time_fixed
        (columns are renamed by position).
    df_vec: article_id plus the embedding columns (assumed to be named
        vector_*; they are selected by name below).
    cossim_name: name of the added similarity column.
    type: how the history article vectors are pooled per user
     mean: element-wise mean
     max: element-wise maximum
     min: element-wise minimum
     scroll_mean: mean of the vectors scaled by scroll_percentage / 100
     read_time_mean: mean of the vectors scaled by the event's share of the
        user's total read time
    window: number of most recent history events used per user; a value <= 0
        uses the whole history.

    Returns df with the column cossim_name added.
    Raises ValueError if type is not one of the values listed above.
    '''
    # Fail early and explicitly: previously an unknown type only surfaced
    # after all the joins, as an unbound user_emb variable.
    supported_types = ('mean', 'max', 'min', 'scroll_mean', 'read_time_mean')
    if type not in supported_types:
        raise ValueError(f'type must be one of {supported_types}, got {type!r}')

    _len = len(df)

    df_history = df_history.explode(['impression_time_fixed','scroll_percentage_fixed','article_id_fixed','read_time_fixed'])
    df_history.columns = ['user_id','impression_time_history','scroll_percentage_history','article_id','read_time_history']
    df_history = df_history.fill_null(0)

    user_article = df.select(['article_id','user_id','impression_time','impression_id'])

    df_history = df_history.join(df_vec, on='article_id', how='left')
    vec_cols = [col for col in df_history.columns if 'vector' in col]

    # User embeddings
    # How many past actions to consider with the window (if window is 0, use the entire history)
    if window > 0:
        df_history = df_history.sort('impression_time_history', descending=True).groupby('user_id').head(window)

    # Weighted variants: scale all vector columns in ONE with_columns call
    # instead of one call per embedding dimension.
    if type == 'scroll_mean':
        # Scroll_percentage based mean of the latest article emb
        df_history = df_history.with_columns(
            [(pl.col(col) * pl.col('scroll_percentage_history') / 100).alias(col) for col in vec_cols]
        )
    elif type == 'read_time_mean':
        # Read time based mean of the latest article emb
        user_read_time = df_history.groupby('user_id').agg(
            pl.sum('read_time_history').alias('user_read_time')
        )
        df_history = df_history.join(user_read_time, on='user_id', how='left')
        df_history = df_history.with_columns(
            (pl.col('read_time_history') / pl.col('user_read_time')).alias('read_time_percentage')
        )
        df_history = df_history.with_columns(
            [(pl.col(col) * pl.col('read_time_percentage')).alias(col) for col in vec_cols]
        )

    # Pool the (possibly scaled) vectors per user
    if type == 'max':
        pooled = [pl.max(col).alias(col) for col in vec_cols]
    elif type == 'min':
        pooled = [pl.min(col).alias(col) for col in vec_cols]
    else:
        # mean, scroll_mean and read_time_mean all average the vectors
        pooled = [pl.mean(col).alias(col) for col in vec_cols]
    user_emb = df_history.groupby('user_id').agg(pooled)

    user_emb.columns = [f'user_{col}' if col != 'user_id' else col for col in user_emb.columns]

    # Article embeddings
    article_emb = df_vec.clone()
    article_emb.columns = [f'article_{col}' if col != 'article_id' else col for col in article_emb.columns]

    # Join user and article embeddings
    user_article = user_article.join(
    user_emb, on='user_id', how='left').join(
        article_emb, on='article_id', how='left')

    # Calculate cosine similarities (row-wise between the two matrices)
    user_vec = user_article.select([col for col in user_article.columns if 'user_vec' in col]).to_numpy()
    article_vec = user_article.select([col for col in user_article.columns if 'article_vec' in col]).to_numpy()

    user_vec = torch.tensor(user_vec)
    article_vec = torch.tensor(article_vec)

    similarity = F.cosine_similarity(
        user_vec, article_vec, dim=1)
    
    # Add as a feature
    user_article = user_article.with_columns(
        pl.Series(similarity.numpy()).alias(cossim_name)
    )

    feat_df = user_article.select(['user_id','article_id','impression_id',cossim_name])

    df = df.join(feat_df, on=['user_id','article_id','impression_id'], how='left')

    assert _len == len(df)

    return df

In [18]:
def feat_clicked_category_cossim(
        df: pl.DataFrame,
        df_history: pl.DataFrame,
        df_vec: pl.DataFrame,
        cossim_name: str,
        type: str='mean',
        window: int=1,
) -> pl.DataFrame:
    '''
    Add the cosine similarity between a per-(user, category) embedding, pooled
    from the history articles of that category, and the embedding of the
    candidate article.

    NOTE: the category of the history articles is looked up in the
    module-level `articles` frame, which is not a parameter of this function.

    df: target frame; must contain article_id, user_id, category,
        impression_time and impression_id.
    df_history: one row per user with the list columns impression_time_fixed,
        scroll_percentage_fixed, article_id_fixed and read_time_fixed
        (columns are renamed by position).
    df_vec: article_id plus the embedding columns (assumed to be named
        vector_*; they are selected by name below).
    cossim_name: name of the added similarity column.
    type: how the history article vectors are pooled per (user, category)
     mean: element-wise mean
     max: element-wise maximum
     min: element-wise minimum
     scroll_mean: mean of the vectors scaled by scroll_percentage / 100
     read_time_mean: mean of the vectors scaled by the event's share of the
        read time of the (user, category) group
    window: number of most recent history events used per (user, category);
        a value <= 0 uses the whole history.

    Returns df with the column cossim_name added.
    Raises ValueError if type is not one of the values listed above.
    '''
    # Fail early and explicitly: previously an unknown type only surfaced
    # after all the joins, as an unbound user_emb variable.
    supported_types = ('mean', 'max', 'min', 'scroll_mean', 'read_time_mean')
    if type not in supported_types:
        raise ValueError(f'type must be one of {supported_types}, got {type!r}')

    _len = len(df)
    group_cols = ['user_id', 'category']

    df_history = df_history.explode(['impression_time_fixed','scroll_percentage_fixed','article_id_fixed','read_time_fixed'])
    df_history.columns = ['user_id','impression_time_history','scroll_percentage_history','article_id','read_time_history']
    df_history = df_history.fill_null(0)
    # Attach the category of every history article (module-level `articles`)
    df_history = df_history.join(articles.select(['article_id', 'category']), how='left', on='article_id')

    user_article = df.select(['article_id','user_id','category','impression_time','impression_id'])

    df_history = df_history.join(df_vec, on='article_id', how='left')
    vec_cols = [col for col in df_history.columns if 'vector' in col]

    # User embeddings
    # How many past actions to consider with the window (if window is 0, use the entire history)
    if window > 0:
        df_history = df_history.sort('impression_time_history',descending=True
                                     ).groupby(group_cols).head(window)

    # Weighted variants: scale all vector columns in ONE with_columns call
    # instead of one call per embedding dimension.
    if type == 'scroll_mean':
        df_history = df_history.with_columns(
            [(pl.col(col) * pl.col('scroll_percentage_history') / 100).alias(col) for col in vec_cols]
        )
    elif type == 'read_time_mean':
        user_read_time = df_history.groupby(group_cols).agg(
            pl.sum('read_time_history').alias('user_read_time')
        )
        df_history = df_history.join(user_read_time, on=group_cols, how='left')
        df_history = df_history.with_columns(
            (pl.col('read_time_history') / pl.col('user_read_time')).alias('read_time_percentage')
        )
        df_history = df_history.with_columns(
            [(pl.col(col) * pl.col('read_time_percentage')).alias(col) for col in vec_cols]
        )

    # Pool the (possibly scaled) vectors per (user, category)
    if type == 'max':
        pooled = [pl.max(col).alias(col) for col in vec_cols]
    elif type == 'min':
        pooled = [pl.min(col).alias(col) for col in vec_cols]
    else:
        # mean, scroll_mean and read_time_mean all average the vectors
        pooled = [pl.mean(col).alias(col) for col in vec_cols]
    user_emb = df_history.groupby(group_cols).agg(pooled)

    user_emb.columns = [f'user_{col}' if col not in ['user_id', 'category'] else col for col in user_emb.columns]

    # Article embeddings
    article_emb = df_vec.clone()
    article_emb.columns = [f'article_{col}' if col != 'article_id' else col for col in article_emb.columns]

    # Join user and article embeddings
    user_article = user_article.join(
    user_emb, on=group_cols, how='left').join(
        article_emb, on='article_id', how='left')

    # Calculate cosine similarities (row-wise between the two matrices)
    user_vec = user_article.select([col for col in user_article.columns if 'user_vec' in col]).to_numpy()
    article_vec = user_article.select([col for col in user_article.columns if 'article_vec' in col]).to_numpy()

    user_vec = torch.tensor(user_vec)
    article_vec = torch.tensor(article_vec)

    similarity = F.cosine_similarity(
        user_vec, article_vec, dim=1)
    
    # Add as a feature
    user_article = user_article.with_columns(
        pl.Series(similarity.numpy()).alias(cossim_name)
    )

    feat_df = user_article.select(['user_id','category','article_id','impression_id',cossim_name])

    df = df.join(feat_df, on=['user_id','category','article_id','impression_id'], how='left')

    assert _len == len(df)

    return df

In [19]:
def add_cossim_individual_features(
        df: pl.DataFrame, 
        df_history: pl.DataFrame, 
        df_vec: pl.DataFrame, 
        cossim_name: str
) -> pl.DataFrame:
    """
    For every displayed article, summarise its cosine similarity to each
    article in the user's history. df is the target frame.

    df: target frame; must contain article_id, user_id, impression_time and
        impression_id.
    df_history: one row per user with the list columns impression_time_fixed,
        scroll_percentage_fixed, article_id_fixed and read_time_fixed
        (columns are renamed by position).
    df_vec: article_id plus the embedding columns (assumed to be named
        vector_*; they are selected by name below).
    cossim_name: prefix of the added feature columns.

    The similarities of one displayed article are summarised as max, min,
    mean, median and the 10/25/75/90% quantiles. Only the min-max scaled
    version of each summary (scaled within the impression) is kept, as
    {cossim_name}_<stat>_cossim_norm.
    """

    _len = len(df)

    df_history = df_history.explode(['impression_time_fixed','scroll_percentage_fixed','article_id_fixed','read_time_fixed'])
    df_history.columns = ['user_id','impression_time_history','scroll_percentage_history','article_id','read_time_history']
    df_history = df_history.fill_null(0).rename({"article_id": "past_article_id"})

    # One row per (displayed article, history article of the same user)
    user_article = df.select(['article_id','user_id','impression_time','impression_id'])
    user_article = user_article.join(df_history.select(["user_id", "past_article_id"]), on="user_id")

    # Distinct (displayed article, history article) combinations, so that each
    # similarity is computed only once
    user_article_unique = user_article.unique(["article_id", "past_article_id"]).drop(["user_id", "impression_time", "impression_id"])

    # Article embeddings
    article_emb = df_vec.clone()
    article_emb.columns = [f'article_{col}' if col != 'article_id' else col for col in article_emb.columns]

    # Attach the embedding of the displayed article
    user_article_unique = user_article_unique.join(
        article_emb, on='article_id', how='left')

    article_emb.columns = ["past_" + col if col != "article_id" else col for col in article_emb.columns]

    # Attach the embedding of the history article
    user_article_unique = user_article_unique.join(
        article_emb, left_on='past_article_id', right_on="article_id", how='left')

    article_vec = user_article_unique.select(
        [col for col in user_article_unique.columns if 'article_vec' in col and "past_article_vec" not in col]
    ).to_numpy()

    past_article_vec = user_article_unique.select(
        [col for col in user_article_unique.columns if "past_article_vec" in col]
    ).to_numpy()

    article_vec = torch.tensor(article_vec)
    past_article_vec = torch.tensor(past_article_vec)

    # Row-wise similarity between the rows of article_vec and past_article_vec
    similarity = F.cosine_similarity(
        article_vec, past_article_vec, dim=1)

    article_similarities = pl.DataFrame(
        {
            "article_id": user_article_unique["article_id"],
            "past_article_id": user_article_unique["past_article_id"],
            "cossim": similarity.numpy()
        }
    )

    user_article = user_article.join(article_similarities, how="left", on=["article_id", "past_article_id"])

    # user_id is part of the key: impression_id alone does not identify a user
    # for the shared impression_id 0, and without it the histories of
    # different users would be pooled into one group.
    feature_keys = ["impression_id", "user_id", "article_id"]
    features = user_article.groupby(feature_keys).agg(
        pl.col("cossim").max().alias(f"{cossim_name}_max_cossim"),
        pl.col("cossim").min().alias(f"{cossim_name}_min_cossim"),
        pl.col("cossim").mean().alias(f"{cossim_name}_mean_cossim"),
        pl.col("cossim").median().alias(f"{cossim_name}_median_cossim"),
        pl.col("cossim").quantile(0.1).alias(f"{cossim_name}_quantile_q10_cossim"),
        pl.col("cossim").quantile(0.25).alias(f"{cossim_name}_quantile_q25_cossim"),
        pl.col("cossim").quantile(0.75).alias(f"{cossim_name}_quantile_q75_cossim"),
        pl.col("cossim").quantile(0.9).alias(f"{cossim_name}_quantile_q90_cossim"),
    )
    
    df = df.join(features, how="left", on=feature_keys)
    
    # Scaling within the impression reportedly helps a little.
    stat_cols = [
        f"{cossim_name}_max_cossim",
        f"{cossim_name}_min_cossim",
        f"{cossim_name}_mean_cossim",
        f"{cossim_name}_median_cossim",
        f"{cossim_name}_quantile_q10_cossim",
        f"{cossim_name}_quantile_q25_cossim",
        f"{cossim_name}_quantile_q75_cossim",
        f"{cossim_name}_quantile_q90_cossim",
    ]

    # Min-max scale every summary within its impression. Window expressions
    # replace the former groupby + join per column (eight joins on the full
    # frame); the raw summaries are dropped afterwards, as before.
    df = df.with_columns(
        [
            (
                (pl.col(col) - pl.col(col).min().over('impression_id'))
                / (pl.col(col).max().over('impression_id') - pl.col(col).min().over('impression_id'))
            ).alias(f'{col}_norm')
            for col in stat_cols
        ]
    ).drop(stat_cols)

    # Same row-count guard as the other feature functions in this file
    assert _len == len(df)

    return df

In [20]:
def feat_inview_cossim(
        df:pl.DataFrame,
        df_vec:pl.DataFrame,
        cossim_name:str,
):
    '''
    Build a group embedding from the in-view articles and add the cosine
    similarity between that embedding and each article's own embedding.

    The embedding is the mean of the article vectors of all rows of a group;
    the groups used are user_id, session_id and impression_id, producing one
    column {cossim_name}_{group} per group.
    '''

    n_rows_before = len(df)
    join_keys = ['impression_id','user_id','article_id']

    # Article representation: prefix every vector column with "article_"
    article_emb = df_vec.rename(
        {col: f'article_{col}' for col in df_vec.columns if col != 'article_id'}
    )

    # Rows of df with the raw article vectors attached (source of the group means)
    rows_with_vec = df.select(['impression_id','user_id','article_id','session_id'])
    rows_with_vec = rows_with_vec.join(df_vec, on='article_id', how='left')
    vec_cols = [col for col in rows_with_vec.columns if 'vector' in col]

    for group_col in ['user_id','session_id','impression_id']:
        feature_name = f'{cossim_name}_{group_col}'

        # Group representation: mean vector of the group, prefixed with "user_"
        group_emb = rows_with_vec.groupby(group_col).agg(
            [pl.mean(col).alias(col) for col in vec_cols]
        )
        group_emb = group_emb.rename(
            {col: f'user_{col}' for col in group_emb.columns if col != group_col}
        )

        # Attach group and article representation to every row
        paired = rows_with_vec.join(group_emb, on=group_col, how='left')
        paired = paired.join(article_emb, on='article_id', how='left')

        # Row-wise cosine similarity
        user_cols = [col for col in paired.columns if 'user_vec' in col]
        article_cols = [col for col in paired.columns if 'article_vec' in col]
        user_vec = torch.tensor(paired.select(user_cols).to_numpy())
        article_vec = torch.tensor(paired.select(article_cols).to_numpy())
        similarity = F.cosine_similarity(user_vec, article_vec, dim=1)

        # Store the similarity next to the keys
        paired = paired.with_columns(
            pl.Series(similarity.numpy()).alias(feature_name)
        )

        # Add it to df as a feature
        feat_df = paired.select(['impression_id','user_id','article_id',feature_name])
        df = df.join(feat_df, on=join_keys, how='left')

        # The left join must not change the number of rows
        assert n_rows_before == len(df)

    return df

### Articleのシンプルな特徴量

In [21]:
def feat_article_simple(df,articles):
    '''
    Join simple per-article attributes to df and derive a few features.

    Added: premium, total_inviews, total_pageviews, total_read_time,
    number_of_images, one sentiment score column per sentiment label,
    published_time_diff (days between impression and publication), the
    per-impression normalised versions of the totals and of
    published_time_diff, is_premium_subscriber and category.
    '''
    n_rows_before = len(df)

    selected_cols = [
        'article_id','premium','published_time','total_inviews','total_pageviews','total_read_time',
        'sentiment_score','sentiment_label', 'image_ids',
    ]
    articles_numfeat = articles.select(selected_cols)

    # Number of images instead of the list of image ids
    articles_numfeat = articles_numfeat.with_columns(
        pl.col("image_ids").list.lengths().alias("number_of_images")
    )
    articles_numfeat = articles_numfeat.drop("image_ids")

    # One score column per sentiment label (Positive/Negative/Neutral); the
    # score is kept in the column of the article's label and is 0 elsewhere.
    # (Suggested by a teammate; reported to improve the score slightly.)
    label_to_column = [
        ('Positive', 'article_label_positive_score'),
        ('Negative', 'article_label_negative_score'),
        ('Neutral', 'article_label_neutral_score'),
    ]
    articles_numfeat = articles_numfeat.with_columns(
        [
            pl.when(pl.col('sentiment_label') == label).then(pl.col('sentiment_score')).otherwise(0).alias(column)
            for label, column in label_to_column
        ]
    ).drop('sentiment_score','sentiment_label')
    df = df.join(articles_numfeat, on='article_id', how='left')

    # Difference between impression_time and published_time, in days
    age_in_days = (pl.col('impression_time') - pl.col('published_time')) / timedelta(days=1)
    df = df.with_columns(age_in_days.alias('published_time_diff')).drop('published_time')

    # total_inviews, total_pageviews, total_read_time: divide by the maximum
    # within the impression
    total_cols = ['total_inviews', 'total_pageviews', 'total_read_time']
    impression_max = df.groupby('impression_id').agg(
        [pl.max(col).alias(f'max_{col}') for col in total_cols]
    )
    df = df.join(impression_max, on='impression_id', how='left')
    df = df.with_columns(
        [(pl.col(col) / pl.col(f'max_{col}')).alias(f'{col}_norm') for col in total_cols]
    ).drop([f'max_{col}' for col in total_cols])

    # published_time_diff: divide by the minimum within the impression
    impression_min = df.groupby('impression_id').agg(
        pl.min('published_time_diff').alias('min_published_time_diff')
    )
    df = df.join(impression_min, on='impression_id', how='left')
    df = df.with_columns(
        (pl.col('published_time_diff') / pl.col('min_published_time_diff')).alias('published_time_diff_norm')
    ).drop('min_published_time_diff')

    # 1 if both premium and is_subscriber are true, otherwise 0
    both_true = pl.col('premium') & pl.col('is_subscriber')
    df = df.with_columns(
        pl.when(both_true).then(1).otherwise(0).alias('is_premium_subscriber')
    )

    article_category = articles.select(["article_id", "category"])
    df = df.join(article_category, how="left", on="article_id")

    # The joins must not change the number of rows
    assert n_rows_before == len(df)

    #logger.info(f'feat_article_simple shape: {df.shape}')

    return df

### Userが最後に読んだ記事の時間とarticleのpublishされた時間

In [22]:
def feat_user_last_impression_publish_time_diff(
    df,df_history,articles
    ):
    '''
    Add the number of days between the latest impression time in the user's
    history and the publication time of the article, as the column
    'user_last_impression_time-publish_time_diff'.
    '''

    n_rows_before = df.shape[0]

    history_long = df_history.explode(['impression_time_fixed', 'scroll_percentage_fixed', 'article_id_fixed','read_time_fixed'])
    history_long.columns = ['user_id', 'impression_time_history', 'scroll_percentage_history', 'article_id','read_time_history']

    # Latest impression_time_history per user_id
    last_seen = history_long.groupby('user_id').agg(
        pl.col('impression_time_history').max().alias('user_last_impression_time')
    )
    publish_times = articles.select(['article_id','published_time'])

    df = df.join(last_seen, on='user_id', how='left')
    df = df.join(publish_times, on='article_id', how='left')

    # Difference between user_last_impression_time and published_time, in days
    gap_in_days = (pl.col('user_last_impression_time') - pl.col('published_time')) / timedelta(days=1)
    df = df.with_columns(gap_in_days.alias('user_last_impression_time-publish_time_diff'))
    df = df.drop(['user_last_impression_time','published_time'])

    # The joins must not change the number of rows
    assert df.shape[0] == n_rows_before

    return df

### Articleの人気特徴量

In [23]:
# modeはtrainとか。各時間帯におけるinview(表示数)
def feat_article_pop(df:pl.DataFrame,
                     mode:str,
                     df_pop_path:str = '/home/data/article_pop_inview/',
                     metacol:str=None):
    '''
    Add pre-computed in-view counts of an article per time bucket.

    For every time interval the file
    {df_pop_path}{mode}_article_pop_inview_{interval}[_{metacol}].parquet is
    loaded and joined on article_id, the impression_time truncated to the
    interval and, if given, metacol. The count column
    rounded_{interval}_inview_count[_{metacol}] is added together with a
    *_norm column (count divided by the maximum within the impression).

    mode: file name prefix, e.g. 'train'.
    df_pop_path: directory prefix of the count files (including trailing '/').
    metacol: optional extra join column; selects the files with that suffix.

    Intervals whose file cannot be loaded are skipped with a warning, so the
    corresponding columns are then missing from the result.
    '''
    _len = len(df)

    #time_interval_list
    time_interval_list = ['1m','2m','3m','5m','10m','15m','20m','30m','1h','2h','3h','6h','12h','24h'] 

    # File name / column name suffix and extra join key when metacol is given
    suffix = f'_{metacol}' if metacol else ''
    extra_keys = [metacol] if metacol else []

    for time_interval in time_interval_list:
        file_path = f'{df_pop_path}{mode}_article_pop_inview_{time_interval}{suffix}.parquet'

        try:
            df_pop = load_my_vector_df(file_path)
        except Exception as exc:
            # Best effort: skip this interval. Unlike a bare `except:` this
            # lets KeyboardInterrupt/SystemExit through, and the real cause
            # is logged instead of always claiming the file was not found.
            logger.warning(f'{file_path} not found or unreadable ({exc!r}); skipping')
            continue

        # impression_time truncated to the start of its time bucket
        rounded_col = f'rounded_{time_interval}_datetime'
        df = df.with_columns(
            pl.col("impression_time").dt.truncate(time_interval).alias(rounded_col)
        )

        df = df.join(df_pop, on=['article_id', rounded_col] + extra_keys, how='left').drop(
            [rounded_col]
        )

        # Divide by the maximum within the impression
        colname = f'rounded_{time_interval}_inview_count{suffix}'

        df = df.join(
            df.groupby('impression_id').agg(
                pl.max(colname).alias(f'max_{colname}')
            ), on='impression_id', how='left'
        )

        df = df.with_columns(
            (pl.col(colname) / pl.col(f'max_{colname}')).alias(f'{colname}_norm')
        ).drop(f'max_{colname}')

    assert _len == len(df)

    return df

## 統計特徴量

In [24]:
# Quick-and-dirty summary statistics for a hand-picked set of strong features.
def add_statistic(df):
    """Add per-impression summary statistics and relative features.

    For each target column, max/min/mean/median/std/skew/kurtosis are computed
    within every ``impression_id`` and joined back as ``{col}_{stat}``.  For all
    targets except the three ``total_*`` columns, the difference and the ratio
    between the max/mean/median/min statistic and the row value are added as
    ``{col}_{stat}-{col}`` and ``{col}_{stat}/{col}``.

    Args:
        df: Frame containing ``impression_id`` and every target column.

    Returns:
        ``df`` with the statistic, difference and ratio columns appended.
    """
    # Note: there is no '2h' bucket in this list.
    bucket_names = ['1m', '2m', '3m', '5m', '10m', '15m', '20m', '30m', '1h', '3h', '6h', '12h', '24h']
    target_cols = (
        ['total_inviews', 'total_pageviews', 'total_read_time']
        + [f'rounded_{bucket}_inview_count' for bucket in bucket_names]
        + ['inview_count', 'past_inview_count', 'time_gap_to_next_inview_impression_time',
           'time_gap_from_prev_inview_impression_time', 'published_time_diff']
    )
    operations = ["max", "min", "mean", "median", "std", "skew", "kurtosis"]

    # One aggregation expression per (column, statistic) pair.
    aggregations = [
        getattr(pl.col(name), stat)().alias(f"{name}_{stat}")
        for name in target_cols
        for stat in operations
    ]

    # Aggregate within each impression and attach the result to every row of it.
    per_impression = df.groupby("impression_id").agg(aggregations)
    df = df.join(per_impression, on="impression_id", how="left")

    # Relative features; the total_* columns are left out because they seemed
    # to hurt accuracy.
    skipped = {'total_read_time', 'total_inviews', 'total_pageviews'}
    relative_exprs = []
    for name in target_cols:
        if name in skipped:
            continue
        for stat in ("max", "mean", "median", "min"):
            summary = pl.col(f"{name}_{stat}")
            relative_exprs.append((summary - pl.col(name)).alias(f"{name}_{stat}-{name}"))
            relative_exprs.append((summary / pl.col(name)).alias(f"{name}_{stat}/{name}"))

    return df.with_columns(relative_exprs)

### 過去のカテゴリの比率

In [25]:
# Information about the categories a user viewed in the past. This feature helps.
def add_past_category_ratios(df, df_history):
    """Add how often each user viewed the candidate article's category before.

    The history is exploded to one row per viewed article, joined with the
    module-level ``articles`` frame, and counted per ``(user_id, category)``.
    Three columns are joined onto ``df`` by ``user_id`` and ``category``:
    ``past_category_count``, ``max_past_category_count_count`` (the user's
    largest per-category count) and ``past_category_ratio`` (count / max).

    Args:
        df: Candidate frame; must already contain ``user_id`` and ``category``.
        df_history: History frame with ``user_id`` and the four list columns
            ``impression_time_fixed``, ``scroll_percentage_fixed``,
            ``article_id_fixed`` and ``read_time_fixed``.

    Returns:
        ``df`` with the three columns added (null where the user has no
        history in that category).
    """
    list_cols = ['impression_time_fixed','scroll_percentage_fixed','article_id_fixed','read_time_fixed']

    # Rename by name rather than by position so a different column order (or
    # an extra column) in the history file cannot silently mislabel columns.
    df_history = df_history.explode(list_cols).rename({
        'impression_time_fixed': 'impression_time_history',
        'scroll_percentage_fixed': 'scroll_percentage_history',
        'article_id_fixed': 'article_id',
        'read_time_fixed': 'read_time_history',
    })
    df_history = df_history.fill_null(0)

    # Attach article metadata to every history row, then count rows per user and category.
    df_history_articles = df_history.join(articles, on='article_id', how='left')
    user_category_df = df_history_articles.groupby(["user_id", "category"]).count().rename({"count": "past_category_count"})

    # Scale by the user's most-viewed category so the ratio lies in (0, 1].
    # The doubled "_count_count" suffix is kept as-is: it is an output column name.
    user_category_df = user_category_df.with_columns(pl.col("past_category_count").max().over("user_id").alias("max_past_category_count_count"))
    user_category_df = user_category_df.with_columns((pl.col("past_category_count") / pl.col("max_past_category_count_count")).alias("past_category_ratio"))

    # The analogous subcategory features are intentionally not computed here.
    # `category` is not joined from `articles` in this function; `df` must bring it.
    df = df.join(user_category_df, on=["user_id", "category"], how="left")

    return df

### inviewのカウント特徴量

In [26]:
# In-view count features. These knowingly include "leaky" information
# (total number of displays, the time of the neighbouring display, ...).
def calculate_inview_counts(df):
    """Add in-view count and time-gap features per (user, article) pair.

    Rows are sorted by ``impression_time`` and, within each
    ``(user_id, article_id)`` group, the following columns are added:

    * ``inview_count``: number of rows in the group.
    * ``past_inview_count``: running index of the row within the group.
    * ``time_gap_to_next_inview_impression_time``: hours between the row
      *before* this one in the group and this row (``shift(1)`` minus the
      current time).
    * ``time_gap_from_prev_inview_impression_time``: hours between this row
      and the row *after* it in the group (current time minus ``shift(-1)``).

    NOTE(review): given the ascending sort, the "next"/"prev" names are the
    reverse of what the shifts select. The names are output columns that
    other cells refer to, so they are left unchanged here.

    Args:
        df: Frame with ``impression_id``, ``user_id``, ``article_id`` and
            ``impression_time`` columns.

    Returns:
        The frame sorted by ``impression_time`` with the four columns added.
    """
    pair_keys = ["user_id", "article_id"]
    seen_at = pl.col("impression_time")
    one_hour = timedelta(hours=1)

    ordered = df.sort("impression_time")

    # Counts and neighbouring timestamps are independent of each other, so
    # they are computed in a single pass.
    ordered = ordered.with_columns([
        pl.col("impression_id").count().over(pair_keys).alias("inview_count"),
        pl.col("impression_id").cumcount().over(pair_keys).alias("past_inview_count"),
        seen_at.shift(1).over(pair_keys).alias("next_inview_impression_time"),
        seen_at.shift(-1).over(pair_keys).alias("prev_inview_impression_time"),
    ])

    # Convert the timestamp differences to hours.
    gap_to_next = (pl.col("next_inview_impression_time") - seen_at) / one_hour
    gap_from_prev = (seen_at - pl.col("prev_inview_impression_time")) / one_hour
    ordered = ordered.with_columns([
        gap_to_next.alias("time_gap_to_next_inview_impression_time"),
        gap_from_prev.alias("time_gap_from_prev_inview_impression_time"),
    ])

    # The helper timestamp columns are not features themselves.
    return ordered.drop(["next_inview_impression_time", "prev_inview_impression_time"])

### あらいさんのcossim by category

In [27]:
# Reported effect: roughly +0.003 (the metric is not stated here).
# Category columns currently used by the caller: ["device_type", "is_subscriber", "is_sso_user"]
# Builds a user embedding as a weighted average driven by the category values,
# then takes its cosine similarity with the candidate article's embedding.
def add_cossim_by_category(behaviors_df, history_df, emb_df, cossim_name, cat_col):
    """Add a cosine-similarity feature between a category-weighted user embedding and each article.

    Args:
        behaviors_df: Candidate rows; must contain ``user_id``, ``article_id`` and ``cat_col``.
        history_df: History with one row per viewed article; joined on ``user_id`` and ``article_id``.
        emb_df: Article embeddings keyed by ``article_id``; every other column is treated as a dimension.
        cossim_name: Name of the output column.
        cat_col: Categorical column of ``behaviors_df`` that drives the weights.

    Returns:
        ``behaviors_df`` with one extra column named ``cossim_name``.
    """
    # weight: one indicator column per distinct value of cat_col
    cat_vals = behaviors_df[cat_col].unique().to_list()
    # print(f"cat_vals: {cat_vals}")
    cat_weight_df = behaviors_df.select(["user_id", cat_col])
    weight_cols = []
    for cat_val in cat_vals:
        cat_weight_df = cat_weight_df.with_columns((pl.col(cat_col)==cat_val).alias(f"{cat_col}__{cat_val}"))
        weight_cols.append(f"{cat_col}__{cat_val}")
    cat_weight_df = cat_weight_df.groupby("user_id").agg([pl.col(col).mean() for col in weight_cols]) # per user: share of rows with each category value

    # add user emb: attach the per-user weights to every history row
    _history_df = history_df.join(
        cat_weight_df,
        on="user_id",
        how="left",
    )

    # Attach the embedding of each history article, prefixed with "userx__".
    user_emb_df = _history_df.join(
        emb_df.rename({col: f"userx__{col}" for col in emb_df.columns if col != "article_id"}),
        on="article_id",
        how="left",
    )
    user_emb_cols = [col for col in user_emb_df.columns if "userx__" in col]

    # weighted category emb
    weighted_emb = None
    for weight_col in weight_cols:
        # Weighted average of the history embeddings, using weight_col as the weight.
        # NOTE(review): there is no group-by here, so both sums run over every row of
        # user_emb_df rather than per user — confirm this is intended.
        _weighted_emb = (user_emb_df.select(user_emb_cols) * user_emb_df[weight_col]).sum() / user_emb_df[weight_col].sum()
        if weighted_emb is not None:
            weighted_emb += cat_weight_df[weight_col] * _weighted_emb
        else:
            weighted_emb = cat_weight_df[weight_col] * _weighted_emb # then scale by each user's weight for this category value
    # Relies on weighted_emb having one row per row of cat_weight_df, in the same order.
    weighted_emb = pl.concat([cat_weight_df.select("user_id"), weighted_emb], how="horizontal")
    _behaviors_df = behaviors_df.join(weighted_emb, on="user_id", how="left")

    # article emb: embedding of the candidate article, prefixed with "article__"
    _behaviors_df = _behaviors_df.join(
        emb_df.rename({col: f"article__{col}" for col in emb_df.columns if col != "article_id"}),
        on="article_id",
        how="left",
    )
    article_emb_cols = [col for col in _behaviors_df.columns if "article__" in col]

    # cossim: row-wise dot product divided by the product of the two row norms
    cossims = (_behaviors_df[user_emb_cols] * _behaviors_df[article_emb_cols]).sum(axis=1) / \
        (np.linalg.norm(_behaviors_df[user_emb_cols], axis=1) * np.linalg.norm(_behaviors_df[article_emb_cols], axis=1))
    # Assigned positionally: assumes the left joins above kept behaviors_df's row order.
    behaviors_df = behaviors_df.with_columns(
        pl.lit(cossims).alias(cossim_name),
    )

    return behaviors_df

In [28]:
def normalize_rank_by_percentage(df):
    """Rescale the ``*_rank`` features by the number of rows in their impression.

    Every rank column listed below is divided by the row count of its
    ``impression_id`` group and overwritten in place.

    Args:
        df: Frame with ``impression_id`` and all listed ``*_rank`` columns.

    Returns:
        ``df`` with the rank columns replaced by rank / group size.
    """
    # Note: there is no '2h' bucket in this list.
    buckets = ['1m', '2m', '3m', '5m', '10m', '15m', '20m', '30m', '1h', '3h', '6h', '12h', '24h']
    ranked_features = (
        ['published_time_diff', 'total_inviews_norm', 'total_pageviews_norm', 'total_read_time_norm']
        + [f'rounded_{bucket}_inview_count' for bucket in buckets]
        + ['inview_count', 'past_inview_count',
           'time_gap_to_next_inview_impression_time', 'time_gap_from_prev_inview_impression_time']
        + [f'next_impression_id_{method}_cossim' for method in ('mean', 'min', 'max')]
    )
    rank_cols = [f'{feature}_rank' for feature in ranked_features]

    # Number of rows per impression, attached to every row of that impression.
    group_sizes = df.groupby("impression_id").count().rename({"count": "impression_id_article_counts"})
    with_sizes = df.join(group_sizes, how="left", on="impression_id")

    group_size = pl.col("impression_id_article_counts")
    scaled = with_sizes.with_columns([pl.col(name) / group_size for name in rank_cols])

    # The helper count column is not a feature.
    return scaled.drop("impression_id_article_counts")

In [29]:
def add_next_impression_id_cossim(
        df: pl.DataFrame, 
        df_vec: pl.DataFrame, 
        method: str = "mean"
) -> pl.DataFrame:
    """Add the cosine similarity between each article and the next impression's articles.

    Within every ``session_id`` the impressions are ordered by
    ``impression_time`` and each one is linked to the impression that follows
    it. The embeddings of that following impression's articles are reduced
    per dimension with ``method``, and the cosine similarity between this
    vector and each current article's embedding is stored as
    ``next_impression_id_{method}_cossim``.

    Args:
        df: Frame with ``impression_id``, ``session_id``, ``impression_time``
            and ``article_id`` columns.
        df_vec: Article embeddings keyed by ``article_id``; the dimension
            columns are those whose name contains ``"vector"``.
        method: Per-dimension reduction, one of ``"mean"``, ``"min"``, ``"max"``.

    Returns:
        ``df`` sorted by ``session_id`` and ``impression_time`` with the new
        column added; rows whose impression has no successor get null.

    Raises:
        ValueError: If ``method`` is not one of the supported reductions.
    """
    # Validate before any heavy work; an unknown method used to surface only
    # later as an UnboundLocalError.
    aggregators = {"mean": pl.mean, "min": pl.min, "max": pl.max}
    if method not in aggregators:
        raise ValueError(
            f"unsupported method {method!r}; expected one of {sorted(aggregators)}"
        )
    aggregate = aggregators[method]

    colname = f"next_impression_id_{method}_cossim"

    df = df.sort(["session_id", "impression_time"], descending=[False, False])

    # One row per impression, linked to the following impression of its session.
    tmp_df = df.select(["impression_id", "session_id", "impression_time"])
    tmp_df = tmp_df.unique(["impression_id", "session_id", "impression_time"])
    tmp_df = tmp_df.sort(["session_id", "impression_time"], descending=[False, False])
    tmp_df = tmp_df.with_columns(pl.col("impression_id").shift(-1).over("session_id").alias("next_impression_id"))

    df = df.join(tmp_df.select(["impression_id", "next_impression_id"]), how="left", on="impression_id")

    # Articles of every impression that is somebody's "next" impression.
    df_next_impression_id = df.filter(df["impression_id"].is_in(set(df["next_impression_id"]))).select(["impression_id", "article_id"])
    df_next_impression_id = df_next_impression_id.join(df_vec, how="left", on="article_id")

    vec_cols = [col for col in df_next_impression_id.columns if 'vector' in col]

    # Collapse those articles into one vector per impression.
    next_impression_id_emb = df_next_impression_id.groupby('impression_id').agg(
        *[aggregate(col).alias(col) for col in vec_cols]
    )
    next_impression_id_emb.columns = [f'next_impression_id_{col}' if col != 'impression_id' else col for col in next_impression_id_emb.columns]

    # Embedding of every current (impression, article) row.
    df_current_impression_id = df.select(["impression_id", "article_id", "next_impression_id"])
    df_current_impression_id = df_current_impression_id.join(df_vec, how="left", on="article_id")

    df_current_impression_id.columns = [f'current_impression_id_{col}' if col not in ['article_id', 'impression_id', 'next_impression_id'] else col for col in df_current_impression_id.columns]

    # Pair each current row with the reduced vector of its next impression;
    # rows without a successor are dropped here and end up null after the final join.
    df_crossed = df_current_impression_id.join(next_impression_id_emb, how="left", left_on=["next_impression_id"], right_on=["impression_id"])
    df_crossed = df_crossed.drop_nulls(["next_impression_id"])

    # The trailing underscore keeps the plain `next_impression_id` key out of the vector columns.
    current_impression_id_vec = df_crossed.select([col for col in df_crossed.columns if 'current_impression_id_' in col]).to_numpy()
    next_impression_id_vec = df_crossed.select([col for col in df_crossed.columns if 'next_impression_id_' in col]).to_numpy()

    similarity = F.cosine_similarity(
        torch.tensor(current_impression_id_vec), torch.tensor(next_impression_id_vec), dim=1)

    # Attach the similarity as a feature of the (impression, article) rows.
    df_crossed = df_crossed.with_columns(
        pl.Series(similarity.numpy()).alias(colname)
    )

    feat_df = df_crossed.select(['impression_id','article_id',colname])
    df = df.join(feat_df, how="left", on=["impression_id", "article_id"]).drop("next_impression_id")
    return df

In [30]:
def add_published_time_features(
        df: pl.DataFrame, 
        df_history: pl.DataFrame
) -> pl.DataFrame:
    """Add per-user statistics of article age at the time it was viewed.

    The history is exploded to one row per viewed article and joined with the
    module-level ``articles`` frame. The difference between view time and
    ``published_time`` is expressed in days and summarised per ``user_id``
    as ``{mean,max,min,median}_history_published_time_diff``.

    Args:
        df: Candidate frame with a ``user_id`` column.
        df_history: History frame with ``user_id`` and the four ``*_fixed``
            list columns.

    Returns:
        ``df`` with the four per-user columns left-joined on ``user_id``.
    """
    list_cols = ["impression_time_fixed", "scroll_percentage_fixed", "article_id_fixed", "read_time_fixed"]
    # Strip the "_fixed" suffix, e.g. "article_id_fixed" -> "article_id".
    plain_names = {name: name[:-len("_fixed")] for name in list_cols}

    history_rows = df_history.explode(list_cols).rename(plain_names)
    history_rows = history_rows.join(articles, how="left", on="article_id")

    # Age of the article, in days, when the user saw it.
    age_in_days = (pl.col('impression_time') - pl.col('published_time')) / timedelta(days=1)
    history_rows = history_rows.with_columns(age_in_days.alias('published_time_diff'))

    per_user = history_rows.groupby(["user_id"]).agg([
        getattr(pl.col("published_time_diff"), stat)().alias(f"{stat}_history_published_time_diff")
        for stat in ("mean", "max", "min", "median")
    ])

    return df.join(per_user, how="left", on="user_id")

## Run Feature Engineering

In [32]:
def create_feature(
        df: pl.DataFrame,
        df_history: pl.DataFrame,
        mode: str = 'train',
        logging: bool = True
) -> pl.DataFrame:
    """Run the whole feature-engineering pipeline on one chunk of candidates.

    The individual feature functions are applied in a fixed order (several
    later steps consume columns produced by earlier ones). Besides its
    arguments the function reads the module-level ``articles``,
    ``common_vec_dict`` and ``my_vec_dict`` objects.

    Args:
        df: Candidate rows (one per impression/article pair).
        df_history: Click history of the users contained in ``df``.
        mode: Dataset name passed on to the mode-dependent steps and used to
            pick the entry of each ``my_vec_dict`` value.
        logging: When true, log the elapsed seconds and frame shape after
            every step. (The name shadows the ``logging`` module inside this
            function; it is kept for backward compatibility.)

    Returns:
        The frame with all feature columns added.
    """
    if logging:
        logger.info('start:feature_engineering')
        logger.info(f'df shape: {df.shape}')
    current_time = datetime.now()

    def _log_step(step, header=None):
        # Log the time spent since the previous checkpoint and the current shape of df.
        nonlocal current_time
        if not logging:
            return
        if header is not None:
            logger.info(header)
        elapsed_seconds = (datetime.now() - current_time).seconds
        current_time = datetime.now()
        logger.info(f'{step}/elapsed_seconds: {elapsed_seconds}/df shape: {df.shape}')

    df = feat_clicked_history_count(df,df_history)
    df = feat_clicked_history_count_by_article(df,df_history)
    _log_step('clicked_history_count')

    df = feat_inview_cooccur(df,mode)
    _log_step('inview_cooccur')

    df = feat_article_simple(df,articles)
    _log_step('feat_article_simple')

    df = feat_user_last_impression_publish_time_diff(df,df_history,articles)
    _log_step('user_last_impression_publish_time_diff')

    df = feat_article_pop(df,mode,df_pop_path='/home/data/article_pop_inview/',metacol=None)
    _log_step('article_pop')

    df = add_past_category_ratios(df,df_history)
    _log_step('add_past_category_ratios')

    for method in ("mean", "min", "max"):
        df = add_next_impression_id_cossim(df, common_vec_dict["multilingual-e5-large-instruct"], method=method)
    _log_step('add_next_impression_id_cossim')

    # Similarities based on the shared (pre-trained) embeddings.
    for k,v in common_vec_dict.items():
        for agg_type in ['mean','scroll_mean','read_time_mean']:
            for window in [1,0]:
                # window == 1 is only computed for the plain mean.
                if window == 1 and agg_type != 'mean':
                    continue

                df = feat_clicked_cossim(df,df_history,v,f'click_cossim_{k}_{agg_type}_{window}',agg_type,window)
            _log_step('clicked_inview_common_cossim', header=f'k: {k}, type: {agg_type}')

            df = feat_clicked_category_cossim(df,df_history,v,f'category_click_cossim_{k}_{agg_type}',agg_type,window=0)
            _log_step('category_click_cossim', header=f'k: {k}, type: {agg_type}')

        df = feat_inview_cossim(df,v,f'inview_cossim_{k}')
        _log_step('inview_my_cossim', header=f'k: {k}')

        df = add_cossim_individual_features(df, df_history, v, f"{k}_individual")
        _log_step('add_cossim_individual_features', header=f'k: {k}')

    # Click-based similarities using the embeddings in my_vec_dict (one per mode).
    for k,v in my_vec_dict.items():
        for agg_type in ['mean','scroll_mean','read_time_mean']:
            for window in [1,2,3,0]:
                if window == 1 and agg_type != 'mean':
                    continue
                df = feat_clicked_cossim(df,df_history,v[mode],f'click_cossim_{k}_{agg_type}_{window}',agg_type,window)
                _log_step('clicked_inview_my_cossim', header=f'k: {k}, type: {agg_type}')

    # The exploded, time-ordered history does not depend on the embedding or
    # the category column, so build it once instead of once per combination.
    exploded_history = df_history.rename(
        {"impression_time_fixed": "impression_time",
         "scroll_percentage_fixed": "scroll_percentage",
         "article_id_fixed": "article_id",
         "read_time_fixed": "read_time"}
    ).explode(
        ["impression_time", "scroll_percentage", "article_id", "read_time"]
    ).sort(
        ["user_id", "impression_time"]
    )

    cat_cols = ["device_type", "is_subscriber", "is_sso_user"]
    for k,v in common_vec_dict.items():
        if k == 'one_hot':
            continue
        for cat_col in cat_cols:
            df = add_cossim_by_category(df, exploded_history, v, f"cossim__{k}__{cat_col}", cat_col)
            _log_step('add_cossim_by_category', header=f'k: {k}, cat_col: {cat_col}')

    df = calculate_inview_counts(df)
    _log_step('calculate_inview_counts')

    # Cast boolean columns to Int32 before the statistics are computed.
    bool_cols = [col for col in df.columns if df[col].dtype == pl.Boolean]
    if bool_cols:
        df = df.with_columns([pl.col(col).cast(pl.Int32) for col in bool_cols])

    df = add_statistic(df)
    _log_step('statistic')

    df = df.with_columns([
        (pl.col('total_pageviews') / pl.col('total_inviews')).alias('pageviews/inviews'),
        (pl.col('total_read_time') / pl.col('total_inviews')).alias('read_time/inviews'),
        # Overwritten below by the descending rank of the same column; kept so
        # the column keeps its position in the frame.
        pl.col('published_time_diff').rank().over('impression_id').alias('published_time_diff_rank'),
    ])

    rank_cols = ['total_inviews_norm', 'total_pageviews_norm', 'total_read_time_norm', 'rounded_1m_inview_count', 'rounded_2m_inview_count',
                 'rounded_3m_inview_count','rounded_5m_inview_count','rounded_10m_inview_count','rounded_15m_inview_count','rounded_20m_inview_count',
                 'rounded_30m_inview_count','rounded_1h_inview_count','rounded_3h_inview_count','rounded_6h_inview_count','rounded_12h_inview_count','rounded_24h_inview_count',
                 'inview_count', 'past_inview_count', 'time_gap_to_next_inview_impression_time', 'time_gap_from_prev_inview_impression_time', 
                 'next_impression_id_mean_cossim', 'next_impression_id_min_cossim', 'next_impression_id_max_cossim', 
                 'published_time_diff','published_time_diff_min-published_time_diff'
                ]
    # Descending rank of every listed feature within its impression.
    df = df.with_columns([
        pl.col(col).rank(descending=True).over('impression_id').alias(f'{col}_rank')
        for col in rank_cols
    ])

    # Calendar parts of the impression time.
    df = df.with_columns([
        pl.col('impression_time').dt.hour().alias('hour'),
        pl.col('impression_time').dt.minute().alias('minute'),
        pl.col('impression_time').dt.second().alias('second'),
        pl.col('impression_time').dt.weekday().alias('weekday')
    ])

    df = normalize_rank_by_percentage(df)

    # Per-article statistics of `count_article_ids_inview` over the rows of this frame.
    grouped_df = df.groupby("article_id").agg(
        [
            pl.col("count_article_ids_inview").mean().alias("mean_count_article_ids_inview"),
            pl.col("count_article_ids_inview").min().alias("min_count_article_ids_inview"),
            pl.col("count_article_ids_inview").median().alias("median_count_article_ids_inview"),
        ]
    )
    df = df.join(grouped_df, on=["article_id"], how="left")

    df = add_published_time_features(df, df_history)

    return df

# Train

In [33]:
import gc

# Number of user-disjoint chunks the training set is processed in.
n_chunks = 100

trn_df = get_target_df(trn_behaviors)
unique_user_ids = trn_df["user_id"].unique().to_numpy()

# Users are assigned to chunks at random.
# NOTE: no seed is set, so chunk membership differs between runs.
np.random.shuffle(unique_user_ids)
user_id_splits = np.array_split(unique_user_ids, n_chunks)

# Create the output folder up front so a missing directory cannot fail a
# chunk after its features have already been computed.
chunk_dir = Path("./trn_large_chunks")
chunk_dir.mkdir(parents=True, exist_ok=True)

for _chunk, _chunk_user_ids in enumerate(user_id_splits):

    logger.info(f"Starting Chunk {_chunk}")

    # Build the membership set once and use it for both filters.
    chunk_users = set(_chunk_user_ids)
    df_chunk = trn_df.filter(pl.col("user_id").is_in(chunk_users))
    df_history_chunk = trn_history.filter(pl.col("user_id").is_in(chunk_users))
    df_chunk = create_feature(df_chunk,df_history_chunk,mode='train')

    df_chunk.write_parquet(chunk_dir / f"trn_df_chunk{_chunk}.parquet")

    # Release the chunk before the next one is built to keep peak memory down.
    del df_chunk
    gc.collect()

[2024-06-17 00:33:55,133][INFO] Starting Chunk 0
[2024-06-17 00:33:56,755][INFO] start:feature_engineering
[2024-06-17 00:33:56,756][INFO] df shape: (1363705, 15)
[2024-06-17 00:33:58,185][INFO] clicked_history_count/elapsed_seconds: 1/df shape: (1363705, 18)
[2024-06-17 00:34:00,327][INFO] inview_cooccur/elapsed_seconds: 2/df shape: (1363705, 28)
[2024-06-17 00:34:00,559][INFO] feat_article_simple/elapsed_seconds: 0/df shape: (1363705, 43)
[2024-06-17 00:34:00,673][INFO] user_last_impression_publish_time_diff/elapsed_seconds: 0/df shape: (1363705, 44)
[2024-06-17 00:34:04,482][INFO] article_pop/elapsed_seconds: 3/df shape: (1363705, 72)
[2024-06-17 00:34:09,005][INFO] add_past_category_ratios/elapsed_seconds: 4/df shape: (1363705, 75)
[2024-06-17 00:34:23,663][INFO] add_next_impression_id_cossim/elapsed_seconds: 14/df shape: (1363705, 78)
[2024-06-17 00:34:36,420][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 00:34:36,425][INFO] clicked_inview_common_cossim/elapsed_s

[2024-06-17 00:41:23,237][INFO] k: item2vec, type: read_time_mean
[2024-06-17 00:41:23,238][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1356521, 105)
[2024-06-17 00:41:24,704][INFO] k: item2vec, type: read_time_mean
[2024-06-17 00:41:24,705][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1356521, 106)
[2024-06-17 00:41:42,366][INFO] k: multilingual-e5-large-instruct, cat_col: device_type
[2024-06-17 00:41:42,367][INFO] add_cossim_by_category/elapsed_seconds: 17/df shape: (1356521, 107)
[2024-06-17 00:41:58,243][INFO] k: multilingual-e5-large-instruct, cat_col: is_subscriber
[2024-06-17 00:41:58,245][INFO] add_cossim_by_category/elapsed_seconds: 15/df shape: (1356521, 108)
[2024-06-17 00:42:14,324][INFO] k: multilingual-e5-large-instruct, cat_col: is_sso_user
[2024-06-17 00:42:14,325][INFO] add_cossim_by_category/elapsed_seconds: 16/df shape: (1356521, 109)
[2024-06-17 00:42:18,438][INFO] calculate_inview_counts/elapsed_seconds: 4/df shape: (1356521, 113)


[2024-06-17 00:50:02,997][INFO] k: item2vec, type: mean
[2024-06-17 00:50:02,998][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1323710, 98)
[2024-06-17 00:50:04,302][INFO] k: item2vec, type: mean
[2024-06-17 00:50:04,303][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1323710, 99)
[2024-06-17 00:50:05,494][INFO] k: item2vec, type: mean
[2024-06-17 00:50:05,495][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1323710, 100)
[2024-06-17 00:50:06,801][INFO] k: item2vec, type: scroll_mean
[2024-06-17 00:50:06,802][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1323710, 101)
[2024-06-17 00:50:08,081][INFO] k: item2vec, type: scroll_mean
[2024-06-17 00:50:08,082][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1323710, 102)
[2024-06-17 00:50:09,624][INFO] k: item2vec, type: scroll_mean
[2024-06-17 00:50:09,625][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1323710, 103)
[2024-06-17 00:50:10,956][INFO] k: item

[2024-06-17 00:57:18,698][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 00:57:18,700][INFO] category_click_cossim/elapsed_seconds: 11/df shape: (1356663, 83)
[2024-06-17 00:57:28,680][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 00:57:28,681][INFO] clicked_inview_common_cossim/elapsed_seconds: 9/df shape: (1356663, 84)
[2024-06-17 00:57:39,592][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 00:57:39,593][INFO] category_click_cossim/elapsed_seconds: 10/df shape: (1356663, 85)
[2024-06-17 00:57:59,043][INFO] k: multilingual-e5-large-instruct
[2024-06-17 00:57:59,044][INFO] inview_my_cossim/elapsed_seconds: 19/df shape: (1356663, 88)
[2024-06-17 00:58:53,251][INFO] k: multilingual-e5-large-instruct
[2024-06-17 00:58:53,252][INFO] add_cossim_individual_features/elapsed_seconds: 54/df shape: (1356663, 96)
[2024-06-17 00:58:54,698][INFO] k: item2vec, type: mean
[2024-06-17 00:58:54,699][INFO] clicked_inview_my_c

[2024-06-17 01:05:10,583][INFO] feat_article_simple/elapsed_seconds: 0/df shape: (1328320, 43)
[2024-06-17 01:05:10,698][INFO] user_last_impression_publish_time_diff/elapsed_seconds: 0/df shape: (1328320, 44)
[2024-06-17 01:05:14,547][INFO] article_pop/elapsed_seconds: 3/df shape: (1328320, 72)
[2024-06-17 01:05:18,920][INFO] add_past_category_ratios/elapsed_seconds: 4/df shape: (1328320, 75)
[2024-06-17 01:05:33,139][INFO] add_next_impression_id_cossim/elapsed_seconds: 14/df shape: (1328320, 78)
[2024-06-17 01:05:45,666][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 01:05:45,668][INFO] clicked_inview_common_cossim/elapsed_seconds: 12/df shape: (1328320, 80)
[2024-06-17 01:05:52,576][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 01:05:52,578][INFO] category_click_cossim/elapsed_seconds: 6/df shape: (1328320, 81)
[2024-06-17 01:06:02,966][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 01:06:02,969][INFO] clicked_inview_common_c

[2024-06-17 01:12:42,317][INFO] k: multilingual-e5-large-instruct, cat_col: device_type
[2024-06-17 01:12:42,318][INFO] add_cossim_by_category/elapsed_seconds: 17/df shape: (1338892, 107)
[2024-06-17 01:12:57,824][INFO] k: multilingual-e5-large-instruct, cat_col: is_subscriber
[2024-06-17 01:12:57,825][INFO] add_cossim_by_category/elapsed_seconds: 15/df shape: (1338892, 108)
[2024-06-17 01:13:13,558][INFO] k: multilingual-e5-large-instruct, cat_col: is_sso_user
[2024-06-17 01:13:13,560][INFO] add_cossim_by_category/elapsed_seconds: 15/df shape: (1338892, 109)
[2024-06-17 01:13:17,613][INFO] calculate_inview_counts/elapsed_seconds: 4/df shape: (1338892, 113)
[2024-06-17 01:13:21,758][INFO] statistic/elapsed_seconds: 4/df shape: (1338892, 404)
[2024-06-17 01:13:52,988][INFO] Starting Chunk 9
[2024-06-17 01:13:54,591][INFO] start:feature_engineering
[2024-06-17 01:13:54,592][INFO] df shape: (1312535, 15)
[2024-06-17 01:13:55,987][INFO] clicked_history_count/elapsed_seconds: 1/df shape: (1

[2024-06-17 01:21:54,040][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1343072, 100)
[2024-06-17 01:21:55,382][INFO] k: item2vec, type: scroll_mean
[2024-06-17 01:21:55,384][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1343072, 101)
[2024-06-17 01:21:56,780][INFO] k: item2vec, type: scroll_mean
[2024-06-17 01:21:56,782][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1343072, 102)
[2024-06-17 01:21:58,339][INFO] k: item2vec, type: scroll_mean
[2024-06-17 01:21:58,340][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1343072, 103)
[2024-06-17 01:21:59,731][INFO] k: item2vec, type: read_time_mean
[2024-06-17 01:21:59,732][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1343072, 104)
[2024-06-17 01:22:01,132][INFO] k: item2vec, type: read_time_mean
[2024-06-17 01:22:01,133][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1343072, 105)
[2024-06-17 01:22:02,699][INFO] k: item2vec, type: read_time_mean
[2024-0

[2024-06-17 01:30:38,828][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 01:30:38,829][INFO] category_click_cossim/elapsed_seconds: 15/df shape: (1307047, 85)
[2024-06-17 01:31:01,432][INFO] k: multilingual-e5-large-instruct
[2024-06-17 01:31:01,434][INFO] inview_my_cossim/elapsed_seconds: 22/df shape: (1307047, 88)
[2024-06-17 01:32:03,981][INFO] k: multilingual-e5-large-instruct
[2024-06-17 01:32:03,983][INFO] add_cossim_individual_features/elapsed_seconds: 62/df shape: (1307047, 96)
[2024-06-17 01:32:05,416][INFO] k: item2vec, type: mean
[2024-06-17 01:32:05,417][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1307047, 97)
[2024-06-17 01:32:06,664][INFO] k: item2vec, type: mean
[2024-06-17 01:32:06,665][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1307047, 98)
[2024-06-17 01:32:07,917][INFO] k: item2vec, type: mean
[2024-06-17 01:32:07,918][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1307047, 99)
[2024-06-17 01

[2024-06-17 01:39:40,733][INFO] add_past_category_ratios/elapsed_seconds: 4/df shape: (1325501, 75)
[2024-06-17 01:39:57,285][INFO] add_next_impression_id_cossim/elapsed_seconds: 16/df shape: (1325501, 78)
[2024-06-17 01:40:12,964][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 01:40:12,965][INFO] clicked_inview_common_cossim/elapsed_seconds: 15/df shape: (1325501, 80)
[2024-06-17 01:40:20,600][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 01:40:20,601][INFO] category_click_cossim/elapsed_seconds: 7/df shape: (1325501, 81)
[2024-06-17 01:40:32,316][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 01:40:32,318][INFO] clicked_inview_common_cossim/elapsed_seconds: 11/df shape: (1325501, 82)
[2024-06-17 01:40:44,406][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 01:40:44,407][INFO] category_click_cossim/elapsed_seconds: 12/df shape: (1325501, 83)
[2024-06-17 01:40:55,015][INFO] k: multilingual-e5-large-instru

[2024-06-17 01:48:36,760][INFO] add_cossim_by_category/elapsed_seconds: 20/df shape: (1332832, 108)
[2024-06-17 01:48:53,556][INFO] k: multilingual-e5-large-instruct, cat_col: is_sso_user
[2024-06-17 01:48:53,558][INFO] add_cossim_by_category/elapsed_seconds: 16/df shape: (1332832, 109)
[2024-06-17 01:48:57,709][INFO] calculate_inview_counts/elapsed_seconds: 4/df shape: (1332832, 113)
[2024-06-17 01:49:01,916][INFO] statistic/elapsed_seconds: 4/df shape: (1332832, 404)
[2024-06-17 01:49:34,012][INFO] Starting Chunk 16
[2024-06-17 01:49:35,628][INFO] start:feature_engineering
[2024-06-17 01:49:35,629][INFO] df shape: (1360448, 15)
[2024-06-17 01:49:37,097][INFO] clicked_history_count/elapsed_seconds: 1/df shape: (1360448, 18)
[2024-06-17 01:49:39,327][INFO] inview_cooccur/elapsed_seconds: 2/df shape: (1360448, 28)
[2024-06-17 01:49:39,569][INFO] feat_article_simple/elapsed_seconds: 0/df shape: (1360448, 43)
[2024-06-17 01:49:39,687][INFO] user_last_impression_publish_time_diff/elapsed_s

[2024-06-17 01:58:19,155][INFO] k: item2vec, type: scroll_mean
[2024-06-17 01:58:19,156][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1350821, 102)
[2024-06-17 01:58:20,697][INFO] k: item2vec, type: scroll_mean
[2024-06-17 01:58:20,698][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1350821, 103)
[2024-06-17 01:58:22,139][INFO] k: item2vec, type: read_time_mean
[2024-06-17 01:58:22,140][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1350821, 104)
[2024-06-17 01:58:23,418][INFO] k: item2vec, type: read_time_mean
[2024-06-17 01:58:23,419][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1350821, 105)
[2024-06-17 01:58:24,957][INFO] k: item2vec, type: read_time_mean
[2024-06-17 01:58:24,958][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1350821, 106)
[2024-06-17 01:58:47,476][INFO] k: multilingual-e5-large-instruct, cat_col: device_type
[2024-06-17 01:58:47,479][INFO] add_cossim_by_category/elapsed_seconds: 22/df shape

[2024-06-17 02:07:36,512][INFO] k: multilingual-e5-large-instruct
[2024-06-17 02:07:36,515][INFO] inview_my_cossim/elapsed_seconds: 24/df shape: (1309373, 88)
[2024-06-17 02:08:35,664][INFO] k: multilingual-e5-large-instruct
[2024-06-17 02:08:35,665][INFO] add_cossim_individual_features/elapsed_seconds: 59/df shape: (1309373, 96)
[2024-06-17 02:08:37,194][INFO] k: item2vec, type: mean
[2024-06-17 02:08:37,195][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1309373, 97)
[2024-06-17 02:08:38,499][INFO] k: item2vec, type: mean
[2024-06-17 02:08:38,500][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1309373, 98)
[2024-06-17 02:08:39,748][INFO] k: item2vec, type: mean
[2024-06-17 02:08:39,750][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1309373, 99)
[2024-06-17 02:08:41,062][INFO] k: item2vec, type: mean
[2024-06-17 02:08:41,064][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1309373, 100)
[2024-06-17 02:08:42,616][INFO] k: item2vec

[2024-06-17 02:16:32,439][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 02:16:32,441][INFO] clicked_inview_common_cossim/elapsed_seconds: 15/df shape: (1342089, 80)
[2024-06-17 02:16:41,183][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 02:16:41,185][INFO] category_click_cossim/elapsed_seconds: 8/df shape: (1342089, 81)
[2024-06-17 02:16:54,375][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 02:16:54,378][INFO] clicked_inview_common_cossim/elapsed_seconds: 13/df shape: (1342089, 82)
[2024-06-17 02:17:07,958][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 02:17:07,960][INFO] category_click_cossim/elapsed_seconds: 13/df shape: (1342089, 83)
[2024-06-17 02:17:21,790][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 02:17:21,791][INFO] clicked_inview_common_cossim/elapsed_seconds: 13/df shape: (1342089, 84)
[2024-06-17 02:17:36,657][INFO] k: multilingual-e5-large-instruct, type: rea

[2024-06-17 02:25:31,176][INFO] calculate_inview_counts/elapsed_seconds: 4/df shape: (1362611, 113)
[2024-06-17 02:25:35,855][INFO] statistic/elapsed_seconds: 4/df shape: (1362611, 404)
[2024-06-17 02:26:13,969][INFO] Starting Chunk 23
[2024-06-17 02:26:15,623][INFO] start:feature_engineering
[2024-06-17 02:26:15,625][INFO] df shape: (1348219, 15)
[2024-06-17 02:26:17,105][INFO] clicked_history_count/elapsed_seconds: 1/df shape: (1348219, 18)
[2024-06-17 02:26:19,516][INFO] inview_cooccur/elapsed_seconds: 2/df shape: (1348219, 28)
[2024-06-17 02:26:19,761][INFO] feat_article_simple/elapsed_seconds: 0/df shape: (1348219, 43)
[2024-06-17 02:26:19,876][INFO] user_last_impression_publish_time_diff/elapsed_seconds: 0/df shape: (1348219, 44)
[2024-06-17 02:26:23,825][INFO] article_pop/elapsed_seconds: 3/df shape: (1348219, 72)
[2024-06-17 02:26:28,581][INFO] add_past_category_ratios/elapsed_seconds: 4/df shape: (1348219, 75)
[2024-06-17 02:26:48,977][INFO] add_next_impression_id_cossim/elaps

[2024-06-17 02:35:06,887][INFO] k: item2vec, type: read_time_mean
[2024-06-17 02:35:06,889][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1375439, 104)
[2024-06-17 02:35:08,644][INFO] k: item2vec, type: read_time_mean
[2024-06-17 02:35:08,647][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1375439, 105)
[2024-06-17 02:35:10,271][INFO] k: item2vec, type: read_time_mean
[2024-06-17 02:35:10,272][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1375439, 106)
[2024-06-17 02:35:34,193][INFO] k: multilingual-e5-large-instruct, cat_col: device_type
[2024-06-17 02:35:34,194][INFO] add_cossim_by_category/elapsed_seconds: 23/df shape: (1375439, 107)
[2024-06-17 02:35:53,312][INFO] k: multilingual-e5-large-instruct, cat_col: is_subscriber
[2024-06-17 02:35:53,314][INFO] add_cossim_by_category/elapsed_seconds: 19/df shape: (1375439, 108)
[2024-06-17 02:36:09,120][INFO] k: multilingual-e5-large-instruct, cat_col: is_sso_user
[2024-06-17 02:36:09,121][INFO] a

[2024-06-17 02:45:28,017][INFO] k: item2vec, type: mean
[2024-06-17 02:45:28,018][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1330597, 97)
[2024-06-17 02:45:29,304][INFO] k: item2vec, type: mean
[2024-06-17 02:45:29,305][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1330597, 98)
[2024-06-17 02:45:30,641][INFO] k: item2vec, type: mean
[2024-06-17 02:45:30,642][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1330597, 99)
[2024-06-17 02:45:31,896][INFO] k: item2vec, type: mean
[2024-06-17 02:45:31,897][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1330597, 100)
[2024-06-17 02:45:33,208][INFO] k: item2vec, type: scroll_mean
[2024-06-17 02:45:33,209][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1330597, 101)
[2024-06-17 02:45:34,563][INFO] k: item2vec, type: scroll_mean
[2024-06-17 02:45:34,565][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1330597, 102)
[2024-06-17 02:45:36,208][INFO] k: item2vec, ty

[2024-06-17 02:54:00,827][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 02:54:00,828][INFO] clicked_inview_common_cossim/elapsed_seconds: 12/df shape: (1327331, 82)
[2024-06-17 02:54:15,423][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 02:54:15,424][INFO] category_click_cossim/elapsed_seconds: 14/df shape: (1327331, 83)
[2024-06-17 02:54:26,082][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 02:54:26,083][INFO] clicked_inview_common_cossim/elapsed_seconds: 10/df shape: (1327331, 84)
[2024-06-17 02:54:41,707][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 02:54:41,708][INFO] category_click_cossim/elapsed_seconds: 15/df shape: (1327331, 85)
[2024-06-17 02:55:02,695][INFO] k: multilingual-e5-large-instruct
[2024-06-17 02:55:02,698][INFO] inview_my_cossim/elapsed_seconds: 20/df shape: (1327331, 88)
[2024-06-17 02:55:57,282][INFO] k: multilingual-e5-large-instruct
[2024-06-17 02:55:57,28

[2024-06-17 03:03:14,029][INFO] df shape: (1335014, 15)
[2024-06-17 03:03:15,577][INFO] clicked_history_count/elapsed_seconds: 1/df shape: (1335014, 18)
[2024-06-17 03:03:17,983][INFO] inview_cooccur/elapsed_seconds: 2/df shape: (1335014, 28)
[2024-06-17 03:03:18,215][INFO] feat_article_simple/elapsed_seconds: 0/df shape: (1335014, 43)
[2024-06-17 03:03:18,328][INFO] user_last_impression_publish_time_diff/elapsed_seconds: 0/df shape: (1335014, 44)
[2024-06-17 03:03:22,133][INFO] article_pop/elapsed_seconds: 3/df shape: (1335014, 72)
[2024-06-17 03:03:27,191][INFO] add_past_category_ratios/elapsed_seconds: 5/df shape: (1335014, 75)
[2024-06-17 03:03:42,290][INFO] add_next_impression_id_cossim/elapsed_seconds: 15/df shape: (1335014, 78)
[2024-06-17 03:03:55,591][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 03:03:55,593][INFO] clicked_inview_common_cossim/elapsed_seconds: 13/df shape: (1335014, 80)
[2024-06-17 03:04:03,675][INFO] k: multilingual-e5-large-instruct, type:

[2024-06-17 03:12:02,460][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1339341, 105)
[2024-06-17 03:12:04,025][INFO] k: item2vec, type: read_time_mean
[2024-06-17 03:12:04,026][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1339341, 106)
[2024-06-17 03:12:22,158][INFO] k: multilingual-e5-large-instruct, cat_col: device_type
[2024-06-17 03:12:22,159][INFO] add_cossim_by_category/elapsed_seconds: 18/df shape: (1339341, 107)
[2024-06-17 03:12:38,705][INFO] k: multilingual-e5-large-instruct, cat_col: is_subscriber
[2024-06-17 03:12:38,706][INFO] add_cossim_by_category/elapsed_seconds: 16/df shape: (1339341, 108)
[2024-06-17 03:12:56,916][INFO] k: multilingual-e5-large-instruct, cat_col: is_sso_user
[2024-06-17 03:12:56,917][INFO] add_cossim_by_category/elapsed_seconds: 18/df shape: (1339341, 109)
[2024-06-17 03:13:01,082][INFO] calculate_inview_counts/elapsed_seconds: 4/df shape: (1339341, 113)
[2024-06-17 03:13:05,960][INFO] statistic/elapsed_seconds: 4/df sh

[2024-06-17 03:22:30,535][INFO] k: item2vec, type: mean
[2024-06-17 03:22:30,536][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1348447, 99)
[2024-06-17 03:22:31,847][INFO] k: item2vec, type: mean
[2024-06-17 03:22:31,848][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1348447, 100)
[2024-06-17 03:22:33,333][INFO] k: item2vec, type: scroll_mean
[2024-06-17 03:22:33,334][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1348447, 101)
[2024-06-17 03:22:34,599][INFO] k: item2vec, type: scroll_mean
[2024-06-17 03:22:34,600][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1348447, 102)
[2024-06-17 03:22:36,128][INFO] k: item2vec, type: scroll_mean
[2024-06-17 03:22:36,129][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1348447, 103)
[2024-06-17 03:22:37,635][INFO] k: item2vec, type: read_time_mean
[2024-06-17 03:22:37,636][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1348447, 104)
[2024-06-17 03:22:38,932][IN

[2024-06-17 03:31:02,109][INFO] category_click_cossim/elapsed_seconds: 14/df shape: (1347250, 83)
[2024-06-17 03:31:15,140][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 03:31:15,141][INFO] clicked_inview_common_cossim/elapsed_seconds: 13/df shape: (1347250, 84)
[2024-06-17 03:31:29,292][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 03:31:29,293][INFO] category_click_cossim/elapsed_seconds: 14/df shape: (1347250, 85)
[2024-06-17 03:31:54,188][INFO] k: multilingual-e5-large-instruct
[2024-06-17 03:31:54,190][INFO] inview_my_cossim/elapsed_seconds: 24/df shape: (1347250, 88)
[2024-06-17 03:33:02,955][INFO] k: multilingual-e5-large-instruct
[2024-06-17 03:33:02,957][INFO] add_cossim_individual_features/elapsed_seconds: 68/df shape: (1347250, 96)
[2024-06-17 03:33:04,461][INFO] k: item2vec, type: mean
[2024-06-17 03:33:04,462][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1347250, 97)
[2024-06-17 03:33:05,706][INFO] k: 

[2024-06-17 03:40:10,941][INFO] feat_article_simple/elapsed_seconds: 0/df shape: (1333602, 43)
[2024-06-17 03:40:11,068][INFO] user_last_impression_publish_time_diff/elapsed_seconds: 0/df shape: (1333602, 44)
[2024-06-17 03:40:15,652][INFO] article_pop/elapsed_seconds: 4/df shape: (1333602, 72)
[2024-06-17 03:40:21,890][INFO] add_past_category_ratios/elapsed_seconds: 6/df shape: (1333602, 75)
[2024-06-17 03:40:42,641][INFO] add_next_impression_id_cossim/elapsed_seconds: 20/df shape: (1333602, 78)
[2024-06-17 03:40:58,824][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 03:40:58,825][INFO] clicked_inview_common_cossim/elapsed_seconds: 16/df shape: (1333602, 80)
[2024-06-17 03:41:07,139][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 03:41:07,140][INFO] category_click_cossim/elapsed_seconds: 8/df shape: (1333602, 81)
[2024-06-17 03:41:17,842][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 03:41:17,843][INFO] clicked_inview_common_c

[2024-06-17 03:49:25,858][INFO] k: multilingual-e5-large-instruct, cat_col: device_type
[2024-06-17 03:49:25,860][INFO] add_cossim_by_category/elapsed_seconds: 19/df shape: (1350370, 107)
[2024-06-17 03:49:44,592][INFO] k: multilingual-e5-large-instruct, cat_col: is_subscriber
[2024-06-17 03:49:44,594][INFO] add_cossim_by_category/elapsed_seconds: 18/df shape: (1350370, 108)
[2024-06-17 03:50:04,925][INFO] k: multilingual-e5-large-instruct, cat_col: is_sso_user
[2024-06-17 03:50:04,927][INFO] add_cossim_by_category/elapsed_seconds: 20/df shape: (1350370, 109)
[2024-06-17 03:50:09,712][INFO] calculate_inview_counts/elapsed_seconds: 4/df shape: (1350370, 113)
[2024-06-17 03:50:15,415][INFO] statistic/elapsed_seconds: 5/df shape: (1350370, 404)
[2024-06-17 03:50:50,361][INFO] Starting Chunk 39
[2024-06-17 03:50:51,968][INFO] start:feature_engineering
[2024-06-17 03:50:51,969][INFO] df shape: (1326431, 15)
[2024-06-17 03:50:53,415][INFO] clicked_history_count/elapsed_seconds: 1/df shape: (

[2024-06-17 03:59:30,215][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1383360, 100)
[2024-06-17 03:59:31,515][INFO] k: item2vec, type: scroll_mean
[2024-06-17 03:59:31,516][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1383360, 101)
[2024-06-17 03:59:32,739][INFO] k: item2vec, type: scroll_mean
[2024-06-17 03:59:32,740][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1383360, 102)
[2024-06-17 03:59:34,233][INFO] k: item2vec, type: scroll_mean
[2024-06-17 03:59:34,234][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1383360, 103)
[2024-06-17 03:59:35,594][INFO] k: item2vec, type: read_time_mean
[2024-06-17 03:59:35,596][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1383360, 104)
[2024-06-17 03:59:37,055][INFO] k: item2vec, type: read_time_mean
[2024-06-17 03:59:37,056][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1383360, 105)
[2024-06-17 03:59:38,573][INFO] k: item2vec, type: read_time_mean
[2024-0

[2024-06-17 04:08:29,244][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 04:08:29,245][INFO] category_click_cossim/elapsed_seconds: 11/df shape: (1326304, 85)
[2024-06-17 04:08:49,916][INFO] k: multilingual-e5-large-instruct
[2024-06-17 04:08:49,918][INFO] inview_my_cossim/elapsed_seconds: 20/df shape: (1326304, 88)
[2024-06-17 04:09:56,474][INFO] k: multilingual-e5-large-instruct
[2024-06-17 04:09:56,475][INFO] add_cossim_individual_features/elapsed_seconds: 66/df shape: (1326304, 96)
[2024-06-17 04:09:58,054][INFO] k: item2vec, type: mean
[2024-06-17 04:09:58,055][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1326304, 97)
[2024-06-17 04:09:59,536][INFO] k: item2vec, type: mean
[2024-06-17 04:09:59,537][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1326304, 98)
[2024-06-17 04:10:00,811][INFO] k: item2vec, type: mean
[2024-06-17 04:10:00,812][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1326304, 99)
[2024-06-17 04

[2024-06-17 04:17:23,986][INFO] add_past_category_ratios/elapsed_seconds: 5/df shape: (1327758, 75)
[2024-06-17 04:17:42,191][INFO] add_next_impression_id_cossim/elapsed_seconds: 18/df shape: (1327758, 78)
[2024-06-17 04:17:59,167][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 04:17:59,168][INFO] clicked_inview_common_cossim/elapsed_seconds: 16/df shape: (1327758, 80)
[2024-06-17 04:18:06,777][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 04:18:06,779][INFO] category_click_cossim/elapsed_seconds: 7/df shape: (1327758, 81)
[2024-06-17 04:18:20,030][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 04:18:20,031][INFO] clicked_inview_common_cossim/elapsed_seconds: 13/df shape: (1327758, 82)
[2024-06-17 04:18:34,740][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 04:18:34,741][INFO] category_click_cossim/elapsed_seconds: 14/df shape: (1327758, 83)
[2024-06-17 04:18:46,667][INFO] k: multilingual-e5-large-instru

[2024-06-17 04:26:33,417][INFO] add_cossim_by_category/elapsed_seconds: 18/df shape: (1334398, 108)
[2024-06-17 04:26:52,487][INFO] k: multilingual-e5-large-instruct, cat_col: is_sso_user
[2024-06-17 04:26:52,488][INFO] add_cossim_by_category/elapsed_seconds: 19/df shape: (1334398, 109)
[2024-06-17 04:26:56,615][INFO] calculate_inview_counts/elapsed_seconds: 4/df shape: (1334398, 113)
[2024-06-17 04:27:00,925][INFO] statistic/elapsed_seconds: 4/df shape: (1334398, 404)
[2024-06-17 04:27:37,869][INFO] Starting Chunk 46
[2024-06-17 04:27:39,826][INFO] start:feature_engineering
[2024-06-17 04:27:39,827][INFO] df shape: (1321767, 15)
[2024-06-17 04:27:41,551][INFO] clicked_history_count/elapsed_seconds: 1/df shape: (1321767, 18)
[2024-06-17 04:27:44,114][INFO] inview_cooccur/elapsed_seconds: 2/df shape: (1321767, 28)
[2024-06-17 04:27:44,412][INFO] feat_article_simple/elapsed_seconds: 0/df shape: (1321767, 43)
[2024-06-17 04:27:44,525][INFO] user_last_impression_publish_time_diff/elapsed_s

[2024-06-17 04:36:15,518][INFO] k: item2vec, type: scroll_mean
[2024-06-17 04:36:15,519][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1311620, 102)
[2024-06-17 04:36:17,108][INFO] k: item2vec, type: scroll_mean
[2024-06-17 04:36:17,109][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1311620, 103)
[2024-06-17 04:36:18,617][INFO] k: item2vec, type: read_time_mean
[2024-06-17 04:36:18,618][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1311620, 104)
[2024-06-17 04:36:19,972][INFO] k: item2vec, type: read_time_mean
[2024-06-17 04:36:19,973][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1311620, 105)
[2024-06-17 04:36:21,456][INFO] k: item2vec, type: read_time_mean
[2024-06-17 04:36:21,457][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1311620, 106)
[2024-06-17 04:36:41,155][INFO] k: multilingual-e5-large-instruct, cat_col: device_type
[2024-06-17 04:36:41,156][INFO] add_cossim_by_category/elapsed_seconds: 19/df shape

[2024-06-17 04:45:41,491][INFO] k: multilingual-e5-large-instruct
[2024-06-17 04:45:41,493][INFO] inview_my_cossim/elapsed_seconds: 24/df shape: (1326697, 88)
[2024-06-17 04:46:37,954][INFO] k: multilingual-e5-large-instruct
[2024-06-17 04:46:37,955][INFO] add_cossim_individual_features/elapsed_seconds: 56/df shape: (1326697, 96)
[2024-06-17 04:46:39,584][INFO] k: item2vec, type: mean
[2024-06-17 04:46:39,585][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1326697, 97)
[2024-06-17 04:46:40,914][INFO] k: item2vec, type: mean
[2024-06-17 04:46:40,915][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1326697, 98)
[2024-06-17 04:46:42,303][INFO] k: item2vec, type: mean
[2024-06-17 04:46:42,304][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1326697, 99)
[2024-06-17 04:46:43,530][INFO] k: item2vec, type: mean
[2024-06-17 04:46:43,531][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1326697, 100)
[2024-06-17 04:46:44,793][INFO] k: item2vec

[2024-06-17 04:54:35,421][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 04:54:35,423][INFO] clicked_inview_common_cossim/elapsed_seconds: 15/df shape: (1334612, 80)
[2024-06-17 04:54:44,342][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 04:54:44,343][INFO] category_click_cossim/elapsed_seconds: 8/df shape: (1334612, 81)
[2024-06-17 04:54:57,314][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 04:54:57,316][INFO] clicked_inview_common_cossim/elapsed_seconds: 12/df shape: (1334612, 82)
[2024-06-17 04:55:14,071][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 04:55:14,072][INFO] category_click_cossim/elapsed_seconds: 16/df shape: (1334612, 83)
[2024-06-17 04:55:26,147][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 04:55:26,148][INFO] clicked_inview_common_cossim/elapsed_seconds: 12/df shape: (1334612, 84)
[2024-06-17 04:55:39,124][INFO] k: multilingual-e5-large-instruct, type: rea

[2024-06-17 05:03:41,992][INFO] calculate_inview_counts/elapsed_seconds: 4/df shape: (1340559, 113)
[2024-06-17 05:03:47,576][INFO] statistic/elapsed_seconds: 5/df shape: (1340559, 404)
[2024-06-17 05:04:24,222][INFO] Starting Chunk 53
[2024-06-17 05:04:25,809][INFO] start:feature_engineering
[2024-06-17 05:04:25,810][INFO] df shape: (1341160, 15)
[2024-06-17 05:04:27,271][INFO] clicked_history_count/elapsed_seconds: 1/df shape: (1341160, 18)
[2024-06-17 05:04:29,808][INFO] inview_cooccur/elapsed_seconds: 2/df shape: (1341160, 28)
[2024-06-17 05:04:30,052][INFO] feat_article_simple/elapsed_seconds: 0/df shape: (1341160, 43)
[2024-06-17 05:04:30,170][INFO] user_last_impression_publish_time_diff/elapsed_seconds: 0/df shape: (1341160, 44)
[2024-06-17 05:04:34,107][INFO] article_pop/elapsed_seconds: 3/df shape: (1341160, 72)
[2024-06-17 05:04:40,374][INFO] add_past_category_ratios/elapsed_seconds: 6/df shape: (1341160, 75)
[2024-06-17 05:04:57,676][INFO] add_next_impression_id_cossim/elaps

[2024-06-17 05:13:11,943][INFO] k: item2vec, type: read_time_mean
[2024-06-17 05:13:11,945][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1348796, 104)
[2024-06-17 05:13:13,678][INFO] k: item2vec, type: read_time_mean
[2024-06-17 05:13:13,679][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1348796, 105)
[2024-06-17 05:13:15,317][INFO] k: item2vec, type: read_time_mean
[2024-06-17 05:13:15,319][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1348796, 106)
[2024-06-17 05:13:33,317][INFO] k: multilingual-e5-large-instruct, cat_col: device_type
[2024-06-17 05:13:33,318][INFO] add_cossim_by_category/elapsed_seconds: 17/df shape: (1348796, 107)
[2024-06-17 05:13:50,846][INFO] k: multilingual-e5-large-instruct, cat_col: is_subscriber
[2024-06-17 05:13:50,847][INFO] add_cossim_by_category/elapsed_seconds: 17/df shape: (1348796, 108)
[2024-06-17 05:14:07,988][INFO] k: multilingual-e5-large-instruct, cat_col: is_sso_user
[2024-06-17 05:14:07,989][INFO] a

[2024-06-17 05:23:15,811][INFO] k: item2vec, type: mean
[2024-06-17 05:23:15,812][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1349424, 97)
[2024-06-17 05:23:17,086][INFO] k: item2vec, type: mean
[2024-06-17 05:23:17,087][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1349424, 98)
[2024-06-17 05:23:18,619][INFO] k: item2vec, type: mean
[2024-06-17 05:23:18,620][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1349424, 99)
[2024-06-17 05:23:20,127][INFO] k: item2vec, type: mean
[2024-06-17 05:23:20,128][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1349424, 100)
[2024-06-17 05:23:21,565][INFO] k: item2vec, type: scroll_mean
[2024-06-17 05:23:21,566][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1349424, 101)
[2024-06-17 05:23:23,224][INFO] k: item2vec, type: scroll_mean
[2024-06-17 05:23:23,225][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1349424, 102)
[2024-06-17 05:23:25,019][INFO] k: item2vec, ty

[2024-06-17 05:31:24,823][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 05:31:24,824][INFO] clicked_inview_common_cossim/elapsed_seconds: 11/df shape: (1333491, 82)
[2024-06-17 05:31:39,360][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 05:31:39,361][INFO] category_click_cossim/elapsed_seconds: 14/df shape: (1333491, 83)
[2024-06-17 05:31:52,760][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 05:31:52,762][INFO] clicked_inview_common_cossim/elapsed_seconds: 13/df shape: (1333491, 84)
[2024-06-17 05:32:07,616][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 05:32:07,618][INFO] category_click_cossim/elapsed_seconds: 14/df shape: (1333491, 85)
[2024-06-17 05:32:31,495][INFO] k: multilingual-e5-large-instruct
[2024-06-17 05:32:31,497][INFO] inview_my_cossim/elapsed_seconds: 23/df shape: (1333491, 88)
[2024-06-17 05:33:33,432][INFO] k: multilingual-e5-large-instruct
[2024-06-17 05:33:33,43

[2024-06-17 05:40:40,264][INFO] df shape: (1348130, 15)
[2024-06-17 05:40:41,834][INFO] clicked_history_count/elapsed_seconds: 1/df shape: (1348130, 18)
[2024-06-17 05:40:44,199][INFO] inview_cooccur/elapsed_seconds: 2/df shape: (1348130, 28)
[2024-06-17 05:40:44,460][INFO] feat_article_simple/elapsed_seconds: 0/df shape: (1348130, 43)
[2024-06-17 05:40:44,572][INFO] user_last_impression_publish_time_diff/elapsed_seconds: 0/df shape: (1348130, 44)
[2024-06-17 05:40:48,442][INFO] article_pop/elapsed_seconds: 3/df shape: (1348130, 72)
[2024-06-17 05:40:54,981][INFO] add_past_category_ratios/elapsed_seconds: 6/df shape: (1348130, 75)
[2024-06-17 05:41:12,106][INFO] add_next_impression_id_cossim/elapsed_seconds: 17/df shape: (1348130, 78)
[2024-06-17 05:41:27,367][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 05:41:27,368][INFO] clicked_inview_common_cossim/elapsed_seconds: 15/df shape: (1348130, 80)
[2024-06-17 05:41:37,007][INFO] k: multilingual-e5-large-instruct, type:

[2024-06-17 05:49:35,321][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1332412, 105)
[2024-06-17 05:49:36,828][INFO] k: item2vec, type: read_time_mean
[2024-06-17 05:49:36,829][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1332412, 106)
[2024-06-17 05:49:57,621][INFO] k: multilingual-e5-large-instruct, cat_col: device_type
[2024-06-17 05:49:57,622][INFO] add_cossim_by_category/elapsed_seconds: 20/df shape: (1332412, 107)
[2024-06-17 05:50:16,436][INFO] k: multilingual-e5-large-instruct, cat_col: is_subscriber
[2024-06-17 05:50:16,438][INFO] add_cossim_by_category/elapsed_seconds: 18/df shape: (1332412, 108)
[2024-06-17 05:50:36,885][INFO] k: multilingual-e5-large-instruct, cat_col: is_sso_user
[2024-06-17 05:50:36,886][INFO] add_cossim_by_category/elapsed_seconds: 20/df shape: (1332412, 109)
[2024-06-17 05:50:41,046][INFO] calculate_inview_counts/elapsed_seconds: 4/df shape: (1332412, 113)
[2024-06-17 05:50:46,208][INFO] statistic/elapsed_seconds: 5/df sh

[2024-06-17 06:00:05,714][INFO] k: item2vec, type: mean
[2024-06-17 06:00:05,717][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1336180, 99)
[2024-06-17 06:00:07,362][INFO] k: item2vec, type: mean
[2024-06-17 06:00:07,363][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1336180, 100)
[2024-06-17 06:00:08,648][INFO] k: item2vec, type: scroll_mean
[2024-06-17 06:00:08,649][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1336180, 101)
[2024-06-17 06:00:10,018][INFO] k: item2vec, type: scroll_mean
[2024-06-17 06:00:10,020][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1336180, 102)
[2024-06-17 06:00:11,735][INFO] k: item2vec, type: scroll_mean
[2024-06-17 06:00:11,737][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1336180, 103)
[2024-06-17 06:00:13,062][INFO] k: item2vec, type: read_time_mean
[2024-06-17 06:00:13,064][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1336180, 104)
[2024-06-17 06:00:14,349][IN

[2024-06-17 06:08:34,584][INFO] category_click_cossim/elapsed_seconds: 14/df shape: (1377740, 83)
[2024-06-17 06:08:46,347][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 06:08:46,348][INFO] clicked_inview_common_cossim/elapsed_seconds: 11/df shape: (1377740, 84)
[2024-06-17 06:08:57,481][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 06:08:57,482][INFO] category_click_cossim/elapsed_seconds: 11/df shape: (1377740, 85)
[2024-06-17 06:09:19,701][INFO] k: multilingual-e5-large-instruct
[2024-06-17 06:09:19,702][INFO] inview_my_cossim/elapsed_seconds: 22/df shape: (1377740, 88)
[2024-06-17 06:10:33,688][INFO] k: multilingual-e5-large-instruct
[2024-06-17 06:10:33,692][INFO] add_cossim_individual_features/elapsed_seconds: 73/df shape: (1377740, 96)
[2024-06-17 06:10:35,228][INFO] k: item2vec, type: mean
[2024-06-17 06:10:35,229][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1377740, 97)
[2024-06-17 06:10:36,631][INFO] k: 

[2024-06-17 06:17:53,882][INFO] feat_article_simple/elapsed_seconds: 0/df shape: (1352955, 43)
[2024-06-17 06:17:54,023][INFO] user_last_impression_publish_time_diff/elapsed_seconds: 0/df shape: (1352955, 44)
[2024-06-17 06:17:58,688][INFO] article_pop/elapsed_seconds: 4/df shape: (1352955, 72)
[2024-06-17 06:18:03,636][INFO] add_past_category_ratios/elapsed_seconds: 4/df shape: (1352955, 75)
[2024-06-17 06:18:20,040][INFO] add_next_impression_id_cossim/elapsed_seconds: 16/df shape: (1352955, 78)
[2024-06-17 06:18:36,514][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 06:18:36,515][INFO] clicked_inview_common_cossim/elapsed_seconds: 16/df shape: (1352955, 80)
[2024-06-17 06:18:44,190][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 06:18:44,191][INFO] category_click_cossim/elapsed_seconds: 7/df shape: (1352955, 81)
[2024-06-17 06:18:59,014][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 06:18:59,015][INFO] clicked_inview_common_c

[2024-06-17 06:27:01,487][INFO] k: multilingual-e5-large-instruct, cat_col: device_type
[2024-06-17 06:27:01,489][INFO] add_cossim_by_category/elapsed_seconds: 18/df shape: (1334736, 107)
[2024-06-17 06:27:18,626][INFO] k: multilingual-e5-large-instruct, cat_col: is_subscriber
[2024-06-17 06:27:18,629][INFO] add_cossim_by_category/elapsed_seconds: 17/df shape: (1334736, 108)
[2024-06-17 06:27:35,354][INFO] k: multilingual-e5-large-instruct, cat_col: is_sso_user
[2024-06-17 06:27:35,355][INFO] add_cossim_by_category/elapsed_seconds: 16/df shape: (1334736, 109)
[2024-06-17 06:27:39,420][INFO] calculate_inview_counts/elapsed_seconds: 4/df shape: (1334736, 113)
[2024-06-17 06:27:43,658][INFO] statistic/elapsed_seconds: 4/df shape: (1334736, 404)
[2024-06-17 06:28:16,484][INFO] Starting Chunk 69
[2024-06-17 06:28:18,253][INFO] start:feature_engineering
[2024-06-17 06:28:18,254][INFO] df shape: (1343772, 15)
[2024-06-17 06:28:20,252][INFO] clicked_history_count/elapsed_seconds: 1/df shape: (

[2024-06-17 06:36:53,754][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1347143, 100)
[2024-06-17 06:36:55,189][INFO] k: item2vec, type: scroll_mean
[2024-06-17 06:36:55,190][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1347143, 101)
[2024-06-17 06:36:56,489][INFO] k: item2vec, type: scroll_mean
[2024-06-17 06:36:56,490][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1347143, 102)
[2024-06-17 06:36:58,025][INFO] k: item2vec, type: scroll_mean
[2024-06-17 06:36:58,027][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1347143, 103)
[2024-06-17 06:36:59,580][INFO] k: item2vec, type: read_time_mean
[2024-06-17 06:36:59,581][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1347143, 104)
[2024-06-17 06:37:00,978][INFO] k: item2vec, type: read_time_mean
[2024-06-17 06:37:00,979][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1347143, 105)
[2024-06-17 06:37:02,608][INFO] k: item2vec, type: read_time_mean
[2024-0

[2024-06-17 06:45:52,984][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 06:45:52,985][INFO] category_click_cossim/elapsed_seconds: 15/df shape: (1350357, 85)
[2024-06-17 06:46:15,537][INFO] k: multilingual-e5-large-instruct
[2024-06-17 06:46:15,538][INFO] inview_my_cossim/elapsed_seconds: 22/df shape: (1350357, 88)
[2024-06-17 06:47:18,387][INFO] k: multilingual-e5-large-instruct
[2024-06-17 06:47:18,389][INFO] add_cossim_individual_features/elapsed_seconds: 62/df shape: (1350357, 96)
[2024-06-17 06:47:19,902][INFO] k: item2vec, type: mean
[2024-06-17 06:47:19,903][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1350357, 97)
[2024-06-17 06:47:21,692][INFO] k: item2vec, type: mean
[2024-06-17 06:47:21,693][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1350357, 98)
[2024-06-17 06:47:23,159][INFO] k: item2vec, type: mean
[2024-06-17 06:47:23,160][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1350357, 99)
[2024-06-17 06

[2024-06-17 06:54:51,901][INFO] add_past_category_ratios/elapsed_seconds: 5/df shape: (1327321, 75)
[2024-06-17 06:55:07,757][INFO] add_next_impression_id_cossim/elapsed_seconds: 15/df shape: (1327321, 78)
[2024-06-17 06:55:24,339][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 06:55:24,340][INFO] clicked_inview_common_cossim/elapsed_seconds: 16/df shape: (1327321, 80)
[2024-06-17 06:55:32,286][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 06:55:32,288][INFO] category_click_cossim/elapsed_seconds: 7/df shape: (1327321, 81)
[2024-06-17 06:55:45,980][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 06:55:45,982][INFO] clicked_inview_common_cossim/elapsed_seconds: 13/df shape: (1327321, 82)
[2024-06-17 06:56:00,501][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 06:56:00,502][INFO] category_click_cossim/elapsed_seconds: 14/df shape: (1327321, 83)
[2024-06-17 06:56:13,638][INFO] k: multilingual-e5-large-instru

[2024-06-17 07:04:01,972][INFO] add_cossim_by_category/elapsed_seconds: 19/df shape: (1354630, 108)
[2024-06-17 07:04:21,044][INFO] k: multilingual-e5-large-instruct, cat_col: is_sso_user
[2024-06-17 07:04:21,045][INFO] add_cossim_by_category/elapsed_seconds: 19/df shape: (1354630, 109)
[2024-06-17 07:04:25,441][INFO] calculate_inview_counts/elapsed_seconds: 4/df shape: (1354630, 113)
[2024-06-17 07:04:30,088][INFO] statistic/elapsed_seconds: 4/df shape: (1354630, 404)
[2024-06-17 07:05:09,083][INFO] Starting Chunk 76
[2024-06-17 07:05:11,060][INFO] start:feature_engineering
[2024-06-17 07:05:11,061][INFO] df shape: (1314557, 15)
[2024-06-17 07:05:13,150][INFO] clicked_history_count/elapsed_seconds: 2/df shape: (1314557, 18)
[2024-06-17 07:05:15,584][INFO] inview_cooccur/elapsed_seconds: 2/df shape: (1314557, 28)
[2024-06-17 07:05:15,807][INFO] feat_article_simple/elapsed_seconds: 0/df shape: (1314557, 43)
[2024-06-17 07:05:15,916][INFO] user_last_impression_publish_time_diff/elapsed_s

[2024-06-17 07:13:42,882][INFO] k: item2vec, type: scroll_mean
[2024-06-17 07:13:42,884][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1338787, 102)
[2024-06-17 07:13:44,579][INFO] k: item2vec, type: scroll_mean
[2024-06-17 07:13:44,580][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1338787, 103)
[2024-06-17 07:13:46,062][INFO] k: item2vec, type: read_time_mean
[2024-06-17 07:13:46,064][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1338787, 104)
[2024-06-17 07:13:47,336][INFO] k: item2vec, type: read_time_mean
[2024-06-17 07:13:47,337][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1338787, 105)
[2024-06-17 07:13:49,191][INFO] k: item2vec, type: read_time_mean
[2024-06-17 07:13:49,193][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1338787, 106)
[2024-06-17 07:14:09,433][INFO] k: multilingual-e5-large-instruct, cat_col: device_type
[2024-06-17 07:14:09,435][INFO] add_cossim_by_category/elapsed_seconds: 20/df shape

[2024-06-17 07:23:25,086][INFO] k: multilingual-e5-large-instruct
[2024-06-17 07:23:25,088][INFO] inview_my_cossim/elapsed_seconds: 26/df shape: (1370478, 88)
[2024-06-17 07:24:30,070][INFO] k: multilingual-e5-large-instruct
[2024-06-17 07:24:30,071][INFO] add_cossim_individual_features/elapsed_seconds: 64/df shape: (1370478, 96)
[2024-06-17 07:24:31,556][INFO] k: item2vec, type: mean
[2024-06-17 07:24:31,557][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1370478, 97)
[2024-06-17 07:24:32,905][INFO] k: item2vec, type: mean
[2024-06-17 07:24:32,906][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1370478, 98)
[2024-06-17 07:24:34,331][INFO] k: item2vec, type: mean
[2024-06-17 07:24:34,332][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1370478, 99)
[2024-06-17 07:24:35,628][INFO] k: item2vec, type: mean
[2024-06-17 07:24:35,629][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1370478, 100)
[2024-06-17 07:24:37,024][INFO] k: item2vec

[2024-06-17 07:32:43,988][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 07:32:43,990][INFO] clicked_inview_common_cossim/elapsed_seconds: 16/df shape: (1306010, 80)
[2024-06-17 07:32:54,111][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 07:32:54,112][INFO] category_click_cossim/elapsed_seconds: 10/df shape: (1306010, 81)
[2024-06-17 07:33:06,568][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 07:33:06,569][INFO] clicked_inview_common_cossim/elapsed_seconds: 12/df shape: (1306010, 82)
[2024-06-17 07:33:19,555][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 07:33:19,556][INFO] category_click_cossim/elapsed_seconds: 12/df shape: (1306010, 83)
[2024-06-17 07:33:30,937][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 07:33:30,939][INFO] clicked_inview_common_cossim/elapsed_seconds: 11/df shape: (1306010, 84)
[2024-06-17 07:33:45,080][INFO] k: multilingual-e5-large-instruct, type: re

[2024-06-17 07:41:57,846][INFO] calculate_inview_counts/elapsed_seconds: 3/df shape: (1345315, 113)
[2024-06-17 07:42:02,239][INFO] statistic/elapsed_seconds: 4/df shape: (1345315, 404)
[2024-06-17 07:42:38,626][INFO] Starting Chunk 83
[2024-06-17 07:42:40,023][INFO] start:feature_engineering
[2024-06-17 07:42:40,024][INFO] df shape: (1353703, 15)
[2024-06-17 07:42:41,558][INFO] clicked_history_count/elapsed_seconds: 1/df shape: (1353703, 18)
[2024-06-17 07:42:44,671][INFO] inview_cooccur/elapsed_seconds: 3/df shape: (1353703, 28)
[2024-06-17 07:42:44,900][INFO] feat_article_simple/elapsed_seconds: 0/df shape: (1353703, 43)
[2024-06-17 07:42:45,046][INFO] user_last_impression_publish_time_diff/elapsed_seconds: 0/df shape: (1353703, 44)
[2024-06-17 07:42:49,684][INFO] article_pop/elapsed_seconds: 4/df shape: (1353703, 72)
[2024-06-17 07:42:55,000][INFO] add_past_category_ratios/elapsed_seconds: 5/df shape: (1353703, 75)
[2024-06-17 07:43:11,482][INFO] add_next_impression_id_cossim/elaps

[2024-06-17 07:51:41,648][INFO] k: item2vec, type: read_time_mean
[2024-06-17 07:51:41,650][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1357157, 104)
[2024-06-17 07:51:43,287][INFO] k: item2vec, type: read_time_mean
[2024-06-17 07:51:43,289][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1357157, 105)
[2024-06-17 07:51:45,242][INFO] k: item2vec, type: read_time_mean
[2024-06-17 07:51:45,244][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1357157, 106)
[2024-06-17 07:52:04,959][INFO] k: multilingual-e5-large-instruct, cat_col: device_type
[2024-06-17 07:52:04,960][INFO] add_cossim_by_category/elapsed_seconds: 19/df shape: (1357157, 107)
[2024-06-17 07:52:22,628][INFO] k: multilingual-e5-large-instruct, cat_col: is_subscriber
[2024-06-17 07:52:22,630][INFO] add_cossim_by_category/elapsed_seconds: 17/df shape: (1357157, 108)
[2024-06-17 07:52:39,812][INFO] k: multilingual-e5-large-instruct, cat_col: is_sso_user
[2024-06-17 07:52:39,813][INFO] a

[2024-06-17 08:01:58,856][INFO] k: item2vec, type: mean
[2024-06-17 08:01:58,858][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1347570, 97)
[2024-06-17 08:02:00,221][INFO] k: item2vec, type: mean
[2024-06-17 08:02:00,223][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1347570, 98)
[2024-06-17 08:02:01,624][INFO] k: item2vec, type: mean
[2024-06-17 08:02:01,625][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1347570, 99)
[2024-06-17 08:02:03,002][INFO] k: item2vec, type: mean
[2024-06-17 08:02:03,004][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1347570, 100)
[2024-06-17 08:02:04,440][INFO] k: item2vec, type: scroll_mean
[2024-06-17 08:02:04,442][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1347570, 101)
[2024-06-17 08:02:05,949][INFO] k: item2vec, type: scroll_mean
[2024-06-17 08:02:05,950][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1347570, 102)
[2024-06-17 08:02:07,496][INFO] k: item2vec, ty

[2024-06-17 08:10:27,250][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 08:10:27,253][INFO] clicked_inview_common_cossim/elapsed_seconds: 12/df shape: (1344933, 82)
[2024-06-17 08:10:42,116][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 08:10:42,117][INFO] category_click_cossim/elapsed_seconds: 14/df shape: (1344933, 83)
[2024-06-17 08:10:55,739][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 08:10:55,740][INFO] clicked_inview_common_cossim/elapsed_seconds: 13/df shape: (1344933, 84)
[2024-06-17 08:11:09,506][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 08:11:09,508][INFO] category_click_cossim/elapsed_seconds: 13/df shape: (1344933, 85)
[2024-06-17 08:11:32,730][INFO] k: multilingual-e5-large-instruct
[2024-06-17 08:11:32,732][INFO] inview_my_cossim/elapsed_seconds: 23/df shape: (1344933, 88)
[2024-06-17 08:12:49,549][INFO] k: multilingual-e5-large-instruct
[2024-06-17 08:12:49,55

[2024-06-17 08:20:07,984][INFO] df shape: (1342262, 15)
[2024-06-17 08:20:10,167][INFO] clicked_history_count/elapsed_seconds: 2/df shape: (1342262, 18)
[2024-06-17 08:20:12,901][INFO] inview_cooccur/elapsed_seconds: 2/df shape: (1342262, 28)
[2024-06-17 08:20:13,225][INFO] feat_article_simple/elapsed_seconds: 0/df shape: (1342262, 43)
[2024-06-17 08:20:13,342][INFO] user_last_impression_publish_time_diff/elapsed_seconds: 0/df shape: (1342262, 44)
[2024-06-17 08:20:17,185][INFO] article_pop/elapsed_seconds: 3/df shape: (1342262, 72)
[2024-06-17 08:20:21,799][INFO] add_past_category_ratios/elapsed_seconds: 4/df shape: (1342262, 75)
[2024-06-17 08:20:39,626][INFO] add_next_impression_id_cossim/elapsed_seconds: 17/df shape: (1342262, 78)
[2024-06-17 08:20:55,160][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 08:20:55,161][INFO] clicked_inview_common_cossim/elapsed_seconds: 15/df shape: (1342262, 80)
[2024-06-17 08:21:03,754][INFO] k: multilingual-e5-large-instruct, type:

[2024-06-17 08:29:05,775][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1331153, 105)
[2024-06-17 08:29:07,268][INFO] k: item2vec, type: read_time_mean
[2024-06-17 08:29:07,269][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1331153, 106)
[2024-06-17 08:29:29,878][INFO] k: multilingual-e5-large-instruct, cat_col: device_type
[2024-06-17 08:29:29,879][INFO] add_cossim_by_category/elapsed_seconds: 22/df shape: (1331153, 107)
[2024-06-17 08:29:49,203][INFO] k: multilingual-e5-large-instruct, cat_col: is_subscriber
[2024-06-17 08:29:49,205][INFO] add_cossim_by_category/elapsed_seconds: 19/df shape: (1331153, 108)
[2024-06-17 08:30:08,619][INFO] k: multilingual-e5-large-instruct, cat_col: is_sso_user
[2024-06-17 08:30:08,620][INFO] add_cossim_by_category/elapsed_seconds: 19/df shape: (1331153, 109)
[2024-06-17 08:30:12,676][INFO] calculate_inview_counts/elapsed_seconds: 4/df shape: (1331153, 113)
[2024-06-17 08:30:17,669][INFO] statistic/elapsed_seconds: 4/df sh

[2024-06-17 08:39:51,274][INFO] k: item2vec, type: mean
[2024-06-17 08:39:51,276][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1351731, 99)
[2024-06-17 08:39:54,357][INFO] k: item2vec, type: mean
[2024-06-17 08:39:54,359][INFO] clicked_inview_my_cossim/elapsed_seconds: 3/df shape: (1351731, 100)
[2024-06-17 08:39:55,994][INFO] k: item2vec, type: scroll_mean
[2024-06-17 08:39:55,995][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1351731, 101)
[2024-06-17 08:39:57,482][INFO] k: item2vec, type: scroll_mean
[2024-06-17 08:39:57,483][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1351731, 102)
[2024-06-17 08:39:59,776][INFO] k: item2vec, type: scroll_mean
[2024-06-17 08:39:59,782][INFO] clicked_inview_my_cossim/elapsed_seconds: 2/df shape: (1351731, 103)
[2024-06-17 08:40:01,928][INFO] k: item2vec, type: read_time_mean
[2024-06-17 08:40:01,929][INFO] clicked_inview_my_cossim/elapsed_seconds: 2/df shape: (1351731, 104)
[2024-06-17 08:40:04,005][IN

[2024-06-17 08:48:40,687][INFO] category_click_cossim/elapsed_seconds: 16/df shape: (1345734, 83)
[2024-06-17 08:48:55,849][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 08:48:55,854][INFO] clicked_inview_common_cossim/elapsed_seconds: 15/df shape: (1345734, 84)
[2024-06-17 08:49:09,158][INFO] k: multilingual-e5-large-instruct, type: read_time_mean
[2024-06-17 08:49:09,159][INFO] category_click_cossim/elapsed_seconds: 13/df shape: (1345734, 85)
[2024-06-17 08:49:29,510][INFO] k: multilingual-e5-large-instruct
[2024-06-17 08:49:29,511][INFO] inview_my_cossim/elapsed_seconds: 20/df shape: (1345734, 88)
[2024-06-17 08:50:33,098][INFO] k: multilingual-e5-large-instruct
[2024-06-17 08:50:33,100][INFO] add_cossim_individual_features/elapsed_seconds: 63/df shape: (1345734, 96)
[2024-06-17 08:50:35,062][INFO] k: item2vec, type: mean
[2024-06-17 08:50:35,063][INFO] clicked_inview_my_cossim/elapsed_seconds: 1/df shape: (1345734, 97)
[2024-06-17 08:50:36,658][INFO] k: 

[2024-06-17 08:58:13,764][INFO] feat_article_simple/elapsed_seconds: 0/df shape: (1336042, 43)
[2024-06-17 08:58:13,876][INFO] user_last_impression_publish_time_diff/elapsed_seconds: 0/df shape: (1336042, 44)
[2024-06-17 08:58:17,675][INFO] article_pop/elapsed_seconds: 3/df shape: (1336042, 72)
[2024-06-17 08:58:21,898][INFO] add_past_category_ratios/elapsed_seconds: 4/df shape: (1336042, 75)
[2024-06-17 08:58:37,398][INFO] add_next_impression_id_cossim/elapsed_seconds: 15/df shape: (1336042, 78)
[2024-06-17 08:58:52,729][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 08:58:52,730][INFO] clicked_inview_common_cossim/elapsed_seconds: 15/df shape: (1336042, 80)
[2024-06-17 08:59:01,897][INFO] k: multilingual-e5-large-instruct, type: mean
[2024-06-17 08:59:01,899][INFO] category_click_cossim/elapsed_seconds: 9/df shape: (1336042, 81)
[2024-06-17 08:59:14,818][INFO] k: multilingual-e5-large-instruct, type: scroll_mean
[2024-06-17 08:59:14,820][INFO] clicked_inview_common_c

[2024-06-17 09:07:10,679][INFO] k: multilingual-e5-large-instruct, cat_col: device_type
[2024-06-17 09:07:10,681][INFO] add_cossim_by_category/elapsed_seconds: 18/df shape: (1333149, 107)
[2024-06-17 09:07:28,173][INFO] k: multilingual-e5-large-instruct, cat_col: is_subscriber
[2024-06-17 09:07:28,175][INFO] add_cossim_by_category/elapsed_seconds: 17/df shape: (1333149, 108)
[2024-06-17 09:07:46,607][INFO] k: multilingual-e5-large-instruct, cat_col: is_sso_user
[2024-06-17 09:07:46,609][INFO] add_cossim_by_category/elapsed_seconds: 18/df shape: (1333149, 109)
[2024-06-17 09:07:50,426][INFO] calculate_inview_counts/elapsed_seconds: 3/df shape: (1333149, 113)
[2024-06-17 09:07:54,708][INFO] statistic/elapsed_seconds: 4/df shape: (1333149, 404)
[2024-06-17 09:08:32,485][INFO] Starting Chunk 99
[2024-06-17 09:08:34,118][INFO] start:feature_engineering
[2024-06-17 09:08:34,120][INFO] df shape: (1342014, 15)
[2024-06-17 09:08:36,025][INFO] clicked_history_count/elapsed_seconds: 1/df shape: (

# Valid

In [None]:
# Build validation features in user-wise chunks and write each chunk to its own parquet file.
import gc

n_chunks = 100

# Create the output directory up front: if it were missing, the failure would
# otherwise only surface at the first write, after a full chunk of feature
# engineering has already been computed.
val_chunk_dir = Path("./val_large_chunks")
val_chunk_dir.mkdir(parents=True, exist_ok=True)

val_df = get_target_df(val_behaviors)
unique_user_ids = val_df["user_id"].unique().to_numpy()

# NOTE(review): the shuffle is unseeded, so the user -> chunk assignment changes
# on every run; do not mix chunk files produced by different runs.
np.random.shuffle(unique_user_ids)
user_id_splits = np.array_split(unique_user_ids, n_chunks)

for _chunk in range(n_chunks):

    logger.info(f"Starting Chunk {_chunk}")

    # Build the membership set once and reuse it for both filters.
    chunk_user_ids = set(user_id_splits[_chunk])
    df_chunk = val_df.filter(pl.col("user_id").is_in(chunk_user_ids))
    df_history_chunk = val_history.filter(pl.col("user_id").is_in(chunk_user_ids))
    df_chunk = create_feature(df_chunk, df_history_chunk, mode='valid')

    df_chunk.write_parquet(val_chunk_dir / f"val_df_chunk{_chunk}.parquet")

    # Release the finished chunk before the next one is built.
    del df_chunk
    gc.collect()

# Test

In [39]:
# Sanity check: each (impression_id, user_id) pair must appear in exactly one row of test_behaviors.
_impression_keys = ['impression_id','user_id']
assert len(test_behaviors) == len(test_behaviors.groupby(_impression_keys).count())

# Keep the impression/user pairs as they are before the frame is exploded below.
sub_impression = test_behaviors.select(_impression_keys)

# 1) record how many articles were in view for each impression,
# 2) explode to one row per in-view article,
# 3) expose the exploded value under the name "article_id".
test_behaviors = (
    test_behaviors
    .with_columns(
        pl.col("article_ids_inview").apply(lambda inview: len(inview)).alias("count_article_ids_inview"),
    )
    .explode("article_ids_inview")
    .with_columns(pl.col("article_ids_inview").alias("article_id"))
)

# Distinct user ids present in the test behaviors, as a numpy array.
test_user = test_behaviors['user_id'].unique().to_numpy()

In [None]:
# Build test features in user-wise chunks and write each chunk to its own parquet file.
import gc

n = 100

# Create the output directory up front: if it were missing, the failure would
# otherwise only surface at the first write, after a full chunk of feature
# engineering has already been computed.
test_chunk_dir = Path("./test_large_chunks")
test_chunk_dir.mkdir(parents=True, exist_ok=True)

test_user_list = np.array_split(test_user,n)

# Not used in this cell; kept because other cells may still reference it.
sub_df_list = []

for no,user_list in enumerate(test_user_list):
    logger.info(f'no : {no} / num_user: {len(user_list)}')

    # Restrict both the behaviors and the history to this chunk's users.
    _test_df = test_behaviors.filter(pl.col('user_id').is_in(user_list))
    _test_history = test_history.filter(pl.col('user_id').is_in(user_list))

    test_df = create_feature(_test_df,_test_history,mode='test',logging = False)

    test_df.write_parquet(test_chunk_dir / f"test_df_chunk{no}.parquet")

    # Release the finished chunk before the next one is built.
    del test_df
    gc.collect()

[2024-06-17 01:33:02,359][INFO] no : 0 / num_user: 8077
[2024-06-17 01:41:28,538][INFO] no : 1 / num_user: 8077
[2024-06-17 01:49:45,406][INFO] no : 2 / num_user: 8077
[2024-06-17 01:58:17,655][INFO] no : 3 / num_user: 8077
[2024-06-17 02:06:20,325][INFO] no : 4 / num_user: 8077
[2024-06-17 02:14:46,663][INFO] no : 5 / num_user: 8077
[2024-06-17 02:22:56,167][INFO] no : 6 / num_user: 8077
[2024-06-17 02:31:13,071][INFO] no : 7 / num_user: 8077
[2024-06-17 02:39:29,354][INFO] no : 8 / num_user: 8077
[2024-06-17 02:47:37,654][INFO] no : 9 / num_user: 8077
[2024-06-17 02:55:59,123][INFO] no : 10 / num_user: 8077
[2024-06-17 03:03:55,807][INFO] no : 11 / num_user: 8077
[2024-06-17 03:12:33,906][INFO] no : 12 / num_user: 8077
[2024-06-17 03:20:33,464][INFO] no : 13 / num_user: 8077
[2024-06-17 03:28:48,979][INFO] no : 14 / num_user: 8077
[2024-06-17 03:36:46,328][INFO] no : 15 / num_user: 8077
[2024-06-17 03:44:33,830][INFO] no : 16 / num_user: 8077
[2024-06-17 03:52:23,390][INFO] no : 17 /