# Import Libraries

In [1]:
from datetime import datetime, timedelta
import os

import numpy as np
import pandas as pd
from pathlib import Path
import polars as pl

import torch
import torch.nn.functional as F

import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas/polars.
warnings.filterwarnings('ignore')
# Allow pandas to render up to 1000 columns when a frame is displayed.
pd.set_option('display.max_columns', 1000)

# Logging
import logging

# Get the notebook-level logger and let every message through.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Re-running this cell must not stack handlers: each extra handler would emit
# every message one more time and keep another file handle on logs.log open.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

# Shared log format: [timestamp][level] message
formatter = logging.Formatter('[%(asctime)s][%(levelname)s] %(message)s')

# Persist the log to logs.log in the current working directory.
file_handler = logging.FileHandler('logs.log')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Also display the log in the notebook output.
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

# Data Load

In [2]:
# Folder the parquet inputs of this notebook are read from.
TRAIN_DIR = Path('/home/data/train_small')
# Folder reserved for the outputs of this run.
OUTPUT_DIR = Path('/home/code_for_sub/3.ablation/2.embedding/image')

In [3]:
# Training split: impression log and per-user click history.
trn_behaviors, trn_history = (
    pl.read_parquet(TRAIN_DIR/'train'/'behaviors.parquet'),
    pl.read_parquet(TRAIN_DIR/'train'/'history.parquet'),
)

# Validation split: same two files.
val_behaviors, val_history = (
    pl.read_parquet(TRAIN_DIR/'validation'/'behaviors.parquet'),
    pl.read_parquet(TRAIN_DIR/'validation'/'history.parquet'),
)

# Article metadata lives next to the split folders.
articles = pl.read_parquet(TRAIN_DIR/'articles.parquet')

# Extend Validation/Test History Files

In [4]:
# Flatten the list-valued history columns so that every row is one past click.
_trn_history = trn_history.explode(["impression_time_fixed", "scroll_percentage_fixed", "article_id_fixed", "read_time_fixed"])
_val_history = val_history.explode(["impression_time_fixed", "scroll_percentage_fixed", "article_id_fixed", "read_time_fixed"])

# Users present in both splits get their train-period clicks added to their
# validation history.
_shared_user_ids = list(set(val_history["user_id"]) & set(trn_history["user_id"]))
val_history_extended = pl.concat([
    _trn_history.filter(pl.col("user_id").is_in(_shared_user_ids)),
    _val_history
])

# Deduplicate FIRST and sort afterwards: `unique()` does not keep row order
# unless asked to, so sorting before it does not guarantee that the rebuilt
# per-user lists are chronological. `maintain_order=True` additionally keeps
# the users themselves in sorted order in the result.
val_history = (
    val_history_extended
    .unique()
    .sort(["user_id", "impression_time_fixed"])
    .groupby("user_id", maintain_order=True)
    .agg(["impression_time_fixed", "scroll_percentage_fixed", "article_id_fixed", "read_time_fixed"])
)

# val_history.write_parquet(OUTPUT_DIR/'validation'/"history_extended.parquet")

In [5]:
# Re-read every raw input from disk. After this cell `trn_history` and
# `val_history` hold exactly what is stored in the parquet files again.
# NOTE(review): any in-memory modification made to these names by earlier
# cells is discarded here - confirm that this reset is intended.
trn_behaviors, trn_history = (
    pl.read_parquet(TRAIN_DIR/'train'/'behaviors.parquet'),
    pl.read_parquet(TRAIN_DIR/'train'/'history.parquet'),
)

val_behaviors, val_history = (
    pl.read_parquet(TRAIN_DIR/'validation'/'behaviors.parquet'),
    pl.read_parquet(TRAIN_DIR/'validation'/'history.parquet'),
)

articles = pl.read_parquet(TRAIN_DIR/'articles.parquet')

# Long-format views of both histories: one row per past click.
_trn_history, _val_history = (
    trn_history.explode(["impression_time_fixed", "scroll_percentage_fixed", "article_id_fixed", "read_time_fixed"]),
    val_history.explode(["impression_time_fixed", "scroll_percentage_fixed", "article_id_fixed", "read_time_fixed"]),
)

# Create Dataset

In [6]:
def get_target_df(
        df: pl.DataFrame
) -> pl.DataFrame:
    '''
    Create target_df: one row per (impression, in-view article) with the
    binary target column "is_clicked".

    Steps:
      * add "count_article_ids_inview", the length of "article_ids_inview";
      * explode "article_ids_inview" so each candidate gets its own row;
      * set "is_clicked" to 1 when the candidate is contained in
        "article_ids_clicked", otherwise 0;
      * drop "article_ids_clicked", "next_read_time",
        "next_scroll_percentage" and the incoming "article_id" column;
      * expose the exploded candidate id as "article_id".

    The click rate and the resulting shape are written to the log.

    Args:
        df: behaviors frame containing the columns named above.

    Returns:
        The exploded frame with "is_clicked" and "article_id".
    '''

    # Native list-length expression instead of a per-row Python lambda.
    # Cast to Int64 so the column dtype matches what the lambda produced.
    df = df.with_columns(
        pl.col("article_ids_inview").list.lengths().cast(pl.Int64).alias("count_article_ids_inview"),
    )

    # Explode article_ids_inview
    df = df.explode("article_ids_inview")

    # If article_ids_inview is in article_ids_clicked, then 1, otherwise 0
    df = df.with_columns(
        pl.when(pl.col("article_ids_inview").is_in(pl.col("article_ids_clicked")))
        .then(1)
        .otherwise(0)
        .alias("is_clicked")
    )

    # Delete columns that are not used
    df = df.drop(["article_ids_clicked","next_read_time","next_scroll_percentage","article_id"])

    # Change the name of article_ids_inview to article_id (kept as add + drop
    # so the new column stays at the end, as before)
    df = df.with_columns(pl.col("article_ids_inview").alias("article_id"))
    df = df.drop("article_ids_inview")

    # Calculate the rate of is_clicked directly on the polars column
    is_clicked_rate = df["is_clicked"].mean()
    logger.info(f'is_clicked_rate: {is_clicked_rate}')

    # Show the shape of df
    logger.info(f'df shape: {df.shape}')

    return df

# Feature Engineering

### Inview Co-occurrence Feature

In [7]:
def feat_inview_cooccur(
        df: pl.DataFrame, mode: str,
        covisit_dir: str = '/home/data/inview_cooccur/'
) -> pl.DataFrame:
    '''
    Create features that show the number of co-visits between articles.

    For every candidate article of an impression, the pre-computed
    co-occurrence counts between that candidate and every article shown in the
    same impression are aggregated (sum / mean / max / min / std). Each
    aggregate is additionally divided by its maximum within the impression
    (`*_norm` columns).

    Args:
        df: candidate-level frame with "impression_id", "user_id" and
            "article_id".
        mode: prefix of the parquet file to load
            (`{covisit_dir}{mode}_covisit_count.parquet`).
        covisit_dir: directory prefix (including the trailing slash) holding
            the pre-computed co-visit files. The file is joined on
            "article_id" / "article_id_right" and must provide "cooccur_count".

    Returns:
        `df` with the ten `cooccur_count_*` feature columns appended. Rows with
        impression_id == 0 are excluded from the computation and therefore get
        nulls.
    '''

    _len = len(df)
    stats = ['sum', 'mean', 'max', 'min', 'std']
    stat_cols = [f'cooccur_count_{stat}' for stat in stats]

    # Load the pre-computed co-visit counts
    df_covisit_count = pl.read_parquet(f'{covisit_dir}{mode}_covisit_count.parquet')

    tmp_df = df.select(['impression_id','user_id','article_id'])

    # Remove impression_id = 0
    tmp_df = tmp_df.filter(pl.col('impression_id') != 0)

    # Pair every candidate with every article of the same impression; the
    # partner ends up in "article_id_right".
    tmp_df = tmp_df.join(tmp_df, on = ['impression_id','user_id'],how='left')

    tmp_df = tmp_df.join(df_covisit_count, on = ['article_id','article_id_right'],how='left')

    feat_df = tmp_df.groupby(['impression_id','user_id','article_id']).agg(
        [getattr(pl.col('cooccur_count'), stat)().alias(col) for stat, col in zip(stats, stat_cols)]
    )

    # Per-impression maximum of every aggregate, used as the normaliser
    feat_df = feat_df.join(
        feat_df.groupby('impression_id').agg(
            [pl.col(col).max().alias(f'{col}_max') for col in stat_cols]
        ),
        on='impression_id'
    )

    # Normalize each feature by the maximum value for each impression_id
    feat_df = feat_df.with_columns(
        [(pl.col(col) / pl.col(f'{col}_max')).alias(f'{col}_norm') for col in stat_cols]
    ).drop([f'{col}_max' for col in stat_cols])

    df = df.join(feat_df, on = ['impression_id','user_id','article_id'],how='left')

    # The join must not duplicate candidate rows
    assert _len == len(df)

    return df

### History Click Counts

In [8]:
def feat_clicked_history_count(
        df: pl.DataFrame, 
        df_history: pl.DataFrame
) -> pl.DataFrame:
    '''
    Add features that show the number of times an article has been clicked in the past.

    Adds two columns to `df`:
      * "article_read_count": number of rows in the user's history for the
        candidate article (0 when the user never read it);
      * "article_last_read_time_diff": hours between "impression_time" and the
        user's most recent read of the candidate (null when never read).

    Args:
        df: candidate-level frame with "user_id", "article_id" and
            "impression_time".
        df_history: history frame whose five columns are, in this order,
            user id and the list columns impression time, scroll percentage,
            article id and read time (they are renamed positionally).

    Returns:
        `df` with the two feature columns appended; row count is unchanged.
    '''
    _len = len(df)

    # Distinct (user, candidate) pairs only. Joining the history against the
    # raw candidate rows would repeat every history row once per impression in
    # which the pair appears and inflate the read count by that factor.
    user_article = df.select(['user_id','article_id']).unique()

    df_history = df_history.explode(['impression_time_fixed','scroll_percentage_fixed','article_id_fixed','read_time_fixed'])
    df_history.columns = ['user_id','impression_time_history','scroll_percentage_history','article_id','read_time_history']

    # Keep only history rows whose (user_id, article_id) is also a candidate pair
    df_history = df_history.join(user_article, on=['user_id','article_id'], how='inner')

    feat_df = df_history.groupby(['user_id','article_id']).agg(
        # Count the number of times the article has been read
        pl.count('impression_time_history').alias('article_read_count'),
        # Get the latest time the article was read
        pl.max('impression_time_history').alias('article_last_read_time'),
    )

    df = df.join(feat_df, on=['user_id','article_id'], how='left')

    # Convert the difference between last_read_time and impression_time to X hours
    df = df.with_columns(
        ((pl.col('impression_time') - pl.col('article_last_read_time')) / timedelta(hours=1)).alias('article_last_read_time_diff')
    ).drop('article_last_read_time')

    # Pairs without any history row were never read: count 0
    df = df.with_columns([
        pl.col("article_read_count").fill_null(0),
    ])

    assert _len == len(df)
    
    return df

In [9]:
def feat_clicked_history_count_by_article(
        df: pl.DataFrame, 
        df_history: pl.DataFrame
) -> pl.DataFrame:
    '''
    Add "article_read_count_v2": how many history rows (over all users in
    `df_history`) refer to the candidate article.

    Args:
        df: candidate-level frame with "article_id".
        df_history: history frame whose five columns are, in this order,
            user id and the list columns impression time, scroll percentage,
            article id and read time (they are renamed positionally).

    Returns:
        `df` with "article_read_count_v2" appended (null for articles that do
        not occur in the history); row count is unchanged.
    '''
    _len = len(df)

    # Distinct candidate ids only. Joining the history against every candidate
    # row would multiply each history row by the number of times the article
    # is shown in `df`, blowing up the intermediate frame and the count.
    candidate_articles = df.select(['article_id']).unique()

    df_history = df_history.explode(['impression_time_fixed','scroll_percentage_fixed','article_id_fixed','read_time_fixed'])
    df_history.columns = ['user_id','impression_time_history','scroll_percentage_history','article_id','read_time_history']

    # Keep only history rows that refer to a candidate article
    df_history = df_history.join(candidate_articles, on=['article_id'], how='inner')

    feat_df = df_history.groupby(['article_id']).agg(
        # Count the number of times the article has been read
        pl.count('impression_time_history').alias('article_read_count_v2'),
    )

    df = df.join(feat_df, on=['article_id'], how='left')
    
    assert _len == len(df)
    
    return df

### Load Vector Parquet Files as DataFrames

In [10]:
def load_vector_df(path_str: str) -> pl.DataFrame:
    '''
    Load the vector. The vector is provided by the original data.

    The parquet file is read with pandas; its LAST column is taken as the
    embedding column (one vector per row) and is expanded into one column per
    dimension named "vector_0", "vector_1", ... The "article_id" column of the
    file is appended as the final column.

    Args:
        path_str: path of the parquet file; it must contain "article_id".

    Returns:
        polars DataFrame with the `vector_i` columns followed by "article_id".
    '''
    _vec = pd.read_parquet(path_str)

    logger.info(f'_vec columns : {_vec.columns}')

    col_name = _vec.columns[-1]

    try:
        # Fast path: stack all vectors into one 2-D array in a single call
        # instead of building a pandas Series per row.
        df_vec = pd.DataFrame(np.vstack(_vec[col_name].to_numpy()), index=_vec.index)
    except (ValueError, TypeError):
        # Vectors that cannot be stacked (empty frame, missing or ragged
        # entries) go through the original row-wise expansion.
        df_vec = _vec.apply(lambda row: pd.Series(row[col_name]), axis=1)
    df_vec.columns = [f'vector_{i}' for i in range(df_vec.shape[1])]

    df_vec['article_id'] = _vec['article_id']

    df_vec = pl.from_pandas(df_vec)

    logger.info(f'{path_str} shape: {df_vec.shape}')

    return df_vec

def load_my_vector_df(path_str: str) -> pl.DataFrame:
    '''
    Read the parquet file at `path_str` and return it unchanged as a polars DataFrame.
    '''
    return pl.read_parquet(path_str)
    

# Embedding tables loaded with the two loader functions above, keyed by a
# short name. Only the 'image' entry is active here; the commented-out entries
# are the other embedding sources that can be switched in.
common_vec_dict = {
    # 'contrastive':load_vector_df('/home/data/Ekstra_Bladet_contrastive_vector/contrastive_vector.parquet'),
    #'w2v':load_vector_df('/home/data/Ekstra_Bladet_word2vec/document_vector.parquet'),
    #'xlm':load_vector_df('/home/data/FacebookAI_xlm_roberta_base/xlm_roberta_base.parquet'),
    #'bert':load_vector_df('/home/data/google_bert_base_multilingual_cased/bert_base_multilingual_cased.parquet'),
    'image':load_vector_df('/home/data/Ekstra_Bladet_image_embeddings/image_embeddings.parquet'),
    #'bge-m3':load_my_vector_df('/home/data/bge-m3-dense/bge-m3-dense_vec_df.parquet'),
    #'multilingual-e5-large-instruct':load_my_vector_df('/home/data/multilingual-e5-large-instruct/multilingual-e5-large-instruct_vec_df.parquet')
}

# Embedding tables stored in one file per split ('train' / 'valid' / 'test'),
# keyed first by embedding name and then by split.
my_vec_dict = {
    'item2vec':{
        'train':load_my_vector_df('/home/data/item2vec_1/train_item2vec.parquet'),
        'valid':load_my_vector_df('/home/data/item2vec_1/valid_item2vec.parquet'),
        'test':load_my_vector_df('/home/data/item2vec_1/test_item2vec.parquet'),
    }
}

[2024-07-07 12:59:22,077][INFO] _vec columns : Index(['article_id', 'image_embedding'], dtype='object')
[2024-07-07 12:59:49,615][INFO] /home/data/Ekstra_Bladet_image_embeddings/image_embeddings.parquet shape: (102603, 1025)


#### Cossim Related Features

In [11]:
def feat_clicked_cossim(
        df: pl.DataFrame,
        df_history: pl.DataFrame,
        df_vec: pl.DataFrame,
        cossim_name: str,
        type: str='mean',
        window: int=1,
) -> pl.DataFrame:
    '''
    Add the cosine similarity between a per-user embedding built from the
    user's click history and the embedding of each candidate article.

    type (how the history vectors are combined per user):
     mean: element-wise mean of the history article vectors
     max: element-wise maximum of the history article vectors
     min: element-wise minimum of the history article vectors
     scroll_mean: mean of the vectors after scaling each one by
         scroll_percentage / 100
     read_time_mean: mean of the vectors after scaling each one by its share
         of the read time summed over the user's considered history rows

    Args:
        df: candidate-level frame with "article_id", "user_id",
            "impression_time" and "impression_id".
        df_history: history frame whose five columns are, in this order,
            user id and the list columns impression time, scroll percentage,
            article id and read time (they are renamed positionally).
        df_vec: embedding table with "article_id" and columns whose names
            contain "vector".
        cossim_name: name of the feature column to create.
        type: one of the values listed above.
        window: when > 0 only the `window` most recent history rows per user
            are used, otherwise the entire history.

    Returns:
        `df` with `cossim_name` appended; row count is unchanged.

    Raises:
        ValueError: if `type` is not one of the supported values.
    '''
    supported_types = ('mean', 'max', 'min', 'scroll_mean', 'read_time_mean')
    if type not in supported_types:
        # Previously an unknown value only failed much later with an
        # UnboundLocalError on `user_emb`.
        raise ValueError(f'type must be one of {supported_types}, got {type!r}')

    _len = len(df)

    df_history = df_history.explode(['impression_time_fixed','scroll_percentage_fixed','article_id_fixed','read_time_fixed'])
    df_history.columns = ['user_id','impression_time_history','scroll_percentage_history','article_id','read_time_history']
    df_history = df_history.fill_null(0)

    user_article = df.select(['article_id','user_id','impression_time','impression_id'])

    df_history = df_history.join(df_vec, on='article_id', how='left')
    vec_cols = [col for col in df_history.columns if 'vector' in col]

    # User embeddings
    # How many past actions to consider with the window (if window is 0, use the entire history)
    if window > 0:
        df_history = df_history.sort('impression_time_history', descending=True).groupby('user_id').head(window)

    # Optional re-weighting of the history vectors. All vector columns are
    # rescaled in ONE with_columns call rather than one call per dimension.
    if type == 'scroll_mean':
        df_history = df_history.with_columns(
            [(pl.col(col) * pl.col('scroll_percentage_history') / 100).alias(col) for col in vec_cols]
        )
    elif type == 'read_time_mean':
        user_read_time = df_history.groupby('user_id').agg(
            pl.sum('read_time_history').alias('user_read_time')
        )
        df_history = df_history.join(user_read_time, on='user_id', how='left')
        df_history = df_history.with_columns(
            (pl.col('read_time_history') / pl.col('user_read_time')).alias('read_time_percentage')
        )
        df_history = df_history.with_columns(
            [(pl.col(col) * pl.col('read_time_percentage')).alias(col) for col in vec_cols]
        )

    # 'max' / 'min' reduce element-wise; every other type averages
    reducer = type if type in ('max', 'min') else 'mean'
    user_emb = df_history.groupby('user_id').agg(
        [getattr(pl.col(col), reducer)().alias(f'user_{col}') for col in vec_cols]
    )

    # Article embeddings
    article_emb = df_vec.clone()
    article_emb.columns = [f'article_{col}' if col != 'article_id' else col for col in article_emb.columns]

    # Join user and article embeddings
    user_article = user_article.join(
        user_emb, on='user_id', how='left').join(
        article_emb, on='article_id', how='left')

    # Calculate cosine similarities
    user_vec = user_article.select([col for col in user_article.columns if 'user_vec' in col]).to_numpy()
    article_vec = user_article.select([col for col in user_article.columns if 'article_vec' in col]).to_numpy()

    user_vec = torch.tensor(user_vec)
    article_vec = torch.tensor(article_vec)

    similarity = F.cosine_similarity(
        user_vec, article_vec, dim=1)
    
    # Add as a feature
    user_article = user_article.with_columns(
        pl.Series(similarity.numpy()).alias(cossim_name)
    )

    feat_df = user_article.select(['user_id','article_id','impression_id',cossim_name])

    df = df.join(feat_df, on=['user_id','article_id','impression_id'], how='left')

    assert _len == len(df)

    return df

In [12]:
def feat_clicked_category_cossim(
        df: pl.DataFrame,
        df_history: pl.DataFrame,
        df_vec: pl.DataFrame,
        cossim_name: str,
        type: str='mean',
        window: int=1,
) -> pl.DataFrame:
    '''
    Add the cosine similarity between a per-(user, category) embedding built
    from the user's click history and the embedding of each candidate article
    of that same category.

    The category of the history articles is looked up in the module-level
    `articles` frame ("article_id", "category").

    type (how the history vectors are combined per user and category):
     mean: element-wise mean of the history article vectors
     max: element-wise maximum of the history article vectors
     min: element-wise minimum of the history article vectors
     scroll_mean: mean of the vectors after scaling each one by
         scroll_percentage / 100
     read_time_mean: mean of the vectors after scaling each one by its share
         of the read time summed over the considered rows of that
         (user, category)

    Args:
        df: candidate-level frame with "article_id", "user_id", "category",
            "impression_time" and "impression_id".
        df_history: history frame whose five columns are, in this order,
            user id and the list columns impression time, scroll percentage,
            article id and read time (they are renamed positionally).
        df_vec: embedding table with "article_id" and columns whose names
            contain "vector".
        cossim_name: name of the feature column to create.
        type: one of the values listed above.
        window: when > 0 only the `window` most recent history rows per
            (user, category) are used, otherwise the entire history.

    Returns:
        `df` with `cossim_name` appended; row count is unchanged.

    Raises:
        ValueError: if `type` is not one of the supported values.
    '''
    supported_types = ('mean', 'max', 'min', 'scroll_mean', 'read_time_mean')
    if type not in supported_types:
        # Previously an unknown value only failed much later with an
        # UnboundLocalError on `user_emb`.
        raise ValueError(f'type must be one of {supported_types}, got {type!r}')

    _len = len(df)
    group_keys = ['user_id', 'category']

    df_history = df_history.explode(['impression_time_fixed','scroll_percentage_fixed','article_id_fixed','read_time_fixed'])
    df_history.columns = ['user_id','impression_time_history','scroll_percentage_history','article_id','read_time_history']
    df_history = df_history.fill_null(0)
    # Relies on the notebook-level `articles` frame for the category lookup
    df_history = df_history.join(articles.select(['article_id', 'category']), how='left', on='article_id')

    user_article = df.select(['article_id','user_id','category','impression_time','impression_id'])

    df_history = df_history.join(df_vec, on='article_id', how='left')
    vec_cols = [col for col in df_history.columns if 'vector' in col]

    # User embeddings
    # How many past actions to consider with the window (if window is 0, use the entire history)
    if window > 0:
        df_history = df_history.sort('impression_time_history',descending=True
                                     ).groupby(group_keys).head(window)

    # Optional re-weighting of the history vectors. All vector columns are
    # rescaled in ONE with_columns call rather than one call per dimension.
    if type == 'scroll_mean':
        df_history = df_history.with_columns(
            [(pl.col(col) * pl.col('scroll_percentage_history') / 100).alias(col) for col in vec_cols]
        )
    elif type == 'read_time_mean':
        user_read_time = df_history.groupby(group_keys).agg(
            pl.sum('read_time_history').alias('user_read_time')
        )
        df_history = df_history.join(user_read_time, on=group_keys, how='left')
        df_history = df_history.with_columns(
            (pl.col('read_time_history') / pl.col('user_read_time')).alias('read_time_percentage')
        )
        df_history = df_history.with_columns(
            [(pl.col(col) * pl.col('read_time_percentage')).alias(col) for col in vec_cols]
        )

    # 'max' / 'min' reduce element-wise; every other type averages
    reducer = type if type in ('max', 'min') else 'mean'
    user_emb = df_history.groupby(group_keys).agg(
        [getattr(pl.col(col), reducer)().alias(f'user_{col}') for col in vec_cols]
    )

    # Article embeddings
    article_emb = df_vec.clone()
    article_emb.columns = [f'article_{col}' if col != 'article_id' else col for col in article_emb.columns]

    # Join user and article embeddings
    user_article = user_article.join(
        user_emb, on=group_keys, how='left').join(
        article_emb, on='article_id', how='left')

    # Calculate cosine similarities
    user_vec = user_article.select([col for col in user_article.columns if 'user_vec' in col]).to_numpy()
    article_vec = user_article.select([col for col in user_article.columns if 'article_vec' in col]).to_numpy()

    user_vec = torch.tensor(user_vec)
    article_vec = torch.tensor(article_vec)

    similarity = F.cosine_similarity(
        user_vec, article_vec, dim=1)
    
    # Add as a feature
    user_article = user_article.with_columns(
        pl.Series(similarity.numpy()).alias(cossim_name)
    )

    feat_df = user_article.select(['user_id','category','article_id','impression_id',cossim_name])

    df = df.join(feat_df, on=['user_id','category','article_id','impression_id'], how='left')

    assert _len == len(df)

    return df

In [13]:
def add_cossim_individual_features(
        df: pl.DataFrame, 
        df_history: pl.DataFrame, 
        df_vec: pl.DataFrame, 
        cossim_name: str
) -> pl.DataFrame:
    """Add cossim similarities between the article and the past articles.

    For every candidate row the cosine similarity between the candidate's
    vector and the vector of EACH article in the user's history is computed.
    Per (impression_id, article_id) these similarities are summarised as
    max / min / mean / median and the 10/25/75/90 % quantiles, and every
    summary is min-max scaled within its impression. Only the scaled
    `{cossim_name}_*_cossim_norm` columns are kept.

    Args:
        df: candidate-level frame with "article_id", "user_id",
            "impression_time" and "impression_id".
        df_history: history frame whose five columns are, in this order,
            user id and the list columns impression time, scroll percentage,
            article id and read time (they are renamed positionally).
        df_vec: embedding table with "article_id" and columns whose names
            contain "vector".
        cossim_name: prefix of the generated feature columns.

    Returns:
        `df` with the eight `*_norm` columns appended; row count is unchanged.
    """

    _len = len(df)

    df_history = df_history.explode(['impression_time_fixed','scroll_percentage_fixed','article_id_fixed','read_time_fixed'])
    df_history.columns = ['user_id','impression_time_history','scroll_percentage_history','article_id','read_time_history']
    df_history = df_history.fill_null(0).rename({"article_id": "past_article_id"})

    # One row per (candidate row, history article) of the same user
    user_article = df.select(['article_id','user_id','impression_time','impression_id'])
    user_article = user_article.join(df_history.select(["user_id", "past_article_id"]), on="user_id")

    # Similarities only depend on the article pair, so compute each pair once
    user_article_unique = user_article.unique(["article_id", "past_article_id"]).drop(["user_id", "impression_time", "impression_id"])

    # Article embeddings
    article_emb = df_vec.clone()
    article_emb.columns = [f'article_{col}' if col != 'article_id' else col for col in article_emb.columns]

    user_article_unique = user_article_unique.join(
        article_emb, on='article_id', how='left')

    # Same table again, now prefixed "past_article_", for the history side
    article_emb.columns = ["past_" + col if col != "article_id" else col for col in article_emb.columns]

    user_article_unique = user_article_unique.join(
        article_emb, left_on='past_article_id', right_on="article_id", how='left')

    # "past_article_vector_*" also contains "article_vec", hence the exclusion
    article_vec = user_article_unique.select(
        [col for col in user_article_unique.columns if ('article_vec' in col) and ("past_article_vec" not in col)]
    ).to_numpy()

    past_article_vec = user_article_unique.select(
        [col for col in user_article_unique.columns if ("past_article_vec" in col)]
    ).to_numpy()

    article_vec = torch.tensor(article_vec)
    past_article_vec = torch.tensor(past_article_vec)

    similarity = F.cosine_similarity(
        article_vec, past_article_vec, dim=1)

    article_similarities = pl.DataFrame(
        {
            "article_id": user_article_unique["article_id"],
            "past_article_id": user_article_unique["past_article_id"],
            "cossim": similarity.numpy()
        }
    )

    user_article = user_article.join(article_similarities, how="left", on=["article_id", "past_article_id"])

    # Calculate features from cossim
    features = user_article.groupby(["impression_id", "article_id"]).agg(
        pl.col("cossim").max().alias(f"{cossim_name}_max_cossim"),
        pl.col("cossim").min().alias(f"{cossim_name}_min_cossim"),
        pl.col("cossim").mean().alias(f"{cossim_name}_mean_cossim"),
        pl.col("cossim").median().alias(f"{cossim_name}_median_cossim"),
        pl.col("cossim").quantile(0.1).alias(f"{cossim_name}_quantile_q10_cossim"),
        pl.col("cossim").quantile(0.25).alias(f"{cossim_name}_quantile_q25_cossim"),
        pl.col("cossim").quantile(0.75).alias(f"{cossim_name}_quantile_q75_cossim"),
        pl.col("cossim").quantile(0.9).alias(f"{cossim_name}_quantile_q90_cossim"),
    )
    feature_cols = [col for col in features.columns if col not in ("impression_id", "article_id")]

    df = df.join(features, how="left", on=["impression_id", "article_id"])

    # Min-max scale every summary within its impression. The per-impression
    # bounds of all eight columns are computed with a single group-by/join.
    bounds = df.groupby('impression_id').agg(
        [pl.max(col).alias(f'max_{col}') for col in feature_cols]
        + [pl.min(col).alias(f'min_{col}') for col in feature_cols]
    )
    df = df.join(bounds, on='impression_id', how='left')

    df = df.with_columns(
        [
            ((pl.col(col) - pl.col(f'min_{col}')) / (pl.col(f'max_{col}') - pl.col(f'min_{col}'))).alias(f'{col}_norm')
            for col in feature_cols
        ]
    ).drop(
        [f'max_{col}' for col in feature_cols]
        + [f'min_{col}' for col in feature_cols]
        + feature_cols
    )

    # Same row-count guard as the other feature functions
    assert _len == len(df)

    return df

In [14]:
def feat_inview_cossim(
        df:pl.DataFrame,
        df_vec:pl.DataFrame,
        cossim_name:str,
) -> pl.DataFrame:
    '''
    Create features based on the cosine similarity between each candidate
    article and the mean vector of the candidate articles that share its
    user, session or impression.

    No click history is involved: the group embedding is the mean of the
    `df_vec` vectors of all rows in `df` with the same "user_id",
    "session_id" or "impression_id". One column per grouping is added:
    `{cossim_name}_user_id`, `{cossim_name}_session_id` and
    `{cossim_name}_impression_id`.

    Args:
        df: candidate-level frame with "impression_id", "user_id",
            "article_id" and "session_id".
        df_vec: embedding table with "article_id" and columns whose names
            contain "vector".
        cossim_name: prefix of the generated feature columns.

    Returns:
        `df` with the three similarity columns appended; row count is
        unchanged.
    '''

    _len = len(df)
    key_cols = ['impression_id','user_id','article_id']

    # Candidate rows with their own article vector attached
    df_user = df.select(['impression_id','user_id','article_id','session_id']).clone()
    df_user = df_user.join(df_vec, on='article_id', how='left')
    vec_cols = [col for col in df_user.columns if 'vector' in col]

    # The candidate vectors do not depend on the grouping column, so they are
    # converted to a tensor once instead of being re-joined in every iteration.
    article_vec = torch.tensor(df_user.select(vec_cols).to_numpy())

    for group_col in ['user_id','session_id','impression_id']:
        # Mean in-view vector per group
        user_emb = df_user.groupby(group_col).agg(
            [pl.mean(col).alias(f'user_{col}') for col in vec_cols]
        )

        # Left join keeps the row order of df_user, so the rows line up with
        # article_vec.
        user_vec = torch.tensor(
            df_user.select([group_col]).join(
                user_emb, on=group_col, how='left'
            ).select([f'user_{col}' for col in vec_cols]).to_numpy()
        )

        # Calculate cosine similarities
        similarity = F.cosine_similarity(
            user_vec, article_vec, dim=1)

        feat_df = df_user.select(key_cols).with_columns(
            pl.Series(similarity.numpy()).alias(f'{cossim_name}_{group_col}')
        )

        df = df.join(feat_df, on=key_cols, how='left')

        assert _len == len(df)

    return df

### Simple Article Features

In [15]:
def feat_article_simple(df,articles):
    """Attach plain per-article attributes and impression-relative versions of them.

    Added columns: premium, total_inviews, total_pageviews, total_read_time,
    number_of_images, the three article_label_*_score columns (the sentiment
    score placed in the column of the article's sentiment label, 0 in the
    other two), published_time_diff (days between impression and publication),
    the *_norm columns, is_premium_subscriber and category.
    """
    _len = len(df)

    # Sentiment score routed into one column per label
    sentiment_exprs = [
        pl.when(pl.col('sentiment_label') == label)
        .then(pl.col('sentiment_score'))
        .otherwise(0)
        .alias(f'article_label_{label.lower()}_score')
        for label in ('Positive', 'Negative', 'Neutral')
    ]

    article_feats = (
        articles.select([
            'article_id','premium','published_time','total_inviews','total_pageviews','total_read_time',
            'sentiment_score','sentiment_label', 'image_ids'])
        .with_columns(pl.col("image_ids").list.lengths().alias("number_of_images"))
        .drop("image_ids")
        .with_columns(sentiment_exprs)
        .drop('sentiment_score','sentiment_label')
    )
    df = df.join(article_feats, on='article_id', how='left')

    # Age of the article at impression time, in days
    age_in_days = (pl.col('impression_time') - pl.col('published_time')) / timedelta(days=1)
    df = df.with_columns(age_in_days.alias('published_time_diff')).drop('published_time')

    # Reference values per impression: the maxima of the three totals and the
    # MINIMUM of published_time_diff
    per_impression = df.groupby('impression_id').agg(
        pl.max('total_inviews').alias('max_total_inviews'),
        pl.max('total_pageviews').alias('max_total_pageviews'),
        pl.max('total_read_time').alias('max_total_read_time'),
        pl.min('published_time_diff').alias('min_published_time_diff'),
    )
    df = df.join(per_impression, on='impression_id', how='left')

    # Totals are divided by their impression maximum, the age by the impression minimum
    df = df.with_columns(
        (pl.col('total_inviews') / pl.col('max_total_inviews')).alias('total_inviews_norm'),
        (pl.col('total_pageviews') / pl.col('max_total_pageviews')).alias('total_pageviews_norm'),
        (pl.col('total_read_time') / pl.col('max_total_read_time')).alias('total_read_time_norm'),
        (pl.col('published_time_diff') / pl.col('min_published_time_diff')).alias('published_time_diff_norm'),
    ).drop(['max_total_inviews','max_total_pageviews','max_total_read_time','min_published_time_diff'])

    # 1 when a subscriber is shown a premium article, else 0
    premium_for_subscriber = pl.col('premium') & pl.col('is_subscriber')
    df = df.with_columns(
        pl.when(premium_for_subscriber).then(1).otherwise(0).alias('is_premium_subscriber')
    )

    category_lookup = articles.select(["article_id", "category"])
    df = df.join(category_lookup, how="left", on="article_id")

    assert _len == len(df)

    return df

### Last Impression Time Difference Features

In [16]:
def feat_user_last_impression_publish_time_diff(
    df: pl.DataFrame,
    df_history: pl.DataFrame,
    articles: pl.DataFrame
) -> pl.DataFrame:
    """Append the number of days between the user's latest history impression
    and the publication of the candidate article
    (column 'user_last_impression_time-publish_time_diff')."""

    n_rows_before = df.shape[0]

    # Long format: one row per history entry
    history_long = df_history.explode(['impression_time_fixed', 'scroll_percentage_fixed', 'article_id_fixed','read_time_fixed'])
    history_long.columns = ['user_id', 'impression_time_history', 'scroll_percentage_history', 'article_id','read_time_history']

    # Most recent history timestamp of every user
    last_seen = history_long.groupby('user_id').agg(
        pl.max('impression_time_history').alias('user_last_impression_time')
    )
    publish_times = articles.select(['article_id','published_time'])

    df = df.join(last_seen, on='user_id', how='left')
    df = df.join(publish_times, on='article_id', how='left')

    # Gap expressed in days; the two helper columns are removed afterwards
    gap_in_days = (pl.col('user_last_impression_time') - pl.col('published_time')) / timedelta(days=1)
    df = df.with_columns(gap_in_days.alias('user_last_impression_time-publish_time_diff'))
    df = df.drop(['user_last_impression_time','published_time'])

    assert df.shape[0] == n_rows_before

    return df

### Article Inview Popularity Features

In [17]:
def feat_article_pop(
    df:pl.DataFrame,
    mode:str,
    df_pop_path:str = '/home/data/article_pop_inview/',
    metacol:str=None
) -> pl.DataFrame:
    """Join pre-computed in-view counts per article and time bucket onto `df`.

    For each interval in a fixed list (1m ... 24h) the file
    `{df_pop_path}{mode}_article_pop_inview_{interval}[_{metacol}].parquet`
    is loaded, "impression_time" is truncated to that interval and the counts
    are joined on article id and truncated time (plus `metacol` when given).
    The joined column `rounded_{interval}_inview_count[_{metacol}]` is also
    divided by its maximum within the impression (`*_norm`).

    Intervals whose file cannot be loaded are skipped with a warning, so the
    corresponding columns are then absent from the result.

    Args:
        df: candidate-level frame with "article_id", "impression_time",
            "impression_id" (and `metacol` when given).
        mode: file name prefix.
        df_pop_path: directory prefix including the trailing slash.
        metacol: optional extra join key / file name suffix.

    Returns:
        `df` with the count and normalised count columns appended; row count
        is unchanged.
    """
    _len = len(df)

    # Load the pre-computed article popularity
    time_interval_list = ['1m','2m','3m','5m','10m','15m','20m','30m','1h','2h','3h','6h','12h','24h'] 

    # File and column names only differ by this optional suffix
    suffix = f'_{metacol}' if metacol else ''
    extra_keys = [metacol] if metacol else []

    for time_interval in time_interval_list:
        file_path = f'{df_pop_path}{mode}_article_pop_inview_{time_interval}{suffix}.parquet'
        rounded_col = f'rounded_{time_interval}_datetime'
        colname = f'rounded_{time_interval}_inview_count{suffix}'

        try:
            df_pop = load_my_vector_df(file_path)
        except Exception as exc:
            # Best effort: skip this interval, but report the real cause and
            # do not swallow KeyboardInterrupt/SystemExit like a bare except.
            logger.warning(f'{file_path} could not be loaded, skipping: {exc}')
            continue

        # Bucket the impression time so it matches the pre-computed table
        df = df.with_columns(
            pl.col("impression_time").dt.truncate(time_interval).alias(rounded_col)
        )

        df = df.join(df_pop, on=['article_id', rounded_col] + extra_keys, how='left').drop(
            [rounded_col]
        )

        # Normalise by the largest count within the impression
        df = df.join(
            df.groupby('impression_id').agg(
                pl.max(colname).alias(f'max_{colname}')
            ), on='impression_id', how='left'
        )

        df = df.with_columns(
            (pl.col(colname) / pl.col(f'max_{colname}')).alias(f'{colname}_norm')
        ).drop(f'max_{colname}')

    assert _len == len(df)

    return df

## Add Statistics of Strong Features

In [18]:
def add_statistic(
        df: pl.DataFrame
) -> pl.DataFrame:
    """Add per-impression statistics of the strongest features.

    For every target column the max/min/mean/median/std/skew/kurtosis within
    each ``impression_id`` is joined back as ``<col>_<op>``. For all target
    columns except the three ``total_*`` ones, the difference and the ratio
    between the max/mean/median/min statistic and the row's own value are
    added as ``<col>_<op>-<col>`` and ``<col>_<op>/<col>``.
    """
    inview_windows = ['1m', '2m', '3m', '5m', '10m', '15m', '20m', '30m', '1h', '3h', '6h', '12h', '24h']
    article_totals = ['total_inviews', 'total_pageviews', 'total_read_time']
    target_cols = (
        article_totals
        + [f'rounded_{window}_inview_count' for window in inview_windows]
        + [
            'inview_count',
            'past_inview_count',
            'time_gap_to_next_inview_impression_time',
            'time_gap_from_prev_inview_impression_time',
            'published_time_diff',
        ]
    )

    operations = ("max", "min", "mean", "median", "std", "skew", "kurtosis")

    # One aggregation expression per (column, statistic) pair.
    aggregations = [
        getattr(pl.col(name), stat)().alias(f"{name}_{stat}")
        for name in target_cols
        for stat in operations
    ]
    per_impression = df.groupby("impression_id").agg(aggregations)
    df = df.join(per_impression, on="impression_id", how="left")

    # Difference and ratio to the row's own value; these only read columns
    # that already exist, so they can be added in one pass.
    relative_exprs = []
    for name in target_cols:
        if name in article_totals:
            continue
        for stat in ("max", "mean", "median", "min"):
            stat_col = pl.col(f"{name}_{stat}")
            relative_exprs.append((stat_col - pl.col(f"{name}")).alias(f"{name}_{stat}-{name}"))
            relative_exprs.append((stat_col / pl.col(f"{name}")).alias(f"{name}_{stat}/{name}"))

    return df.with_columns(relative_exprs)

### Past Category Feature

In [19]:
def add_past_category_ratios(
        df: pl.DataFrame, 
        df_history: pl.DataFrame
) -> pl.DataFrame:
    """Add how often each user read the candidate's category in the past.

    The history is exploded to one row per read article, joined with the
    module-level ``articles`` frame to obtain the category, and counted per
    ``(user_id, category)``. Three columns are joined onto ``df``:

    * ``past_category_count`` - number of history rows of the user in that
      category,
    * ``max_past_category_count_count`` - the user's largest category count,
    * ``past_category_ratio`` - the count divided by that maximum.

    Args:
        df: Candidate frame; must contain ``user_id`` and ``category``.
        df_history: One row per user with the list column
            ``article_id_fixed``.

    Returns:
        ``df`` left-joined with the three columns above (null where the user
        has no history in the candidate's category).
    """
    # Columns are addressed by name rather than by position, so a different
    # column order in the history frame cannot silently mislabel them. Only
    # the article ids are needed for the category counts.
    history_long = (
        df_history
        .select(['user_id', 'article_id_fixed'])
        .explode('article_id_fixed')
        .rename({'article_id_fixed': 'article_id'})
        .fill_null(0)
    )

    # Joining history data with article data; only the category is needed,
    # so the remaining article columns are not carried through the join.
    article_categories = articles.select(['article_id', 'category'])
    user_category_df = (
        history_long
        .join(article_categories, on='article_id', how='left')
        .groupby(["user_id", "category"])
        .count()
        .rename({"count": "past_category_count"})
    )

    user_category_df = user_category_df.with_columns(
        pl.col("past_category_count").max().over("user_id").alias("max_past_category_count_count")
    )
    user_category_df = user_category_df.with_columns(
        (pl.col("past_category_count") / pl.col("max_past_category_count_count")).alias("past_category_ratio")
    )

    return df.join(user_category_df, on=["user_id", "category"], how="left")

### Inview Count features

In [20]:
def calculate_inview_counts(df):
    """Calculate inview counts and time gaps for each (user_id, article_id) pair.

    The frame is sorted by ``impression_time`` (so the returned row order
    differs from the input) and the following columns are added:

    * ``inview_count`` - number of rows of the same user/article pair,
    * ``past_inview_count`` - running index of the row inside the pair,
    * ``time_gap_to_next_inview_impression_time`` - hours until the same
      article is in view for the user again (null for the last occurrence),
    * ``time_gap_from_prev_inview_impression_time`` - hours since the article
      was previously in view for the user (null for the first occurrence).
    """
    pair_keys = ["user_id", "article_id"]

    df = df.sort("impression_time")

    df = df.with_columns(
        [
            pl.col("impression_id").count().over(pair_keys).alias("inview_count"),
            pl.col("impression_id").cumcount().over(pair_keys).alias("past_inview_count")
        ]
    )

    # Rows are in ascending time order, so shift(-1) pulls the *later* row
    # (next occurrence) and shift(1) the *earlier* row (previous occurrence).
    # The two shifts used to be swapped, which made both gaps negative and
    # stored each one under the other one's name.
    df = df.with_columns([
        pl.col("impression_time").shift(-1).over(pair_keys).alias("next_inview_impression_time"),
        pl.col("impression_time").shift(1).over(pair_keys).alias("prev_inview_impression_time")
    ])

    one_hour = timedelta(hours=1)
    df = df.with_columns(
        [
            ((pl.col("next_inview_impression_time") - pl.col("impression_time")) / one_hour).alias("time_gap_to_next_inview_impression_time"),
            ((pl.col("impression_time") - pl.col("prev_inview_impression_time")) / one_hour).alias("time_gap_from_prev_inview_impression_time"),
        ]
    ).drop(["next_inview_impression_time", "prev_inview_impression_time"])

    return df

### Cossim by Category Features

In [21]:
def add_cossim_by_category(
        behaviors_df: pl.DataFrame, 
        history_df: pl.DataFrame, 
        emb_df: pl.DataFrame, 
        cossim_name: str, 
        cat_col: str
) -> pl.DataFrame:
    """Cosine similarity between the user's past actions and the article vector by category.

    Args:
        behaviors_df: Candidate rows; the code reads ``user_id``, ``article_id``
            and ``cat_col`` from it.
        history_df: History rows joined on ``user_id`` and ``article_id``
            (presumably already exploded to one row per read article - confirm
            against the caller).
        emb_df: ``article_id`` plus embedding columns; every column other than
            ``article_id`` is treated as an embedding dimension.
        cossim_name: Name of the output column.
        cat_col: Column of ``behaviors_df`` whose values define the weights.

    Returns:
        ``behaviors_df`` with one extra column ``cossim_name``.
    """
    
    # One indicator column per distinct value of cat_col; the per-user mean of
    # each indicator is the share of the user's rows having that value.
    cat_vals = behaviors_df[cat_col].unique().to_list()
    cat_weight_df = behaviors_df.select(["user_id", cat_col])

    weight_cols = []
    for cat_val in cat_vals:
        cat_weight_df = cat_weight_df.with_columns((pl.col(cat_col)==cat_val).alias(f"{cat_col}__{cat_val}"))
        weight_cols.append(f"{cat_col}__{cat_val}")
    cat_weight_df = cat_weight_df.groupby("user_id").agg([pl.col(col).mean() for col in weight_cols])

    # Attach each user's weights to their history rows.
    _history_df = history_df.join(
        cat_weight_df,
        on="user_id",
        how="left",
    )

    # Attach the embedding of every history article, prefixed with "userx__".
    user_emb_df = _history_df.join(
        emb_df.rename({col: f"userx__{col}" for col in emb_df.columns if col != "article_id"}),
        on="article_id",
        how="left",
    )
    user_emb_cols = [col for col in user_emb_df.columns if "userx__" in col]

    weighted_emb = None
    for weight_col in weight_cols:
        # NOTE(review): `.sum()` reduces over every row of user_emb_df, i.e.
        # across all users, so `_weighted_emb` looks like one global weighted
        # mean vector per category value rather than a per-user profile -
        # confirm this is intended.
        _weighted_emb = (user_emb_df.select(user_emb_cols) * user_emb_df[weight_col]).sum() / user_emb_df[weight_col].sum()
        # Presumably the one-row frame is broadcast against the per-user weight
        # Series, giving one row per row of cat_weight_df - TODO confirm for
        # the installed polars version.
        if weighted_emb is not None:
            weighted_emb += cat_weight_df[weight_col] * _weighted_emb
        else:
            weighted_emb = cat_weight_df[weight_col] * _weighted_emb
    # Horizontal concat pairs rows by position: this assumes weighted_emb kept
    # the row order of cat_weight_df.
    weighted_emb = pl.concat([cat_weight_df.select("user_id"), weighted_emb], how="horizontal")
    _behaviors_df = behaviors_df.join(weighted_emb, on="user_id", how="left")

    # Attach the candidate article's own embedding, prefixed with "article__".
    _behaviors_df = _behaviors_df.join(
        emb_df.rename({col: f"article__{col}" for col in emb_df.columns if col != "article_id"}),
        on="article_id",
        how="left",
    )
    article_emb_cols = [col for col in _behaviors_df.columns if "article__" in col]

    # Row-wise cosine similarity: dot product divided by the product of norms.
    cossims = (_behaviors_df[user_emb_cols] * _behaviors_df[article_emb_cols]).sum(axis=1) / \
        (np.linalg.norm(_behaviors_df[user_emb_cols], axis=1) * np.linalg.norm(_behaviors_df[article_emb_cols], axis=1))
    # The result is attached by position, which assumes the left joins above
    # kept `_behaviors_df` in the same row order as `behaviors_df`.
    behaviors_df = behaviors_df.with_columns(
        pl.lit(cossims).alias(cossim_name),
    )

    return behaviors_df

### Normalize Rank Columns

In [22]:
def normalize_rank_by_percentage(
        df: pl.DataFrame
    ) -> pl.DataFrame:
    """Turn in-impression ranks into fractions of the impression size.

    Every listed ``*_rank`` column is divided by the number of rows that share
    the row's ``impression_id``; the columns are overwritten in place and no
    helper column is left behind.
    """
    inview_windows = ('1m', '2m', '3m', '5m', '10m', '15m', '20m', '30m', '1h', '3h', '6h', '12h', '24h')
    ranked_features = (
        ['published_time_diff', 'total_inviews_norm', 'total_pageviews_norm', 'total_read_time_norm']
        + [f'rounded_{window}_inview_count' for window in inview_windows]
        + [
            'inview_count',
            'past_inview_count',
            'time_gap_to_next_inview_impression_time',
            'time_gap_from_prev_inview_impression_time',
        ]
        + [f'next_impression_id_{method}_cossim' for method in ('mean', 'min', 'max')]
    )
    rank_cols = [f'{feature}_rank' for feature in ranked_features]

    # Number of candidate rows per impression.
    size_col = "impression_id_article_counts"
    impression_sizes = df.groupby("impression_id").count().rename({"count": size_col})

    return (
        df.join(impression_sizes, how="left", on="impression_id")
        .with_columns([pl.col(name) / pl.col(size_col) for name in rank_cols])
        .drop(size_col)
    )

### Next Impression ID Cosine Similarities

In [23]:
def add_next_impression_id_cossim(
        df: pl.DataFrame, 
        df_vec: pl.DataFrame, 
        method: str = "mean"
) -> pl.DataFrame:
    """Cosine similarity between a candidate and the next impression's articles.

    Within each ``session_id`` the impressions are ordered by
    ``impression_time``; for every impression the following one is looked up.
    The vectors of all candidate articles of that next impression are reduced
    element-wise with ``method`` and compared with the vector of each
    candidate article of the current impression.

    Args:
        df: Candidate frame; the code reads ``impression_id``, ``session_id``,
            ``impression_time`` and ``article_id``.
        df_vec: ``article_id`` plus vector columns. Columns whose name
            contains ``'vector'`` are aggregated for the next impression.
        method: ``"mean"``, ``"min"`` or ``"max"``.

    Returns:
        ``df`` sorted by session and time with the extra column
        ``next_impression_id_<method>_cossim`` (null when the impression has
        no successor in its session).

    Raises:
        ValueError: If ``method`` is not one of the supported reductions.
    """
    supported_methods = ("mean", "min", "max")
    if method not in supported_methods:
        # Previously an unknown method only failed later with an unbound-name
        # error after the expensive sort/join work had already run.
        raise ValueError(f"method must be one of {supported_methods}, got {method!r}")

    colname = f"next_impression_id_{method}_cossim"

    df = df.sort(["session_id", "impression_time"], descending=[False, False])

    # One row per impression, linked to the following impression of the session.
    tmp_df = df.select(["impression_id", "session_id", "impression_time"])
    tmp_df = tmp_df.unique(["impression_id", "session_id", "impression_time"])
    tmp_df = tmp_df.sort(["session_id", "impression_time"], descending=[False, False])
    tmp_df = tmp_df.with_columns(pl.col("impression_id").shift(-1).over("session_id").alias("next_impression_id"))

    df = df.join(tmp_df.select(["impression_id", "next_impression_id"]), how="left", on="impression_id")

    # Candidates of every impression that is the successor of another one.
    # A de-duplicated Series is used for the membership test so that no Python
    # set holding one object per row has to be built.
    next_ids = df["next_impression_id"].drop_nulls().unique()
    df_next_impression_id = df.filter(pl.col("impression_id").is_in(next_ids)).select(["impression_id", "article_id"])
    df_next_impression_id = df_next_impression_id.join(df_vec, how="left", on="article_id")

    vec_cols = [col for col in df_next_impression_id.columns if 'vector' in col]

    # Element-wise reduction of the candidate vectors of each impression.
    next_impression_id_emb = df_next_impression_id.groupby('impression_id').agg(
        [getattr(pl.col(col), method)().alias(col) for col in vec_cols]
    )
    next_impression_id_emb.columns = [f'next_impression_id_{col}' if col != 'impression_id' else col for col in next_impression_id_emb.columns]

    df_current_impression_id = df.select(["impression_id", "article_id", "next_impression_id"])
    df_current_impression_id = df_current_impression_id.join(df_vec, how="left", on="article_id")

    df_current_impression_id.columns = [f'current_impression_id_{col}' if col not in ['article_id', 'impression_id', 'next_impression_id'] else col for col in df_current_impression_id.columns]

    df_crossed = df_current_impression_id.join(next_impression_id_emb, how="left", left_on=["next_impression_id"], right_on=["impression_id"])
    df_crossed = df_crossed.drop_nulls(["next_impression_id"])

    # The key column "next_impression_id" has no trailing underscore, so the
    # prefix filters below pick up vector columns only.
    current_impression_id_vec = df_crossed.select([col for col in df_crossed.columns if 'current_impression_id_' in col]).to_numpy()
    next_impression_id_vec = df_crossed.select([col for col in df_crossed.columns if 'next_impression_id_' in col]).to_numpy()

    similarity = F.cosine_similarity(
        torch.tensor(current_impression_id_vec), torch.tensor(next_impression_id_vec), dim=1
    )

    df_crossed = df_crossed.with_columns(
        pl.Series(similarity.numpy()).alias(colname)
    )

    feat_df = df_crossed.select(['impression_id', 'article_id', colname])
    df = df.join(feat_df, how="left", on=["impression_id", "article_id"]).drop("next_impression_id")
    return df

### Publish Time Related Features

In [24]:
def add_published_time_features(
        df: pl.DataFrame, 
        df_history: pl.DataFrame
) -> pl.DataFrame:
    """Add per-user statistics of article age at reading time.

    The history is exploded to one row per read article and joined with the
    module-level ``articles`` frame; the age in days
    (``impression_time - published_time``) is then summarised per user as
    mean / max / min / median and left-joined onto ``df`` by ``user_id``.
    """
    list_cols = ["impression_time_fixed", "scroll_percentage_fixed", "article_id_fixed", "read_time_fixed"]
    # Strip the "_fixed" suffix: impression_time, scroll_percentage, article_id, read_time.
    plain_names = {name: name[: -len("_fixed")] for name in list_cols}

    age_in_days = (pl.col('impression_time') - pl.col('published_time')) / timedelta(days=1)

    user_age_stats = (
        df_history
        .explode(list_cols)
        .rename(plain_names)
        .join(articles, how="left", on="article_id")
        .with_columns(age_in_days.alias('published_time_diff'))
        .groupby(["user_id"])
        .agg([
            getattr(pl.col("published_time_diff"), stat)().alias(f"{stat}_history_published_time_diff")
            for stat in ("mean", "max", "min", "median")
        ])
    )

    return df.join(user_age_stats, how="left", on="user_id")

## Run Feature Engineering

In [25]:
def create_feature(
        df: pl.DataFrame,
        df_history: pl.DataFrame,
        mode: str = 'train',
        logging: bool = True
) -> pl.DataFrame:
    """Run the whole feature-engineering pipeline on a candidate frame.

    The steps are applied in a fixed order: click-history counts, in-view
    co-occurrence, article attributes, popularity windows, past-category
    ratios, next-impression similarities, embedding similarities (shared and
    per-mode vectors), in-view counts, per-impression statistics, ranks,
    time-of-day parts and per-user published-time statistics.

    Besides its arguments the function reads the module-level ``articles``,
    ``common_vec_dict``, ``my_vec_dict`` and ``logger``.

    Args:
        df: Candidate frame (one row per impression/article pair).
        df_history: User history with list columns ``*_fixed``.
        mode: Forwarded to the steps that load pre-computed files and used as
            key into each ``my_vec_dict`` entry.
        logging: When true, progress and per-step timings are written to
            ``logger``. (The name shadows the ``logging`` module inside this
            function; it is kept for interface compatibility.)

    Returns:
        The frame with all engineered feature columns added.
    """
    step_started_at = datetime.now()

    def _log_step(label, frame, context=None):
        """Log an optional context line, then the time since the last step and the frame shape."""
        nonlocal step_started_at
        if not logging:
            return
        if context is not None:
            logger.info(context)
        now = datetime.now()
        # total_seconds() rather than .seconds: the latter drops whole days.
        elapsed_seconds = int((now - step_started_at).total_seconds())
        step_started_at = now
        logger.info(f'{label}/elapsed_seconds: {elapsed_seconds}/df shape: {frame.shape}')

    if logging:
        logger.info('start:feature_engineering')
        logger.info(f'df shape: {df.shape}')
        step_started_at = datetime.now()

    df = feat_clicked_history_count(df,df_history)
    df = feat_clicked_history_count_by_article(df,df_history)
    _log_step('clicked_history_count', df)

    df = feat_inview_cooccur(df,mode)
    _log_step('inview_cooccur', df)

    df = feat_article_simple(df,articles)
    _log_step('feat_article_simple', df)

    df = feat_user_last_impression_publish_time_diff(df,df_history,articles)
    _log_step('user_last_impression_publish_time_diff', df)

    df = feat_article_pop(df,mode,df_pop_path='/home/data/article_pop_inview/',metacol=None)
    _log_step('article_pop', df)

    df = add_past_category_ratios(df,df_history)
    _log_step('add_past_category_ratios', df)

    for method in ("mean", "min", "max"):
        df = add_next_impression_id_cossim(df, common_vec_dict["image"], method=method)
    _log_step('add_next_impression_id_cossim', df)

    # Similarities against the shared article vectors.
    for k,v in common_vec_dict.items():
        for agg_type in ['mean','scroll_mean','read_time_mean']:#'min', 'max',
            for window in [1,0]:
                if window == 1 and agg_type != 'mean':
                    continue

                df = feat_clicked_cossim(df,df_history,v,f'click_cossim_{k}_{agg_type}_{window}',agg_type,window)
            _log_step('clicked_inview_common_cossim', df, context=f'k: {k}, type: {agg_type}')

            df = feat_clicked_category_cossim(df,df_history,v,f'category_click_cossim_{k}_{agg_type}',agg_type,window=0)
            _log_step('category_click_cossim', df, context=f'k: {k}, type: {agg_type}')

        df = feat_inview_cossim(df,v,f'inview_cossim_{k}')
        _log_step('inview_my_cossim', df, context=f'k: {k}')

        df = add_cossim_individual_features(df, df_history, v, f"{k}_individual")
        _log_step('add_cossim_individual_features', df, context=f'k: {k}')

    # Similarities against the mode-specific vectors.
    for k,v in my_vec_dict.items():
        for agg_type in ['mean','scroll_mean','read_time_mean']:
            for window in [1,2,3,0]:
                if window == 1 and agg_type != 'mean':
                    continue
                # click-based
                df = feat_clicked_cossim(df,df_history,v[mode],f'click_cossim_{k}_{agg_type}_{window}',agg_type,window)
                _log_step('clicked_inview_my_cossim', df, context=f'k: {k}, type: {agg_type}')

    # The long-format history does not depend on the loop variables, so it is
    # built once instead of once per (vector, category column) pair.
    history_long = df_history.rename(
        {"impression_time_fixed": "impression_time",
         "scroll_percentage_fixed": "scroll_percentage",
         "article_id_fixed": "article_id",
         "read_time_fixed": "read_time"}
    ).explode(
        ["impression_time", "scroll_percentage", "article_id", "read_time"]
    ).sort(
        ["user_id", "impression_time"]
    )

    cat_cols = ["device_type", "is_subscriber", "is_sso_user"]
    for k,v in common_vec_dict.items():
        if k == 'one_hot':
            continue
        for cat_col in cat_cols:
            df = add_cossim_by_category(df, history_long, v, f"cossim__{k}__{cat_col}", cat_col)
            _log_step('add_cossim_by_category', df, context=f'k: {k}, cat_col: {cat_col}')

    df = calculate_inview_counts(df)
    _log_step('calculate_inview_counts', df)

    # Boolean columns are cast to integers before the statistics step.
    bool_cols = [name for name, dtype in zip(df.columns, df.dtypes) if dtype == pl.Boolean]
    if bool_cols:
        df = df.with_columns([pl.col(name).cast(pl.Int32) for name in bool_cols])

    df = add_statistic(df)
    _log_step('statistic', df)

    # NOTE(review): this ascending 'published_time_diff_rank' is overwritten
    # by the descending rank computed below ('published_time_diff' is part of
    # rank_cols). It is kept so the column position stays the same - confirm
    # which direction is intended.
    df = df.with_columns([
        (pl.col('total_pageviews') / pl.col('total_inviews')).alias('pageviews/inviews'),
        (pl.col('total_read_time') / pl.col('total_inviews')).alias('read_time/inviews'),
        pl.col('published_time_diff').rank().over('impression_id').alias('published_time_diff_rank'),
    ])

    rank_cols = ['total_inviews_norm', 'total_pageviews_norm', 'total_read_time_norm', 'rounded_1m_inview_count', 'rounded_2m_inview_count',
                 'rounded_3m_inview_count','rounded_5m_inview_count','rounded_10m_inview_count','rounded_15m_inview_count','rounded_20m_inview_count',
                 'rounded_30m_inview_count','rounded_1h_inview_count','rounded_3h_inview_count','rounded_6h_inview_count','rounded_12h_inview_count','rounded_24h_inview_count',
                 'inview_count', 'past_inview_count', 'time_gap_to_next_inview_impression_time', 'time_gap_from_prev_inview_impression_time', 
                 'next_impression_id_mean_cossim', 'next_impression_id_min_cossim', 'next_impression_id_max_cossim', 
                 'published_time_diff','published_time_diff_min-published_time_diff'
                ]
    # Descending rank inside each impression; every expression only reads
    # existing columns, so they can all be added in one pass.
    df = df.with_columns([
        pl.col(col).rank(descending=True).over('impression_id').alias(f'{col}_rank')
        for col in rank_cols
    ])

    df = df.with_columns([
        pl.col('impression_time').dt.hour().alias('hour'),
        pl.col('impression_time').dt.minute().alias('minute'),
        pl.col('impression_time').dt.second().alias('second'),
        pl.col('impression_time').dt.weekday().alias('weekday')
    ])

    df = normalize_rank_by_percentage(df)

    # Per-article summary of how many articles were in view together with it.
    grouped_df = df.groupby("article_id").agg(
        [
            pl.col("count_article_ids_inview").mean().alias("mean_count_article_ids_inview"),
            pl.col("count_article_ids_inview").min().alias("min_count_article_ids_inview"),
            pl.col("count_article_ids_inview").median().alias("median_count_article_ids_inview"),
        ]
    )
    df = df.join(grouped_df, on=["article_id"], how="left")

    df = add_published_time_features(df, df_history)

    return df

# Train

In [26]:
# Build the training feature table and persist it as parquet.
(OUTPUT_DIR / 'train').mkdir(parents=True, exist_ok=True)

trn_df = create_feature(get_target_df(trn_behaviors), trn_history, mode='train')
trn_df.write_parquet(OUTPUT_DIR / 'train' / 'trn_df.parquet')

[2024-07-07 12:59:53,667][INFO] is_clicked_rate: 0.09040443631956259
[2024-07-07 12:59:53,669][INFO] df shape: (2585747, 15)
[2024-07-07 12:59:53,671][INFO] start:feature_engineering
[2024-07-07 12:59:53,672][INFO] df shape: (2585747, 15)
[2024-07-07 13:00:02,998][INFO] clicked_history_count/elapsed_seconds: 9/df shape: (2585747, 18)
[2024-07-07 13:00:09,560][INFO] inview_cooccur/elapsed_seconds: 6/df shape: (2585747, 28)
[2024-07-07 13:00:10,796][INFO] feat_article_simple/elapsed_seconds: 1/df shape: (2585747, 43)
[2024-07-07 13:00:11,068][INFO] user_last_impression_publish_time_diff/elapsed_seconds: 0/df shape: (2585747, 44)
[2024-07-07 13:00:20,250][INFO] article_pop/elapsed_seconds: 9/df shape: (2585747, 72)
[2024-07-07 13:00:41,741][INFO] add_past_category_ratios/elapsed_seconds: 21/df shape: (2585747, 75)
[2024-07-07 13:01:42,287][INFO] add_next_impression_id_cossim/elapsed_seconds: 60/df shape: (2585747, 78)
[2024-07-07 13:02:37,024][INFO] k: image, type: mean
[2024-07-07 13:02:

# Valid

In [27]:
# Build the validation feature table (using the extended history) and persist it as parquet.
(OUTPUT_DIR / 'validation').mkdir(parents=True, exist_ok=True)

val_df = create_feature(get_target_df(val_behaviors), val_history, mode='valid')
val_df.write_parquet(OUTPUT_DIR / 'validation' / 'val_df.parquet')

[2024-07-07 13:17:44,189][INFO] is_clicked_rate: 0.08386031543130591
[2024-07-07 13:17:44,191][INFO] df shape: (2928942, 15)
[2024-07-07 13:17:44,192][INFO] start:feature_engineering
[2024-07-07 13:17:44,193][INFO] df shape: (2928942, 15)
[2024-07-07 13:17:53,529][INFO] clicked_history_count/elapsed_seconds: 9/df shape: (2928942, 18)
[2024-07-07 13:18:00,595][INFO] inview_cooccur/elapsed_seconds: 7/df shape: (2928942, 28)
[2024-07-07 13:18:01,335][INFO] feat_article_simple/elapsed_seconds: 0/df shape: (2928942, 43)
[2024-07-07 13:18:01,621][INFO] user_last_impression_publish_time_diff/elapsed_seconds: 0/df shape: (2928942, 44)
[2024-07-07 13:18:15,447][INFO] article_pop/elapsed_seconds: 13/df shape: (2928942, 72)
[2024-07-07 13:18:28,020][INFO] add_past_category_ratios/elapsed_seconds: 12/df shape: (2928942, 75)
[2024-07-07 13:19:26,610][INFO] add_next_impression_id_cossim/elapsed_seconds: 58/df shape: (2928942, 78)
[2024-07-07 13:20:29,006][INFO] k: image, type: mean
[2024-07-07 13:20