# Load Libraries

In [None]:
import os

from pathlib import Path

import pandas as pd
import polars as pl


import lightgbm as lgb

import warnings

# Silence every warning for the rest of the session.
# NOTE: this is a blanket filter, so deprecation warnings are hidden as well.
warnings.filterwarnings('ignore')

# Logging
import logging

# Get logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# `getLogger` returns the same object on every call, so re-running this cell
# would stack another pair of handlers on it and every message would be
# written twice (then three times, ...). Detach and close whatever is already
# attached before adding the fresh handlers below.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Set log format
formatter = logging.Formatter('[%(asctime)s][%(levelname)s] %(message)s')

# Persist the log to `logs.log` in the current working directory.
file_handler = logging.FileHandler('logs.log')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Settings to display log on notebook
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

# Inference & Ensembling

In [None]:
# Article metadata of the test set; the inference loop joins it onto the
# chunk frames by `article_id`.
articles = pl.read_parquet('/home/data/ebnerd_testset/articles.parquet')

# Load the eight saved LightGBM boosters, in file order, and bind each one to
# its own module-level name.
(
    lgb_model_part1,
    lgb_model_part2,
    lgb_model_part3,
    lgb_model_part4,
    lgb_model_part5,
    lgb_model_part6,
    lgb_model_part7,
    lgb_model_part8,
) = (
    lgb.Booster(model_file=model_path)
    for model_path in (
        'lgb_model_20p_1.txt',
        'lgb_model_20p_2.txt',
        'lgb_model_20p_3.txt',
        'lgb_model_20p_4.txt',
        'lgb_model_20p_5.txt',
        'lgb_model_20p_6.txt',
        'lgb_model_20p_7.txt',
        'lgb_model_20p_8.txt',
    )
)

# The feature list is taken from the first booster only.
# NOTE(review): presumably all eight boosters were trained on the same
# features in the same order — confirm.
feature_cols = lgb_model_part1.feature_name()

In [None]:
# Create the output tree for the predictions: the parent folder first, then
# the two sub-folders. Directories that already exist are left untouched.
for out_dir in (
    "../test_predictions/",
    "../test_predictions/sub_df_orgs/",
    "../test_predictions/sub_dfs/",
):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# Per-chunk results are written to disk and also collected in these lists.
# NOTE: the lists keep every processed chunk in memory for the whole run.
sub_df_list = []
sub_df_list_org = []

# ---------------------------------------------------------------------------
# Loop-invariant pieces of the ensemble, built once instead of once per chunk.
# ---------------------------------------------------------------------------
ensemble_models = [
    lgb_model_part1,
    lgb_model_part2,
    lgb_model_part3,
    lgb_model_part4,
    lgb_model_part5,
    lgb_model_part6,
    lgb_model_part7,
    lgb_model_part8,
]
# One raw-score column per model: y_pred1 ... y_pred8.
pred_cols = [f'y_pred{k}' for k in range(1, len(ensemble_models) + 1)]
# Every model gets the same weight (0.125 for eight models).
model_weight = 1.0 / len(ensemble_models)

# Replace each model's raw score by its rank within the impression, so models
# with differently scaled outputs contribute equally to the blend.
rank_exprs = [
    pl.col(col).rank().over('impression_id').alias(col) for col in pred_cols
]

# Weighted sum of the per-model ranks, accumulated left to right.
ensemble_expr = pl.col(pred_cols[0]) * model_weight
for later_col in pred_cols[1:]:
    ensemble_expr = ensemble_expr + pl.col(later_col) * model_weight
ensemble_expr = ensemble_expr.alias('pred')

for i in range(100):
    logger.info('no : %s', i)

    org_path = f"../test_predictions/sub_df_orgs/sub_df_org_chunk{i}.parquet"
    sub_path = f"../test_predictions/sub_dfs/sub_df_chunk{i}.parquet"

    # A chunk whose raw-score file already exists is skipped.
    if os.path.exists(org_path):
        continue

    # Write an empty placeholder right away so the check above skips this
    # chunk from now on.
    # NOTE(review): presumably this lets several notebook runs share the
    # chunks between them — confirm.
    pl.DataFrame().write_parquet(org_path)

    try:
        test_df = pl.read_parquet(f"../test_large_chunks/test_df_chunk{i}.parquet")
        test_df = test_df.join(
            articles.select(["article_id", "category"]),
            how="left",
            on="article_id",
        )
        X_test = test_df.select(feature_cols).to_pandas()

        # Hand `category` to LightGBM as a pandas categorical column.
        X_test["category"] = X_test["category"].astype("category")

        logger.info('starting prediction')
        raw_scores = {
            col: model.predict(X_test)
            for col, model in zip(pred_cols, ensemble_models)
        }

        logger.info('organizing results')
        sub_df_org = pl.DataFrame(
            {
                'impression_id': test_df['impression_id'],
                'article_id': test_df['article_id'],
                'user_id': test_df['user_id'],
                **raw_scores,
            }
        )

        # Ensembles scores based on ranking of 8 models.
        blended = sub_df_org.with_columns(rank_exprs).with_columns([ensemble_expr])

        # Ranks prediction based on ensemble scores: the highest blended
        # score gets rank 1, ties are broken by row order ('ordinal').
        # NOTE: `groupby` is kept to match the polars version this notebook
        # was written for; newer polars releases name it `group_by`.
        sub_df = blended.groupby(['impression_id', 'user_id']).agg(
            pl.col('pred').rank(method='ordinal', descending=True).alias('prediction_scores')
        )

        # Drop the placeholder, then store the real results for this chunk.
        os.remove(org_path)
        sub_df_org.write_parquet(org_path)
        sub_df.write_parquet(sub_path)
    except BaseException:
        # Also covers KeyboardInterrupt (interrupting the notebook cell).
        # Without this cleanup the empty placeholder would stay on disk and a
        # re-run would silently skip the unfinished chunk.
        if os.path.exists(org_path):
            os.remove(org_path)
        raise

    sub_df_list.append(sub_df)
    sub_df_list_org.append(sub_df_org)