In [10]:
import pandas as pd
import random

# Load data from Parquet files.
# The paths are listed in the same order as the names they are unpacked into;
# the trailing comments carry the column notes for each file.
articles, train_behaviors, val_behaviors, history = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'Data/articles.parquet',         # article_id (i32), title (str)
        'Data/behaviors_train.parquet',  # user_id, article_ids_clicked, article_ids_inview
        'Data/behaviors_val.parquet',    # same structure
        'Data/history_train.parquet',    # user_id (u32), article_id_fixed (list[i32])
    )
)

# Map article_id to a unique index.
# Index 0 is reserved for the 'NULL' key, which is used for padding; real
# articles are numbered from 1 in the order unique() returns them.
newsindex = {'NULL': 0}
newsindex.update(
    (aid, idx)
    for idx, aid in enumerate(articles['article_id'].unique(), start=1)
)

# Helper function to sample negative examples
def newsample(array, ratio):
    """Draw ``ratio`` items from the list ``array`` without replacement.

    If ``array`` has fewer than ``ratio`` items it is tiled until it is long
    enough, so the result can then contain repeated values. An empty
    ``array`` yields an empty list (callers must handle that case).
    """
    if len(array) == 0:
        return []
    if ratio > len(array):
        return random.sample(array * (ratio // len(array) + 1), ratio)
    return random.sample(array, ratio)


# Sampling configuration
npratio = 4  # Number of negative samples per positive sample
MAX_HISTORY_LEN = 50  # Histories are truncated / zero-padded to this length


def process_behaviors(behaviors, newsindex, history_data=None):
    """Turn impression rows into (candidates, labels, history) training samples.

    One sample is emitted per clicked article that is present in
    ``newsindex``. Each sample holds ``npratio`` negatives plus the positive,
    shuffled, together with a one-hot label list of the same length and the
    user's history padded with 0 to ``MAX_HISTORY_LEN``.

    Args:
        behaviors: DataFrame with ``user_id``, ``article_ids_clicked`` and
            ``article_ids_inview`` columns.
        newsindex: Mapping from article id to integer index; index 0 is the
            padding entry.
        history_data: Optional DataFrame with ``user_id`` and
            ``article_id_fixed`` columns used to look up each user's history.

    Returns:
        A tuple ``(candidates, labels, histories)`` of three equally long
        lists of lists.
    """
    train_candidate = []
    train_label = []
    train_user_his = []

    # Build a user history dictionary from history data if provided.
    # Articles missing from newsindex map to the padding index 0.
    user_history = {}
    if history_data is not None:
        for _, row in history_data.iterrows():
            user_history[row['user_id']] = [
                newsindex.get(aid, 0) for aid in row['article_id_fixed']
            ]

    for _, row in behaviors.iterrows():
        user_id = row['user_id']
        # Build the click set once so the membership tests below are O(1).
        clicked_ids = set(row['article_ids_clicked'])

        # Clicked articles (positive examples)
        clicked = [
            newsindex[aid] for aid in row['article_ids_clicked'] if aid in newsindex
        ]
        # Non-clicked articles (negative examples)
        inview = set(row['article_ids_inview'])
        non_clicked = [
            newsindex[aid]
            for aid in inview
            if aid in newsindex and aid not in clicked_ids
        ]

        # User history
        if user_id in user_history:
            clickids = user_history[user_id][-MAX_HISTORY_LEN:]
        else:
            # NOTE(review): this fallback uses the clicks of the impression
            # being predicted as the history, so the positive candidate is
            # visible in the model input (label leakage) -- confirm intent.
            clickids = clicked[-MAX_HISTORY_LEN:]
        padding = [0] * (MAX_HISTORY_LEN - len(clickids))

        for pos_doc in clicked:
            neg_docs = newsample(non_clicked, npratio)
            # newsample returns [] when the impression has no negatives. Pad
            # with the NULL index so every sample keeps npratio + 1 slots and
            # the labels below stay aligned with the candidates.
            neg_docs = neg_docs + [0] * (npratio - len(neg_docs))
            candidates = neg_docs + [pos_doc]
            labels = [0] * len(neg_docs) + [1]

            # Shuffle candidates and labels with the same permutation
            shuffle_indices = list(range(len(candidates)))
            random.shuffle(shuffle_indices)
            shuffled_candidates = [candidates[i] for i in shuffle_indices]
            shuffled_labels = [labels[i] for i in shuffle_indices]

            # Append training data (a fresh history list per sample)
            train_candidate.append(shuffled_candidates)
            train_label.append(shuffled_labels)
            train_user_his.append(clickids + padding)

    return train_candidate, train_label, train_user_his

# Process train behaviors
train_candidate, train_label, train_user_his = process_behaviors(train_behaviors, newsindex, history)

# Process validation behaviors
# Histories are looked up in `history` first, the same way process_behaviors
# builds them for training, so train and validation inputs are constructed
# consistently and known users no longer get the impression's own clicks
# (i.e. the labels) as their history.
# NOTE(review): `history` is loaded from the training history file; if a
# validation history file exists it is probably the better source -- confirm.
val_history_by_user = {
    uid: [newsindex.get(aid, 0) for aid in aids]
    for uid, aids in zip(history['user_id'], history['article_id_fixed'])
}

val_candidate, val_user_his, val_labels, val_index = [], [], [], []
for _, row in val_behaviors.iterrows():
    user_id = row['user_id']
    # Build the click set once so the per-candidate label test is O(1).
    clicked_ids = set(row['article_ids_clicked'])
    clicked = [newsindex[aid] for aid in row['article_ids_clicked'] if aid in newsindex]

    if user_id in val_history_by_user:
        user_history = val_history_by_user[user_id][-MAX_HISTORY_LEN:]
    else:
        # NOTE(review): same fallback as process_behaviors; it still exposes
        # the clicked articles to the model for users without a history.
        user_history = clicked[-MAX_HISTORY_LEN:]
    user_history = user_history + [0] * (MAX_HISTORY_LEN - len(user_history))

    # Each impression contributes one flat entry per known in-view article;
    # [start_idx, end_idx) records which entries belong to this impression.
    start_idx = len(val_candidate)

    for aid in row['article_ids_inview']:
        if aid in newsindex:
            val_candidate.append(newsindex[aid])
            val_user_his.append(user_history)
            val_labels.append(1 if aid in clicked_ids else 0)

    end_idx = len(val_candidate)
    val_index.append([start_idx, end_idx])

# Print summary
print("Training Data:")
print("Number of training candidates:", len(train_candidate))
print("Number of training labels:", len(train_label))
print("Number of user histories:", len(train_user_his))

print("\nValidation Data:")
print("Number of validation candidates:", len(val_candidate))
print("Number of user histories:", len(val_user_his))
print("Test index ranges:", val_index)


Training Data:
Number of training candidates: 24888
Number of training labels: 24888
Number of user histories: 24888

Validation Data:
Number of validation candidates: 304915
Number of user histories: 304915
Test index ranges: [[0, 6], [6, 13], [13, 23], [23, 31], [31, 38], [38, 46], [46, 54], [54, 59], [59, 64], [64, 69], [69, 75], [75, 80], [80, 93], [93, 123], [123, 140], [140, 173], [173, 182], [182, 188], [188, 193], [193, 198], [198, 210], [210, 220], [220, 225], [225, 241], [241, 256], [256, 261], [261, 275], [275, 283], [283, 301], [301, 306], [306, 314], [314, 326], [326, 332], [332, 338], [338, 348], [348, 354], [354, 360], [360, 365], [365, 379], [379, 385], [385, 393], [393, 400], [400, 408], [408, 420], [420, 425], [425, 438], [438, 443], [443, 450], [450, 455], [455, 461], [461, 467], [467, 475], [475, 487], [487, 498], [498, 510], [510, 516], [516, 523], [523, 530], [530, 535], [535, 545], [545, 550], [550, 557], [557, 562], [562, 571], [571, 577], [577, 597], [597, 602]