In [1]:
import pandas as pd
import random
from nltk.tokenize import word_tokenize

MAX_TITLE_LEN = 20  # Maximum number of title tokens considered per article

# Load data from Parquet files
articles = pd.read_parquet('Data/articles.parquet')  # article_id (i32), title (str)
train_behaviors = pd.read_parquet('Data/behaviors_train.parquet')  # user_id, article_ids_clicked, article_ids_inview
val_behaviors = pd.read_parquet('Data/behaviors_val.parquet')      # same structure
history_train = pd.read_parquet('Data/history_train.parquet')            # user_id (u32), article_id_fixed (list[i32])
history_val = pd.read_parquet('Data/history_val.parquet')

# Build two lookups keyed by article_id:
#   news      -> lower-cased, tokenized title
#   newsindex -> dense integer index (0 is reserved for padding via the 'NULL' key)
news = {}
newsindex = {'NULL': 0}  # Add a NULL key for padding
# Only two columns are needed, so iterate them directly instead of paying for a
# per-row Series through DataFrame.iterrows(). .tolist() yields plain Python
# values, so the dictionary keys are ordinary ints.
for article_id, title in zip(articles['article_id'].tolist(), articles['title'].tolist()):
    news[article_id] = word_tokenize(title.lower())
    newsindex[article_id] = len(newsindex)

# Summary
print("Number of unique articles:", len(news))
print("Example article ID and tokenized title:")
for k, v in list(news.items())[:3]:  # Print first 3 articles
    print(f"Article ID: {k}, Tokenized Title: {v}")

print("\nTotal articles indexed in newsindex:", len(newsindex), "\n")


# Helper function to sample negative examples
def newsample(array, ratio):
    """Draw ``ratio`` items at random from ``array``.

    An empty ``array`` yields an empty list. When more items are requested
    than ``array`` holds, the list is repeated often enough to cover the
    request before sampling, so the result may contain repeated values.
    Otherwise the draw is a plain sample without replacement.
    """
    size = len(array)
    if size == 0:
        return []
    # Replicate the pool only when it is too small to satisfy the request.
    pool = array if ratio <= size else array * (ratio // size + 1)
    return random.sample(pool, ratio)

# Sampling configuration
RANDOM_SEED = 42  # Fixed seed so negative sampling and shuffling are reproducible across runs
random.seed(RANDOM_SEED)
npratio = 4  # Number of negative samples per positive sample
MAX_HISTORY_LEN = 50  # User histories are cut to their last MAX_HISTORY_LEN entries and zero-padded to this length

# Function to process behaviors data
def process_behaviors(behaviors, newsindex, history_data=None):
    """Turn impression logs into negative-sampled examples.

    One example is produced per clicked article of every impression: the
    clicked article together with ``npratio`` negatives drawn from the
    non-clicked in-view articles, in shuffled order, plus a parallel 0/1 label
    list and the user's click history padded with 0 to ``MAX_HISTORY_LEN``.

    Args:
        behaviors: DataFrame with the columns ``user_id``,
            ``article_ids_clicked`` and ``article_ids_inview``.
        newsindex: Mapping from article id to integer index; index 0 is used
            as the padding value.
        history_data: Optional DataFrame with the columns ``user_id`` and
            ``article_id_fixed`` holding each user's earlier clicks.

    Returns:
        Three parallel lists ``(candidates, labels, histories)``. Every
        candidate and label entry has ``npratio + 1`` elements and every
        history entry has ``MAX_HISTORY_LEN`` elements.
    """
    train_candidate = []
    train_label = []
    train_user_his = []

    # Build a user history dictionary from history data if provided.
    # Articles missing from newsindex are mapped to the padding index 0.
    user_history = {}
    if history_data is not None:
        history_rows = zip(history_data['user_id'].tolist(), history_data['article_id_fixed'])
        for history_user, article_ids in history_rows:
            user_history[history_user] = [newsindex.get(aid, 0) for aid in article_ids]

    impression_rows = zip(
        behaviors['user_id'].tolist(),
        behaviors['article_ids_clicked'],
        behaviors['article_ids_inview'],
    )
    for user_id, clicked_ids, inview_ids in impression_rows:
        # A set makes the "was it clicked?" test O(1) instead of a scan per in-view article.
        clicked_set = set(clicked_ids)

        # Clicked articles (positive examples); ids unknown to newsindex are dropped.
        clicked = [newsindex[aid] for aid in clicked_ids if aid in newsindex]
        # Non-clicked articles (negative examples), de-duplicated through the set.
        non_clicked = [
            newsindex[aid]
            for aid in set(inview_ids)
            if aid in newsindex and aid not in clicked_set
        ]

        # User history
        if user_id in user_history:
            clickids = user_history[user_id][-MAX_HISTORY_LEN:]
        else:
            # NOTE(review): without a stored history the impression's own clicks
            # are used as history, so the positive candidate also appears in the
            # model input - confirm this leakage is acceptable.
            clickids = clicked[-MAX_HISTORY_LEN:]
        padded_history = clickids + [0] * (MAX_HISTORY_LEN - len(clickids))

        for pos_doc in clicked:
            neg_docs = newsample(non_clicked, npratio)
            # newsample returns [] when there is nothing to sample from. Pad with
            # the padding index so candidates and labels always have the same
            # length; otherwise the lone positive would be paired with a 0 label.
            neg_docs = neg_docs + [0] * (npratio - len(neg_docs))

            # Shuffle candidates and labels together so they stay aligned.
            examples = [(doc, 0) for doc in neg_docs] + [(pos_doc, 1)]
            random.shuffle(examples)

            # Append training data
            train_candidate.append([doc for doc, _ in examples])
            train_label.append([label for _, label in examples])
            train_user_his.append(list(padded_history))

    return train_candidate, train_label, train_user_his

# Process train and validation behaviors with the same sampling procedure
train_candidate, train_label, train_user_his = process_behaviors(train_behaviors, newsindex, history_train)
val_candidate, val_label, val_user_his = process_behaviors(val_behaviors, newsindex, history_val)

# Print summary
print("Training Data:")
print("Number of training candidates:", len(train_candidate))
print("Number of training labels:", len(train_label))
print("Number of user histories:", len(train_user_his))

print("\nValidation Data:")
print("Number of validation candidates:", len(val_candidate))
print("Number of validation labels:", len(val_label))
print("Number of user histories:", len(val_user_his))


Number of unique articles: 11777
Example article ID and tokenized title:
Article ID: 3037230, Tokenized Title: ['ishockey-spiller', ':', 'jeg', 'troede', 'jeg', 'skulle', 'dø']
Article ID: 3044020, Tokenized Title: ['prins', 'harry', 'tvunget', 'til', 'dna-test']
Article ID: 3057622, Tokenized Title: ['rådden', 'kørsel', 'på', 'blå', 'plader']

Total articles indexed in newsindex: 11778 

Training Data:
Number of training candidates: 24888
Number of training labels: 24888
Number of user histories: 24888

Validation Data:
Number of validation candidates: 25505
Number of validation labels: 25505
Number of user histories: 25505


In [2]:
# Write the training examples: one row per example, one column per parallel list
train_columns = {
    'candidate': train_candidate,
    'label': train_label,
    'user_his': train_user_his,
}
train_df = pd.DataFrame(train_columns)
train_df.to_parquet(path='Data/train_data.parquet', index=False)

# Write the validation examples in the same layout
val_columns = {
    'candidate': val_candidate,
    'label': val_label,
    'user_his': val_user_his,
}
val_df = pd.DataFrame(val_columns)
val_df.to_parquet(path='Data/val_data.parquet', index=False)

# Confirm what was written
for message in (
    "Train and Validation data saved as Parquet files:",
    "  - train_data.parquet",
    "  - val_data.parquet",
):
    print(message)


Train and Validation data saved as Parquet files:
  - train_data.parquet
  - val_data.parquet


In [2]:
# PyTorch imports used for encoding the article titles
import torch
import torch.nn
from torch.nn.utils.rnn import pad_sequence

BATCH_SIZE = 10000  # Not needed by the fixed-width padding below; kept in case other cells reference it

# Build vocabulary from tokenized titles.
# Only the first MAX_TITLE_LEN tokens of a title can receive a vocabulary id.
vocab = {'<PAD>': 0}  # Start with a padding token
for tokens in news.values():
    for word in tokens[:MAX_TITLE_LEN]:
        # len(vocab) is the next free id; setdefault leaves known words untouched.
        vocab.setdefault(word, len(vocab))

print("Vocabulary built. Size: ", len(vocab))

# Convert titles to token indices.
# Titles are truncated exactly like in the vocabulary pass, so every encoded
# word has an id and no out-of-vocabulary word is silently turned into padding.
news_tensor = {}
for article_id, tokens in news.items():
    news_tensor[article_id] = torch.tensor(
        [vocab[word] for word in tokens[:MAX_TITLE_LEN]], dtype=torch.long
    )

print("Tokenized titles converted to tensors.")

news_tensors_list = list(news_tensor.values())

# Pad every title to the same fixed width. Padding separate batches to their own
# longest title produces chunks of different widths, which cannot be
# concatenated along dim 0.
# NOTE(review): row i holds the i-th article of `news`. If lookups use an index
# that reserves 0 for padding, row i corresponds to index i + 1 - confirm how
# downstream code addresses this tensor.
news_tensors_padded = torch.full(
    (len(news_tensors_list), MAX_TITLE_LEN), vocab['<PAD>'], dtype=torch.long
)
for row, title_ids in enumerate(news_tensors_list):
    news_tensors_padded[row, :title_ids.numel()] = title_ids

# Example Output
print("Shape of padded news tensors:", news_tensors_padded.shape)

Rasmus'
mor
Polles
mor
Vocabulary built. Size:  16003


: 