Tokenize

In [1]:
import pandas as pd
import torch
from collections import defaultdict
from nltk.tokenize import word_tokenize
from torch.nn.utils.rnn import pad_sequence

# Define the function that builds padded, tokenized title tensors for each user

def generate_user_tokenized_titles(history_df: pd.DataFrame, articles_df: pd.DataFrame) -> list:
    """
    Build, for each user, a padded tensor holding the token ids of the article titles they read.

    Titles are lower-cased and split with ``word_tokenize``. Every distinct token
    receives an integer id in order of first appearance, starting at 2; ids 0 and 1
    are reserved for ``<PAD>`` and ``<UNK>``. History rows whose article id does not
    appear in ``articles_df`` are skipped.

    Args:
        history_df (pd.DataFrame): DataFrame containing user reading history with columns:
                                   'user_id' and 'article_id_fixed'.
        articles_df (pd.DataFrame): DataFrame containing article information with columns:
                                    'article_id' and 'title'. A title that is not a
                                    string (e.g. None/NaN) is treated as an empty title.
                                    If an 'article_id' occurs more than once, the last
                                    row wins.

    Returns:
        list: One ``torch.long`` tensor per user, ordered by the user's first
              appearance in ``history_df``. Each tensor has one row per matched
              history entry and is right-padded with the ``<PAD>`` id (0) up to
              the longest title of that user. Users without any matching
              article are not included.
    """
    # Ids 0 and 1 are reserved so that real tokens start at 2.
    vocabulary = {"<PAD>": 0, "<UNK>": 1}

    # Steps 1 + 2: tokenize each title once, growing the vocabulary on the fly.
    # Assigning ids while encoding gives the same ids as a separate vocabulary
    # pass, because tokens are still visited in the same order.
    tokenized_titles = {}
    for article_id, title in zip(articles_df['article_id'], articles_df['title']):
        # Missing titles come through as None/NaN; encode them as an empty sequence
        # instead of failing on ``.lower()``.
        tokens = word_tokenize(title.lower()) if isinstance(title, str) else []
        token_ids = [vocabulary.setdefault(token, len(vocabulary)) for token in tokens]
        # Explicit dtype: without it an empty id list would become a float32 tensor
        # and no longer match the integer tensors of the other titles.
        tokenized_titles[article_id] = torch.tensor(token_ids, dtype=torch.long)

    # Step 3: collect the encoded titles per user, keeping history order.
    user_articles = defaultdict(list)
    for user_id, article_id in zip(history_df['user_id'], history_df['article_id_fixed']):
        encoded_title = tokenized_titles.get(article_id)
        if encoded_title is not None:
            user_articles[user_id].append(encoded_title)

    # Step 4: pad each user's titles to a common length and return them as a list.
    return [
        pad_sequence(encoded_titles, batch_first=True, padding_value=vocabulary["<PAD>"])
        for encoded_titles in user_articles.values()
    ]

In [3]:
# Smoke test on a small hand-made dataset

# Reading history, one (user_id, article_id_fixed) record per row:
# user 1 read two articles, users 2 and 3 read one each.
history_train = pd.DataFrame(
    [(1, 101), (1, 102), (2, 103), (3, 104)],
    columns=["user_id", "article_id_fixed"],
)

# Article catalogue, one (article_id, title) record per row.
articles = pd.DataFrame(
    [
        (101, "I want to sleep"),
        (102, "Deep Learning Basics"),
        (103, "AI is the future"),
        (104, "Understanding the Transformers"),
    ],
    columns=["article_id", "title"],
)

# Run the tokenizer over the toy data and show the per-user padded tensors
user_tokenized_titles_list = generate_user_tokenized_titles(history_train, articles)
print(user_tokenized_titles_list)

[tensor([[2, 3, 4, 5],
        [6, 7, 8, 0]]), tensor([[ 9, 10, 11, 12]]), tensor([[13, 11, 14]])]
