In [2]:
"""
NPA [1] is a news recommendation model with personalized attention. 
The core of NPA is a news representation model and a user representation model. 
In the news representation model we use a CNN network to learn hidden representations of news articles based on their titles. 
In the user representation model we learn the representations of users based on the representations of their clicked news articles.
In addition, a word-level and a news-level personalized attention are used to capture different informativeness for different users.
"""


"""
StandardScaler in Scikit-Learn removes the mean and scales the data to unit variance,
 which is standard practice for continuous data in many machine learning models.
 
OneHotEncoder is used for categorical data to create a binary column for
 each category and is especially useful for non-ordinal categories.
 
"""
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Read the full feature table from its parquet file.
filePath_with_all_features = r"C:\Users\Michalina\bt4222\project\BT4222-project\xgboost_dataset_ebnerd_demo.parquet"
data = pd.read_parquet(filePath_with_all_features)


# Columns treated as continuous values (they go through the StandardScaler below).
features_cont = [
    'read_time',
    'sentiment_score',
    'user_average_read_time',
    'user_average_scroll_percentage',
    'user_impression_frequency',
    'interaction_score',
]

# Columns treated as categorical values (they go through the OneHotEncoder below).
features_cat = [
    'sentiment_label',
    'user_mood',
    'device_type',
    'is_sso_user',
    'is_subscriber',
    'premium',
    'category_encoded',
    'subcategory_encoded',
    'favorite_category_encoded',
    'least_favorite_category_encoded',
    'user_article_same_mood',
]



# Normalizing continuous features (zero mean, unit variance)
continuous_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Encoding categorical features; categories unseen during fit become all-zero rows
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a single ColumnTransformer.
# sparse_threshold=0 forces a dense numpy array: with the default (0.3) the
# one-hot block can make the combined output a scipy sparse matrix, which
# pd.DataFrame(..., columns=...) below cannot turn into a regular table.
# NOTE(review): the default remainder='drop' discards every column of `data`
# that is not listed in features_cont / features_cat (e.g. any session or
# article identifiers, labels) — confirm downstream code does not need them.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', continuous_transformer, features_cont),
        ('cat', categorical_transformer, features_cat)
    ],
    sparse_threshold=0)

X_processed = preprocessor.fit_transform(data)

# Get feature names after one-hot encoding for categorical data.
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
if hasattr(onehot_encoder, 'get_feature_names_out'):
    # scikit-learn >= 1.0 (get_feature_names was removed in 1.2)
    onehot_feature_names = list(onehot_encoder.get_feature_names_out(features_cat))
else:
    # Older scikit-learn releases only provide the legacy accessor
    onehot_feature_names = list(onehot_encoder.get_feature_names(input_features=features_cat))

# The ColumnTransformer stacks its outputs in the order the transformers are
# listed: the 'num' columns come first, then the 'cat' (one-hot) columns.
# The names must follow that same order, otherwise every column is mislabeled.
feature_names = list(features_cont) + onehot_feature_names
assert X_processed.shape[1] == len(feature_names), (
    "number of feature names does not match the number of transformed columns"
)

# Convert the numpy array returned by ColumnTransformer back to a DataFrame
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)

X_processed_df.to_csv('processed_data_for_dl.csv', index=False)


#embeddings (11777, 17) where one dim of 17 is unique_id of articles
###############################################################################################################################################################



#TODO: ADJUSTING A DATALOADER FOR THE LATEST VERSION OF OUR DATA 



AttributeError: module 'numpy' has no attribute '__version__'

1. Data Structure in Batches
   -------------------------

To ensure that the model processes each session separately and provides separate rankings for each session, you need to carefully manage how data is batched:

Session Batch Processing:
=========================
> Each batch should contain multiple sessions, and each session
contains several articles.
 
Feature Organization:
=====================

For each session in a batch, you'll typically have:
- A matrix of article embeddings and features (one row per article).
- A single vector of dynamic features specific to that session.
- A single vector of static user features (which might be replicated across multiple sessions if they belong to the same user).


2. Model Input Handling
   --------------------
When processing each batch, your model needs to be aware of the boundaries of each session. This can be achieved by:

Padding and Masking: 
====================

> If sessions have different numbers of articles:
 -> we need to pad the sessions to have the same shape and use masking to ignore padded values during attention and subsequent calculations
 
Separate Processing:
====================

> We need to ensure that the model processes each session independently,
 -> especially during the attention mechanism, so that the article rankings are computed within the context of each session only
 



In [None]:
from Helpers.helpers import split_by_session_train_val_test
from NPA.SessionDataset import SessionDataset
from torch.utils.data import DataLoader

# Partition the processed table into train / validation / test subsets
# using the project helper (20% test, 10% validation).
train_data, val_data, test_data = split_by_session_train_val_test(
    X_processed_df,
    test_size=0.2,
    val_size=0.1,
)

# Article embeddings table, read from parquet.
embeddings = pd.read_parquet(
    r'C:\Users\Michalina\bt4222\project\BT4222-project\ebnerd_demo\reduced_embeddings.parquet'
)

# Create the datasets: one SessionDataset per subset, all sharing the same embeddings.
train_dataset, val_dataset, test_dataset = (
    SessionDataset(subset, embeddings)
    for subset in (train_data, val_data, test_data)
)




"""
Since each session can have a varying number of articles, 
DataLoader's collate function needs to handle batches 
where each element (session) could have different shapes or sizes.

DataLoader can effectively handle batches
that contain multiple sessions, each with a varying number of articles


Padding gives every session in a batch the same number of articles.

Masking is a technique used to prevent the model from considering padded data points during training. It involves creating a mask that specifies which elements in the input data are actual data and which are padding.

"""

import torch

def custom_collate_fn(batch):
    """Collate variable-length session samples into one padded batch.

    Args:
        batch: list of dicts, each with the keys 'articles_features',
            'articles_embeddings', 'user_dynamic_features',
            'user_static_features' and 'target'.
            Assumes the article tensors are 2-D (articles, dim) and the
            target is 1-D (articles,) — the pad specs below address the
            last two / last dimension respectively.

    Returns:
        dict with the same five keys, each stacked along a new leading batch
        dimension, plus 'mask': a bool tensor that is True for real articles
        and False for padded positions. Article tensors are padded with
        zeros, targets with -1, up to the longest session in the batch.
    """
    pad = torch.nn.functional.pad

    session_lengths = [len(sample['articles_features']) for sample in batch]
    longest = max(session_lengths)

    features, embeddings = [], []
    dynamic, static = [], []
    targets, masks = [], []

    for sample, length in zip(batch, session_lengths):
        missing = longest - length
        row_padding = (0, 0, 0, missing)  # nothing on the last dim, `missing` rows at the end

        features.append(pad(sample['articles_features'], row_padding))
        embeddings.append(pad(sample['articles_embeddings'], row_padding))

        # Per-session user vectors are passed through untouched.
        dynamic.append(sample['user_dynamic_features'])
        static.append(sample['user_static_features'])

        # Targets are only padded when the session is shorter than the longest one;
        # -1 marks positions that do not correspond to a real article.
        session_target = sample['target']
        if missing > 0:
            session_target = pad(session_target, (0, missing), value=-1)
        targets.append(session_target)

        # True for the first `length` positions, False for the padded tail.
        masks.append(torch.arange(longest) < length)

    return {
        'articles_features': torch.stack(features),
        'articles_embeddings': torch.stack(embeddings),
        'user_dynamic_features': torch.stack(dynamic),
        'user_static_features': torch.stack(static),
        'target': torch.stack(targets),
        'mask': torch.stack(masks)
    }



# Params
batch_size = 1
# Load batches in the main process. With num_workers > 0 the worker processes
# must be able to import the collate function; custom_collate_fn is defined
# inside this notebook, which spawn-based platforms (Windows — as the paths in
# this notebook suggest — and macOS) cannot do, so the loader errors or hangs.
# Move custom_collate_fn into an importable .py module before raising this.
num_workers = 0
shuffle_train = True       # reshuffle training sessions every epoch
shuffle_val_test = False   # keep evaluation order fixed

# Create DataLoader for training data
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle_train,
                          num_workers=num_workers, collate_fn=custom_collate_fn)

# Create DataLoader for validation data
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=shuffle_val_test,
                        num_workers=num_workers, collate_fn=custom_collate_fn)

# Create DataLoader for test data
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=shuffle_val_test,
                         num_workers=num_workers, collate_fn=custom_collate_fn)


Model training

In [None]:
from NPA.PairwiseRankingLoss import PairwiseRankingLoss
from torch import optim, nn
from NPA.NPAModel import NPAModel

# Column groups; in this cell only their lengths are used, to size the model inputs.
article_features = [
    'category_encoded',
    'subcategory_encoded',
    'sentiment_score',
    'sentiment_label',
    'premium',
]

user_dynamic_features_col = ['device_type']

user_static_features_col = [
    'user_mood',
    'is_sso_user',
    'is_subscriber',
    'favorite_category_encoded',
    'least_favorite_category_encoded',
    'user_article_same_mood',
    'read_time',
    'user_average_read_time',
    'user_average_scroll_percentage',
    'user_impression_frequency',
    'interaction_score',
]

# Build the NPA model; the three *_dim arguments are the sizes of the column groups above.
model = NPAModel(
    embedding_dim=16,
    num_filters=32,
    kernel_size=3,
    dynamic_feature_dim=len(user_dynamic_features_col),
    static_feature_dim=len(user_static_features_col),
    additional_article_features_dim=len(article_features),
)

# Adam over all model parameters; the learning rate can be tuned as needed.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Pairwise ranking loss with a margin of 1.0.
criterion = PairwiseRankingLoss(margin=1.0)


num_epochs = 10

# Training loop: one optimisation step per batch, mean batch loss reported per epoch.
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    # Loop names deliberately differ from the notebook-level `data` and
    # `embeddings` DataFrames so that running this cell does not overwrite
    # them (earlier cells can still be re-run afterwards).
    for batch in train_loader:
        batch_embeddings = batch['articles_embeddings']
        batch_features = batch['articles_features']
        batch_dynamic_features = batch['user_dynamic_features']
        batch_static_features = batch['user_static_features']
        targets = batch['target']
        mask = batch['mask']  # True for real articles, False for padding

        optimizer.zero_grad()

        outputs = model(batch_embeddings, batch_features,
                        batch_dynamic_features, batch_static_features)
        # A bare squeeze() would also drop the batch dimension whenever the
        # batch holds a single session (batch_size == 1), so the scores would
        # no longer line up with `targets` / `mask`. Reshape to the target
        # layout instead.
        # NOTE(review): assumes the model emits exactly one score per
        # (session, article) slot — confirm against NPAModel.
        outputs = outputs.reshape(targets.shape)

        loss = criterion(outputs, targets, mask)  # Pass the mask to the loss function
        loss.backward()
        optimizer.step()

        total_loss += loss.item()  # Assuming loss is already a scalar

    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batches.
    avg_loss = total_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}")

    # Validation and testing steps should also apply masks if using padded data
