In [None]:
"""
NPA [1] is a news recommendation model with personalized attention. 
The core of NPA is a news representation model and a user representation model. 
In the news representation model, we use a CNN to learn hidden representations of news articles based on their titles.
In the user representation model we learn the representations of users based on the representations of their clicked news articles.
In addition, a word-level and a news-level personalized attention are used to capture different informativeness for different users.
"""


"""
StandardScaler in Scikit-Learn removes the mean and scales the data to unit variance,
 which is standard practice for continuous data in many machine learning models.
 
OneHotEncoder is used for categorical data to create a binary column for
 each category and is especially useful for non-ordinal categories.
 
"""
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Path of the CSV holding all data without embeddings.
# TODO: ".csv" is a placeholder; point this at the real file before running.
filepth_all_data_wthout_embeddigs = ".csv"
data = pd.read_csv(filepth_all_data_wthout_embeddigs)

# Continuous columns: standardized with StandardScaler.
features_cont = ['read_time',
                 'sentiment_score', 'user_average_read_time',
                 'user_average_scroll_percentage', 'user_impression_frequency',
                 'interaction_score']

# Categorical columns: one-hot encoded.
features_cat = ['sentiment_label', 'user_mood', 'device_type',
                'is_sso_user', 'is_subscriber', 'premium',
                'category_encoded', 'subcategory_encoded',
                'favorite_category_encoded', 'least_favorite_category_encoded',
                'user_article_same_mood']

# Normalizing continuous features
continuous_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Encoding categorical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a single ColumnTransformer.
# sparse_threshold=0 forces a dense numpy array: the one-hot step produces
# sparse output, and a sparse result cannot be wrapped in a DataFrame with
# named columns below.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', continuous_transformer, features_cont),
        ('cat', categorical_transformer, features_cat)
    ],
    sparse_threshold=0)

X_processed = preprocessor.fit_transform(data)

# Get feature names after one-hot encoding for categorical data.
# get_feature_names_out is the current scikit-learn API; fall back to the
# legacy get_feature_names only on old versions that lack it.
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
if hasattr(onehot_encoder, 'get_feature_names_out'):
    onehot_feature_names = list(onehot_encoder.get_feature_names_out(features_cat))
else:
    onehot_feature_names = list(onehot_encoder.get_feature_names(input_features=features_cat))

# The ColumnTransformer emits columns in the order its transformers are listed
# ('num' first, then 'cat'), so the names must follow that same order.
feature_names = features_cont + onehot_feature_names

# Guard against silently mislabeled columns.
assert X_processed.shape[1] == len(feature_names), (
    f"expected {len(feature_names)} columns, got {X_processed.shape[1]}"
)

# Convert the numpy array returned by ColumnTransformer back to a DataFrame
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)

X_processed_df.to_csv('processed_data_for_dl.csv', index=False)


# Embeddings have shape (11777, 17); one of the 17 columns is the article's unique_id.
###############################################################################################################################################################



# TODO: adjust the DataLoader for the latest version of our data.



Model training

In [None]:
from NPAModel import NPAModel
import torch 
import torch.nn as nn
import torch.optim as optim



def train_model(model, data_loader, optimizer, criterion, num_epochs):
    """Train ``model`` for ``num_epochs`` passes over ``data_loader``.

    Args:
        model: Module called as ``model(articles, user_features, article_stats)``.
        data_loader: Iterable yielding
            ``(articles, user_features, article_stats, clicks)`` tensor batches.
        optimizer: Optimizer that updates the model's parameters.
        criterion: Loss called as ``criterion(predictions, clicks)``.
        num_epochs: Number of passes over ``data_loader``.

    Returns:
        None. The mean batch loss is printed after every epoch (``nan`` if the
        loader yielded no batches).
    """
    model.train()  # Set the model to training mode

    # Batches have to live on the same device as the model's weights; otherwise
    # a model that was moved to the GPU fails on CPU tensors from the loader.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        for batch in data_loader:
            if device is not None:
                batch = [tensor.to(device) for tensor in batch]
            articles, user_features, article_stats, clicks = batch

            # Forward pass
            outputs = model(articles, user_features, article_stats)
            predictions = outputs.squeeze()
            # squeeze() collapses a batch of one sample to a 0-d tensor, which
            # no longer matches the 1-d ``clicks``; restore the batch axis.
            if predictions.dim() == 0:
                predictions = predictions.unsqueeze(0)
            loss = criterion(predictions, clicks)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Print average loss for the epoch (avoid dividing by zero on an empty loader)
        average_loss = total_loss / num_batches if num_batches else float('nan')
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss}')

In [None]:
from metrics import calculate_ndcg

def predict_model(model, data_loader):
    """Run ``model`` over ``data_loader`` and collect scores and labels.

    Args:
        model: Module called as ``model(articles, user_features, article_stats)``.
        data_loader: Iterable yielding
            ``(articles, user_features, article_stats, actual_clicks)`` tensor batches.

    Returns:
        ``(predictions, actuals)``: two lists filled batch by batch with the
        squeezed model outputs and the click labels, as numpy values.
    """
    model.eval()

    # Inputs must be on the same device as the model's weights.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    predictions = []
    actuals = []
    with torch.no_grad():
        for articles, user_features, article_stats, actual_clicks in data_loader:
            if device is not None:
                articles = articles.to(device)
                user_features = user_features.to(device)
                article_stats = article_stats.to(device)
            outputs = model(articles, user_features, article_stats)
            scores = outputs.squeeze()
            # A batch of one sample squeezes to a 0-d tensor, which cannot be
            # iterated by list.extend; restore the batch axis.
            if scores.dim() == 0:
                scores = scores.unsqueeze(0)
            predictions.extend(scores.cpu().numpy())
            actuals.extend(actual_clicks.cpu().numpy())
    return predictions, actuals



def evaluate_model(model, data_loader):
    """Score ``data_loader`` with ``model`` and return the NDCG@10 value.

    Args:
        model: Module called as ``model(articles, user_features, article_stats)``.
        data_loader: Iterable yielding
            ``(articles, user_features, article_stats, clicks)`` tensor batches.

    Returns:
        Whatever ``calculate_ndcg`` returns for a DataFrame with columns
        ``predicted_score`` and ``clicked`` and ``k=10``.
    """
    model.eval()  # Set the model to evaluation mode

    # Inputs must be on the same device as the model's weights.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    predictions, actuals = [], []
    with torch.no_grad():
        for articles, user_features, article_stats, clicks in data_loader:
            if device is not None:
                articles = articles.to(device)
                user_features = user_features.to(device)
                article_stats = article_stats.to(device)
            outputs = model(articles, user_features, article_stats)
            scores = outputs.squeeze()
            # A batch of one sample squeezes to a 0-d tensor, which cannot be
            # iterated by list.extend; restore the batch axis.
            if scores.dim() == 0:
                scores = scores.unsqueeze(0)
            predictions.extend(scores.cpu().numpy())
            actuals.extend(clicks.cpu().numpy())

    results = pd.DataFrame({
        'predicted_score': predictions,
        'clicked': actuals
    })
    # NOTE(review): NDCG is computed over all rows at once, not per user or
    # impression - confirm this matches what calculate_ndcg expects.
    ndcg_score = calculate_ndcg(results, 'predicted_score', 'clicked', k=10)
    return ndcg_score



In [None]:
def prepare_data(articles_embeddings, user_features, article_stats, clicks):
    """Convert the four raw inputs into float32 tensors.

    Each argument is passed through ``torch.tensor(..., dtype=torch.float32)``.

    Returns:
        A 4-tuple ``(articles_embeddings, user_features, article_stats, clicks)``
        of tensors, in the same order as the arguments.
    """
    raw_inputs = (articles_embeddings, user_features, article_stats, clicks)
    return tuple(torch.tensor(values, dtype=torch.float32) for values in raw_inputs)

In [None]:
from torch.utils.data import DataLoader, TensorDataset, random_split

# NOTE: articles_embeddings, user_features, article_stats and clicks are assumed
# to be available from the latest dataset; nothing in this notebook defines them yet.
articles, users, stats, clicks = prepare_data(articles_embeddings, user_features, article_stats, clicks)
# Bundle the four tensors so each dataset item is (article, user, stats, click)
full_dataset = TensorDataset(articles, users, stats, clicks)

# Split the dataset into training (80%) and test (20%) sets.
# A seeded generator keeps the partition identical across kernel restarts,
# so reported test metrics are comparable between runs.
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, test_dataset = random_split(
    full_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

# Shuffle only the training batches; keep the test order fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)


In [None]:
# Build the NPA network with the hyper-parameters used for this experiment.
model = NPAModel(
    num_words=10000,
    embedding_dim=100,
    num_filters=128,
    kernel_size=3,
    num_user_features=300,
    num_article_stats=10,
)
# Run on the GPU whenever CUDA is present; otherwise the model stays on the CPU.
model = model.cuda() if torch.cuda.is_available() else model

# Binary cross-entropy loss with an Adam optimizer (learning rate 0.001).
# NOTE(review): assumes NPAModel already outputs probabilities - TODO confirm.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Fit for ten epochs on the training batches.
train_model(model, train_loader, optimizer, criterion, num_epochs=10)

# Report ranking quality (NDCG) on the held-out test batches.
ndcg_score = evaluate_model(model, test_loader)
print(f"Test NDCG Score: {ndcg_score}")
