In [3]:
from google.colab import drive
drive.mount('/content/drive')


import os
os.chdir('/content/drive/MyDrive/BT4222/')
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [4]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
import pyarrow.parquet as pq
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import string
from sklearn.model_selection import train_test_split
from collections import Counter
from gensim.models.fasttext import load_facebook_vectors  # For using embeddings only


# Fetch the NLTK resources used by the text-cleaning helpers below.
# 'averaged_perceptron_tagger' provides the POS tags that drive lemmatization.
for nltk_resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(nltk_resource)

# Text-processing singletons shared by the helpers defined later in the file.
stop_words = set(stopwords.words('danish'))
lemmatizer = WordNetLemmatizer()
# Pre-trained fastText vectors (file name suggests Danish, 300 dimensions).
# NOTE(review): the name `model` is re-bound to the NPA network further down.
model = load_facebook_vectors('cc.da.300.bin')

dataset_type = "ebnerd_demo"
base_path = os.path.join(".", dataset_type)

# Only the article id and its title are needed from the article table.
text_column = 'title'
articles_columns = ["article_id", "title"]
all_articles = pd.read_parquet('articles.parquet')[articles_columns]

# Click histories for both splits, with 'article_id_fixed' exposed as 'article_id'
# before the two frames are stacked into a single table.
cols_hist = ['user_id', 'article_id_fixed']
history_column_names = {'article_id_fixed': 'article_id'}
history_articles_train = pd.read_parquet('train/history.parquet')[cols_hist].rename(columns=history_column_names)
history_articles_val = pd.read_parquet('validation/history.parquet')[cols_hist].rename(columns=history_column_names)
history_articles = pd.concat([history_articles_train, history_articles_val], ignore_index=True)


def preprocess_behaviors(behaviors_ds):
    """Select and numerically encode the behaviour columns used downstream.

    Args:
        behaviors_ds: DataFrame containing at least the columns
            'impression_id', 'article_id', 'user_id', 'is_sso_user',
            'is_subscriber', 'age', 'gender' and 'device_type'.

    Returns:
        A new DataFrame (the input is left untouched) in which
        * missing 'article_id' / 'age' are replaced by 0 and down-cast to the
          smallest integer dtype that fits,
        * 'device_type' and 'gender' are replaced by one-hot columns named
          'device_type_<value>' / 'gender_<value>' (missing gender becomes -1
          before encoding),
        * 'is_sso_user' and 'is_subscriber' are converted to integers.
    """
    selected_columns = ['impression_id', 'article_id', 'user_id', 'is_sso_user', 'is_subscriber', 'age', 'gender', 'device_type']
    # Explicit copy: the assignments below must never be attempted on a view of
    # the caller's frame (avoids pandas' SettingWithCopy hazard).
    filtered_data = behaviors_ds[selected_columns].copy()

    # Convert columns to numerical values
    filtered_data['article_id'] = pd.to_numeric(filtered_data['article_id'].fillna(0), downcast='integer')
    filtered_data['age'] = pd.to_numeric(filtered_data['age'].fillna(0), downcast='integer')
    filtered_data['gender'] = pd.to_numeric(filtered_data['gender'].fillna(-1), downcast='integer')

    # One-hot encode the categorical columns; the encoded columns are appended
    # in this order and the source column is dropped.
    for categorical_column in ('device_type', 'gender'):
        one_hot = pd.get_dummies(filtered_data[categorical_column], prefix=categorical_column)
        filtered_data = pd.concat([filtered_data.drop(columns=[categorical_column]), one_hot], axis=1)

    # Convert boolean columns to integers
    filtered_data['is_sso_user'] = filtered_data['is_sso_user'].astype(int)
    filtered_data['is_subscriber'] = filtered_data['is_subscriber'].astype(int)

    return filtered_data


# Define an Embedding Model in PyTorch
class ImpressionEmbeddingModel(nn.Module):
    """Thin wrapper around a single ``nn.Embedding`` lookup table."""

    def __init__(self, input_dim, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)

    def forward(self, x):
        """Return the embedding rows selected by the integer index tensor ``x``."""
        return self.embedding(x)


# Custom Dataset for impression IDs
class ImpressionDataset(Dataset):
    """Expose a sequence of integer indices as ``torch.long`` scalar tensors."""

    def __init__(self, impression_ids):
        self.impression_ids = impression_ids

    def __len__(self):
        return len(self.impression_ids)

    def __getitem__(self, idx):
        return torch.tensor(self.impression_ids[idx], dtype=torch.long)


# Function to create embeddings
def create_impression_embeddings_df(filtered_data, embedding_dim=8, batch_size=512):
    """Attach an embedding vector to every unique (impression_id, user_id) pair.

    NOTE: the vectors are read from a freshly initialised ``nn.Embedding`` that
    is never trained here, so they are random and differ between calls.

    Args:
        filtered_data: DataFrame with 'impression_id' and 'user_id' columns.
            Side effect: a 'mapped_impression_id' column is written to it,
            holding the row position of each row's pair in the returned frame.
        embedding_dim: Length of each embedding vector.
        batch_size: Number of indices looked up per forward pass.

    Returns:
        DataFrame with columns 'impression_id', 'user_id' and
        'user_embeddings' (a list of ``embedding_dim`` floats), one row per
        unique pair in order of first appearance.
    """
    # Step 1: one row per unique (impression_id, user_id) pair.
    unique_impressions = filtered_data[['impression_id', 'user_id']].drop_duplicates()
    unique_impressions.reset_index(drop=True, inplace=True)
    num_pairs = len(unique_impressions)

    # Step 2: map every input row to the position of its pair.  Keying on the
    # pair (not on impression_id alone) keeps the indices contiguous even when
    # the same impression_id occurs for several users; keying on impression_id
    # only would collapse those rows and leave indices outside the table.
    pair_lookup = unique_impressions.assign(mapped_impression_id=range(num_pairs))
    filtered_data['mapped_impression_id'] = (
        filtered_data[['impression_id', 'user_id']]
        .merge(pair_lookup, on=['impression_id', 'user_id'], how='left')['mapped_impression_id']
        .to_numpy()
    )

    if num_pairs == 0:
        # Nothing to embed: return an empty frame with the expected columns.
        return pd.DataFrame({
            'impression_id': unique_impressions['impression_id'],
            'user_id': unique_impressions['user_id'],
            'user_embeddings': [],
        })

    # Step 3: one table row per pair, looked up in batches without gradients.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    embedding_model = ImpressionEmbeddingModel(input_dim=num_pairs, embedding_dim=embedding_dim).to(device)
    embedding_model.eval()  # Set model to evaluation mode

    impression_loader = DataLoader(
        ImpressionDataset(impression_ids=list(range(num_pairs))),
        batch_size=batch_size,
        shuffle=False,  # keep the lookups aligned with `unique_impressions`
    )

    embeddings_list = []
    with torch.no_grad():
        for batch in impression_loader:
            # Move to CPU before converting to plain Python lists.
            embeddings_list.extend(embedding_model(batch.to(device)).cpu().tolist())

    # Create a DataFrame with the original ids and their embeddings.
    embeddings_df = pd.DataFrame({
        'impression_id': unique_impressions['impression_id'],
        'user_id': unique_impressions['user_id'],
        'user_embeddings': embeddings_list,
    })

    return embeddings_df


def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects adjective, noun, verb or adverb.  Any other tag
    falls back to noun.
    """
    _, pos_tag = nltk.pos_tag([word])[0]
    wordnet_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return wordnet_by_initial.get(pos_tag[0].upper(), wordnet.NOUN)


def clean_and_tokenize(title):
    """Turn a raw title string into a list of lemmatized, non-stop-word tokens.

    Steps: strip ASCII punctuation, lower-case, tokenize with ``word_tokenize``,
    lemmatize each token using its POS tag, then drop tokens found in the
    module-level ``stop_words`` set.
    """
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    tokens = word_tokenize(title.translate(punctuation_stripper).lower())

    kept_lemmas = []
    for token in tokens:
        lemma = lemmatizer.lemmatize(token, get_wordnet_pos(token))
        if lemma not in stop_words:
            kept_lemmas.append(lemma)
    return kept_lemmas



from collections import Counter
def build_vocabs(titles):
    """Map every distinct element of the given titles to an integer id.

    Each title is iterated element by element (words for a token list,
    characters for a plain string).  Ids start at 1 in order of first
    appearance; id 0 is reserved for the '<UNK>' entry.
    """
    frequencies = Counter()
    for title in titles:
        frequencies.update(title)

    # Counter preserves insertion order, so zipping with 1..n numbers the
    # entries in order of first appearance.
    vocabulary = dict(zip(frequencies, range(1, len(frequencies) + 1)))
    vocabulary['<UNK>'] = 0
    return vocabulary






def get_embedding_sequence(title, model):
    """Stack the embedding of every word in ``title`` into one 2-D array.

    Args:
        title: Iterable of words (assumed to be an already tokenized title).
        model: Object supporting ``word in model``, ``model[word]`` and a
            ``vector_size`` attribute.

    Returns:
        Array of shape (number of words, ``model.vector_size``); words missing
        from ``model`` become zero rows.  A title without words yields a
        single zero row of shape (1, ``model.vector_size``).
    """
    word_vectors = [
        model[token] if token in model else np.zeros(model.vector_size)  # zero row for unknown words
        for token in title
    ]

    if word_vectors:
        embedding_sequence = np.array(word_vectors)
    else:
        print("Entirely empty sequence, creating a fully empty sequence")
        # No words at all: fall back to one zero vector so the result stays 2-D.
        embedding_sequence = np.zeros((1, model.vector_size))

    print("Embedding sequence shape:", embedding_sequence.shape)
    return embedding_sequence




# Tokenize the titles first: build_vocabs iterates each title element by element,
# so feeding it the raw title strings would produce a vocabulary of characters
# instead of words.
all_articles[text_column] = all_articles[text_column].apply(clean_and_tokenize)
vocabulary = build_vocabs(all_articles[text_column])
# Turn every tokenized title into a (num_words, vector_size) embedding sequence
all_articles['title_embedding_sequence'] = all_articles[text_column].apply(lambda x: get_embedding_sequence(x, model))
# article_id -> embedding sequence lookup used by the dataset class
articles_embeddings_dict = dict(zip(all_articles['article_id'], all_articles['title_embedding_sequence']))




# Candidate-level table; the columns used below are 'impression_id', 'article_id',
# 'clicked' and 'user_id'.
filePath_with_all_features = r"/content/drive/MyDrive/BT4222/xgboost_dataset_ebnerd_demo.parquet"
data = pd.read_parquet(filePath_with_all_features)
print(data.columns.values)


# Load the original behavior data
original_behaviors_train = pd.read_parquet(r"/content/drive/MyDrive/BT4222/train/behaviors.parquet")
original_behaviors_valid = pd.read_parquet(r"/content/drive/MyDrive/BT4222/validation/behaviors.parquet")

# Preprocess behaviors data
preprocessed_behaviors_train = preprocess_behaviors(original_behaviors_train)
preprocessed_behaviors_valid = preprocess_behaviors(original_behaviors_valid)

# Define the embedding dimension
embedding_dim = 8

# Generate impression embeddings for the training and validation datasets
# NOTE(review): each call builds its own freshly initialised embedding table, so
# the train and validation vectors come from two unrelated random tables.
embeddings_df_train = create_impression_embeddings_df(preprocessed_behaviors_train, embedding_dim)
embeddings_df_valid = create_impression_embeddings_df(preprocessed_behaviors_valid, embedding_dim)

# Concatenate training and validation embeddings into one DataFrame
embeddings_user = pd.concat([embeddings_df_train, embeddings_df_valid], axis=0).reset_index(drop=True)

# Now `embeddings_user` contains both training and validation impression embeddings
print(embeddings_user.head())

# Both frames get a fresh 0..n-1 index (embeddings_user was already reset above).
data.reset_index(drop=True, inplace=True)
embeddings_user.reset_index(drop=True, inplace=True)

# Column sets later used to slice the merged frame into the two dataset inputs.
impression_user_data_cols = ['impression_id', 'user_embeddings', 'user_id']
candidate_news_data_cols = ['impression_id', 'article_id', 'clicked','user_id']
candidate_news_data = data[candidate_news_data_cols]
# pd.merge defaults to an inner join: candidate rows without a matching
# (impression_id, user_id) embedding are dropped here.
data = pd.merge(candidate_news_data, embeddings_user, on=['impression_id','user_id'])
data['clicked'] = data['clicked'].astype(int)

# NOTE(review): the ids are collected before the numeric coercion / dropna below,
# so they may include ids whose rows are removed afterwards.
impression_ids = data['impression_id'].unique()

# Convert columns to numeric, coercing errors to NaN
data['impression_id'] = pd.to_numeric(data['impression_id'], errors='coerce')
data['article_id'] = pd.to_numeric(data['article_id'], errors='coerce')
data['clicked'] = pd.to_numeric(data['clicked'], errors='coerce')
data['user_id'] = pd.to_numeric(data['user_id'], errors='coerce')

# Drop rows with NaN values (which were non-numeric initially)
data.dropna(inplace=True)

# Convert to integers if needed
data['impression_id'] = data['impression_id'].astype(int)
data['article_id'] =data['article_id'].astype(int)
data['clicked'] = data['clicked'].astype(int)
data['user_id'] = data['user_id'].astype(int)

# Verify data types and output a sample
print(data.dtypes)
print(data.head())

# Split impression IDs into training and temporary sets (temporary will be split into validation and test)
train_ids, temp_ids = train_test_split(impression_ids, test_size=0.4, random_state=42)  # 60% train, 40% temp
val_ids, test_ids = train_test_split(temp_ids, test_size=0.5, random_state=42)  # Split temp into 20% val, 20% test
# Split the actual data based on IDs
# reset_index without drop=True also stores the previous index in a new 'index' column.
data.reset_index(inplace=True)
train_data = data[data['impression_id'].isin(train_ids)]
val_data = data[data['impression_id'].isin(val_ids)]
test_data = data[data['impression_id'].isin(test_ids)]
# The dataset class reads the history lists from a column named 'clicked_article_ids'.
history_articles.rename(columns={'article_id' : 'clicked_article_ids'},inplace=True)
# NOTE(review): impression_user_data_* is a column slice of the candidate-level
# frame, so it presumably holds one row per (impression, candidate) pair rather
# than one row per impression -- confirm this is what the dataset class expects.
impression_user_data_train, candidate_news_data_train = train_data[impression_user_data_cols], train_data[candidate_news_data_cols]
impression_user_data_val, candidate_news_data_val = val_data[impression_user_data_cols], val_data[candidate_news_data_cols]
impression_user_data_test, candidate_news_data_test = test_data[impression_user_data_cols], test_data[candidate_news_data_cols]





DATASET

In [6]:
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

# Prefer the GPU when one is available, otherwise fall back to the CPU.
# NOTE: `device` is assigned again in the training cell further down, as a plain string.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class NPA_Dataset(Dataset):
    """One sample per impression: candidate articles, click history, user vector.

    Args:
        historical_clicks: DataFrame with 'user_id' and 'clicked_article_ids'
            (a sequence of article ids per row).  When a user occurs several
            times, the first row is used.
        impression_user_data: DataFrame with 'impression_id', 'user_id' and
            'user_embeddings'.  Repeated rows for the same impression_id are
            collapsed to the first one so that every impression is served once.
        candidate_news_data: DataFrame with 'impression_id', 'article_id' and
            'clicked', one row per candidate article of an impression.
        article_embedding_dict: Mapping article_id -> 2-D array of shape
            (num_words, embedding_dim).
        embedding_dim: Width of the word embeddings; used for the placeholder
            article returned for unknown ids.

    Each item is ``((candidate_embeddings, history_embeddings, user_features),
    labels)`` where the two embedding entries are lists of 2-D float tensors
    and ``labels`` holds the 'clicked' flag of every candidate.
    """

    def __init__(self, historical_clicks, impression_user_data, candidate_news_data, article_embedding_dict, embedding_dim=300):
        super().__init__()
        # One row per impression; otherwise __len__ counts candidate rows and
        # the same impression is emitted once per candidate.
        self.impression_user_data = (
            impression_user_data
            .drop_duplicates(subset='impression_id')
            .set_index('impression_id', drop=False)
        )
        self.candidate_news_data = candidate_news_data.set_index('impression_id', drop=False)
        self.historical_clicks = historical_clicks
        self.article_embedding_dict = article_embedding_dict
        # Placeholder "article" of a single zero word.  It is 2-D like the real
        # embedding sequences so it can be padded and stacked alongside them.
        self.default_embedding = np.zeros((1, embedding_dim))
        self.embedding_dim = embedding_dim

        # user_id -> history list, built once instead of scanning the whole
        # history frame on every __getitem__ call.
        first_history_rows = historical_clicks.drop_duplicates(subset='user_id', keep='first')
        self._history_by_user = dict(
            zip(first_history_rows['user_id'], first_history_rows['clicked_article_ids'])
        )

    def __len__(self):
        return len(self.impression_user_data)

    def _embed(self, article_id):
        """Return the article's embedding sequence (or the placeholder) as a float tensor."""
        embedding = self.article_embedding_dict.get(article_id, self.default_embedding)
        return torch.tensor(embedding, dtype=torch.float32)

    def __getitem__(self, idx):
        # Retrieve information for the current impression
        impression_info = self.impression_user_data.iloc[idx]
        impression_id = impression_info.name
        user_id = impression_info['user_id']

        # User features
        user_features = torch.tensor(impression_info['user_embeddings'], dtype=torch.float32)

        # Candidate news embeddings.  The list-style lookup always yields a
        # DataFrame, even when the impression has a single candidate row.
        candidate_articles = self.candidate_news_data.loc[[impression_id]]
        candidate_news_embeddings = [
            self._embed(article_id) for article_id in candidate_articles['article_id'].tolist()
        ]

        # Historical news embeddings.  Users without any recorded history get
        # one placeholder article so the sample can still be batched.
        historical_article_ids = self._history_by_user.get(user_id)
        if historical_article_ids is None or len(historical_article_ids) == 0:
            historical_news_embeddings = [torch.tensor(self.default_embedding, dtype=torch.float32)]
        else:
            historical_news_embeddings = [self._embed(article_id) for article_id in historical_article_ids]

        # Labels
        labels = torch.tensor(candidate_articles['clicked'].values, dtype=torch.float32)

        return (candidate_news_embeddings, historical_news_embeddings, user_features), labels


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class UserPreferenceQuery(nn.Module):
    """Derive a word-level and a document-level query vector from a user embedding.

    Args:
        num_filters: Size of the word-level query produced by ``forward``.
        query_dim_d: Size of the document-level query produced by ``forward``.
        embedding_dim: Size of the incoming user embedding.
    """

    def __init__(self, num_filters, query_dim_d, embedding_dim):
        super().__init__()
        # Word-level query: bias-free projection plus a separate learnable
        # offset that starts at zero.
        self.Vw = nn.Linear(embedding_dim, num_filters, bias=False)
        self.vw = nn.Parameter(torch.zeros(num_filters))

        # Document-level query: same construction with its own weights.
        self.Vd = nn.Linear(embedding_dim, query_dim_d, bias=False)
        self.vd = nn.Parameter(torch.zeros(query_dim_d))

    def forward(self, user_embedding):
        """Return ``(word_query, document_query)``, each ReLU(projection + offset)."""
        word_query = F.relu(self.Vw(user_embedding) + self.vw)
        document_query = F.relu(self.Vd(user_embedding) + self.vd)
        return word_query, document_query



class NewsEncoder(nn.Module):
    """Encode each article (a sequence of word embeddings) into one vector.

    A 1-D convolution over the word axis is followed by attention pooling whose
    scores depend on a user-specific query vector.

    Args:
        num_filters: Number of convolution filters, i.e. size of the output vector.
        filter_size: Convolution kernel width in words.
        embedding_dim: Size of each word embedding.
        query_dim: Size of the incoming word-level query vector.
    """

    def __init__(self, num_filters, filter_size, embedding_dim, query_dim):
        super().__init__()
        self.conv = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=filter_size, padding='same')
        self.query_projection = nn.Linear(query_dim, num_filters)  # Project the query to `num_filters`
        self.score_projection = nn.Linear(num_filters, 1, bias=False)  # Projection layer for attention scoring
        self.num_filters = num_filters

    def forward(self, inputs, query_vector, mask=None):
        """Return article vectors of shape [batch_size, num_articles, num_filters].

        Args:
            inputs: Tensor [batch_size, num_articles, seq_len, embedding_dim].
            query_vector: Tensor [batch_size, query_dim].
            mask: Optional tensor with batch_size * num_articles * seq_len
                elements; zero/False entries are excluded from the attention.

        Raises:
            RuntimeError: If ``mask`` does not have the expected number of elements.
        """
        batch_size, num_articles, seq_len, embedding_dim = inputs.shape

        # Conv1d expects [N, channels, length].  Flatten the article axis, then
        # *transpose* words and embedding channels: a bare view/reshape to
        # (.., embedding_dim, seq_len) would only reinterpret memory and mix
        # values of different words into the same channel.
        words = inputs.reshape(batch_size * num_articles, seq_len, embedding_dim).transpose(1, 2)

        # Apply Conv1d and activation
        x = F.relu(self.conv(words))  # Shape: [batch_size * num_articles, num_filters, seq_len]
        x = x.transpose(1, 2)  # Shape: [batch_size * num_articles, seq_len, num_filters]

        # Repeat each user's projected query for every one of its articles
        query_vector_projected = self.query_projection(query_vector)
        query_vector_projected = query_vector_projected.reshape(batch_size, 1, self.num_filters).expand(batch_size, num_articles, self.num_filters)
        query_vector_projected = query_vector_projected.reshape(batch_size * num_articles, 1, self.num_filters)

        # Calculate attention scores
        attention_input = torch.tanh(x + query_vector_projected)  # Shape: [batch_size * num_articles, seq_len, num_filters]
        attention_scores = torch.softmax(self.score_projection(attention_input), dim=1)

        # Apply mask if provided
        if mask is not None:
            if mask.numel() != batch_size * num_articles * seq_len:
                raise RuntimeError(
                    f"Mask shape mismatch: {mask.shape}. Expected [{batch_size * num_articles}, {seq_len}, 1]."
                )
            mask = mask.reshape(batch_size * num_articles, seq_len, 1).to(attention_scores.dtype)
            attention_scores = attention_scores * mask
            # Re-normalise over the remaining words; the epsilon keeps fully
            # masked (padded) articles at zero instead of dividing by zero.
            attention_scores = attention_scores / (attention_scores.sum(dim=1, keepdim=True) + 1e-9)

        # Compute context vector as a weighted sum
        context_vector = torch.sum(x * attention_scores, dim=1)  # Shape: [batch_size * num_articles, num_filters]
        context_vector = context_vector.reshape(batch_size, num_articles, -1)

        return context_vector


class UserEncoder(nn.Module):
    """Pool the vectors of a user's clicked articles into one profile vector.

    Args:
        query_dim_d: Size of the incoming document-level query vector.
        num_filters: Size of each article vector (and of the resulting profile).
    """

    def __init__(self, query_dim_d, num_filters):
        super().__init__()
        # Bias-free projection that brings the query to the article-vector size.
        self.query_projection = nn.Linear(query_dim_d, num_filters, bias=False)

    def forward(self, news_encoder_outputs, query_vector, mask=None):
        """Return the attention-weighted sum of the clicked-article vectors.

        Args:
            news_encoder_outputs: Tensor [batch_size, num_clicked_news, num_filters].
            query_vector: Tensor [batch_size, query_dim_d].
            mask: Optional tensor whose last axis is reduced with ``any`` to
                decide, per article, whether it takes part in the attention.

        Returns:
            Tensor [batch_size, num_filters].
        """
        batch_size, num_clicked_news, num_filters = news_encoder_outputs.shape

        # One projected query per user, repeated for each clicked article.
        per_article_query = (
            self.query_projection(query_vector)
            .unsqueeze(1)
            .expand(batch_size, num_clicked_news, num_filters)
        )

        # Article score = sum over features of tanh(article + query);
        # softmax over the article axis turns scores into weights.
        article_scores = torch.tanh(news_encoder_outputs + per_article_query).sum(dim=-1)
        article_weights = torch.softmax(article_scores, dim=1)

        if mask is not None:
            # Collapse the last mask axis to one flag per article, zero out
            # the flagged-off articles and re-normalise the remaining weights.
            article_mask = mask.any(dim=-1)
            article_weights = article_weights * article_mask
            article_weights = article_weights / (article_weights.sum(dim=1, keepdim=True) + 1e-9)

        return (article_weights.unsqueeze(-1) * news_encoder_outputs).sum(dim=1)



class ClickPredictor(nn.Module):
    """Score every candidate article against the user profile vector."""

    def forward(self, candidate_news_vectors, user_vector, mask=None):
        """Return click probabilities of shape [batch_size, num_candidates].

        Args:
            candidate_news_vectors: Tensor [batch_size, num_candidates, num_filters].
            user_vector: Tensor [batch_size, num_filters] or
                [batch_size, 1, num_filters].
            mask: Optional boolean tensor [batch_size, num_candidates, seq_len];
                candidates without any True entry get probability 0.

        Raises:
            RuntimeError: If the reduced mask does not match the logits' shape.
        """
        # Only drop the singleton axis of a 3-D user vector.  An unconditional
        # squeeze(1) would also collapse a 2-D [batch_size, 1] vector
        # (num_filters == 1) and break the expansion below.
        if user_vector.dim() == 3:
            user_vector = user_vector.squeeze(1)  # Shape: [batch_size, num_filters]

        # Expand `user_vector` to match `candidate_news_vectors` dimensions
        user_vector_expanded = user_vector.unsqueeze(1).expand_as(candidate_news_vectors)  # Shape: [batch_size, num_candidates, num_filters]

        # Compute logits (inner product across the last dimension)
        logits = torch.sum(candidate_news_vectors * user_vector_expanded, dim=-1)  # Shape: [batch_size, num_candidates]

        # Apply mask if provided and in correct shape
        if mask is not None:
            mask = mask.any(dim=-1)  # Reduce `mask` to [batch_size, num_candidates]
            if mask.shape != logits.shape:
                raise RuntimeError(f"Mask shape {mask.shape} does not match logits shape {logits.shape}")
            # -inf becomes probability 0 after the sigmoid
            logits = logits.masked_fill(~mask, float('-inf'))

        # Apply sigmoid to get probabilities
        probabilities = torch.sigmoid(logits)  # Shape: [batch_size, num_candidates]

        return probabilities







class NPA_Model(nn.Module):
    """Click-prediction model combining the query, news, user and scoring modules.

    Args:
        num_filters: Number of convolution filters in the news encoder, i.e.
            the size of every article vector and of the user profile vector.
        filter_size: Convolution kernel width of the news encoder.
        query_dim_w: Size of the word-level query vector.
        query_dim_d: Size of the document-level query vector.
        embeddings_user_dim: Size of the incoming user embedding.
        embeddings_news_dim: Size of each word embedding of an article.
    """

    def __init__(self, num_filters, filter_size, query_dim_w=8, query_dim_d=8, embeddings_user_dim=8, embeddings_news_dim=300):
        super().__init__()
        self.user_preference_query = UserPreferenceQuery(query_dim_w, query_dim_d, embeddings_user_dim)
        # The news encoder consumes the word-level query, so its query size must
        # follow `query_dim_w` rather than a hard-coded 8.
        self.news_encoder = NewsEncoder(num_filters=num_filters, filter_size=filter_size, embedding_dim=embeddings_news_dim, query_dim=query_dim_w)
        self.user_encoder = UserEncoder(query_dim_d, num_filters)
        self.click_predictor = ClickPredictor()

    def forward(self, candidate_news, clicked_news, user_embeddings, candidate_mask=None, clicked_mask=None):
        """Return the click probability of every candidate article.

        The same news encoder (shared weights) encodes candidate and clicked
        articles; the clicked-article vectors are pooled into a user profile
        that is then scored against each candidate.
        """
        qw, qd = self.user_preference_query(user_embeddings)

        candidate_news_vectors = self.news_encoder(candidate_news, qw, mask=candidate_mask)
        clicked_news_vectors = self.news_encoder(clicked_news, qw, mask=clicked_mask)

        user_profile_vector = self.user_encoder(clicked_news_vectors, qd, mask=clicked_mask)
        probabilities = self.click_predictor(candidate_news_vectors, user_profile_vector, mask=candidate_mask)
        return probabilities


In [8]:
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

def _as_word_matrix(news):
    """Return ``news`` as a 2-D [num_words, embedding_dim] tensor (1-D input = one word)."""
    return news if news.dim() == 2 else news.reshape(1, -1)


def _pad_articles(articles, max_seq_len):
    """Pad 2-D article tensors to ``max_seq_len`` words and build their token mask.

    Returns ``(padded, token_mask)`` with shapes
    [num_articles, max_seq_len, embedding_dim] and [num_articles, max_seq_len];
    the mask is True exactly on the real (non-padded) word positions.
    """
    padded = torch.stack([F.pad(news, (0, 0, 0, max_seq_len - news.size(0))) for news in articles])
    lengths = torch.tensor([news.size(0) for news in articles])
    token_mask = torch.arange(max_seq_len).unsqueeze(0) < lengths.unsqueeze(1)
    return padded, token_mask


def collate_fn(batch):
    """Batch samples of ``((candidates, clicked, user), labels)``.

    ``candidates`` and ``clicked`` are lists of per-article word-embedding
    tensors; every sample must contain at least one of each.  Articles are
    padded to the longest article in the batch (separately for candidates and
    clicked news) and samples are then padded to the largest article count.

    Returns:
        ``((candidate_news, clicked_news, user_features, candidate_mask,
        clicked_mask), labels)`` where the masks are boolean, True on real
        words and False on padded words and padded articles.
    """
    # Normalise every article to 2-D so that lengths and padding are well defined.
    samples = [
        ([_as_word_matrix(news) for news in candidates],
         [_as_word_matrix(news) for news in clicked],
         user,
         label)
        for (candidates, clicked, user), label in batch
    ]

    # Determine maximum sequence length for candidates and clicked news in the batch
    max_seq_len_candidates = max(news.size(0) for candidates, _, _, _ in samples for news in candidates)
    max_seq_len_clicked = max(news.size(0) for _, clicked, _, _ in samples for news in clicked)

    candidate_news_embeddings, historical_news_embeddings, user_features, candidate_mask, clicked_mask, labels = [], [], [], [], [], []
    for candidates, clicked, user, label in samples:
        candidates_padded, candidates_token_mask = _pad_articles(candidates, max_seq_len_candidates)
        clicked_padded, clicked_token_mask = _pad_articles(clicked, max_seq_len_clicked)

        candidate_news_embeddings.append(candidates_padded)  # Shape: [num_candidates, max_seq_len, embedding_dim]
        historical_news_embeddings.append(clicked_padded)    # Shape: [num_clicked, max_seq_len, embedding_dim]
        user_features.append(user)

        # Token-level masks: padded word positions must not receive attention.
        candidate_mask.append(candidates_token_mask)  # Shape: [num_candidates, max_seq_len]
        clicked_mask.append(clicked_token_mask)       # Shape: [num_clicked, max_seq_len]

        labels.append(label)

    # Pad embeddings along the article dimension
    candidate_news_embeddings = pad_sequence(candidate_news_embeddings, batch_first=True)  # Shape: [batch_size, max_num_candidates, max_seq_len, embedding_dim]
    historical_news_embeddings = pad_sequence(historical_news_embeddings, batch_first=True)  # Shape: [batch_size, max_num_clicked, max_seq_len, embedding_dim]

    # Padded articles are entirely False in the masks
    candidate_mask = pad_sequence(candidate_mask, batch_first=True, padding_value=0)  # Shape: [batch_size, max_num_candidates, max_seq_len]
    clicked_mask = pad_sequence(clicked_mask, batch_first=True, padding_value=0)      # Shape: [batch_size, max_num_clicked, max_seq_len]

    user_features = torch.stack(user_features)
    labels = pad_sequence(labels, batch_first=True)

    return (candidate_news_embeddings, historical_news_embeddings, user_features, candidate_mask, clicked_mask), labels


In [9]:
import tensorflow as tf
import os

from torch.utils.data import DataLoader

# Wrap each split in an NPA_Dataset; the click history and the article
# embeddings are shared by all three splits.
train_dataset, val_dataset, test_dataset = (
    NPA_Dataset(history_articles, impression_rows, candidate_rows, articles_embeddings_dict)
    for impression_rows, candidate_rows in (
        (impression_user_data_train, candidate_news_data_train),
        (impression_user_data_val, candidate_news_data_val),
        (impression_user_data_test, candidate_news_data_test),
    )
)

# Data loaders: only the training loader shuffles and uses worker processes
# with pinned memory; the evaluation loaders share their options.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn, num_workers=8, pin_memory=True)
eval_loader_options = dict(batch_size=64, shuffle=False, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, **eval_loader_options)
test_loader = DataLoader(test_dataset, **eval_loader_options)

# Model, placed on the GPU when one is available.
# NOTE: this re-binds the name `model`, which held the fastText vectors before.
model = NPA_Model(num_filters=32, filter_size=16).to('cuda' if torch.cuda.is_available() else 'cpu')

# Binary cross-entropy on the predicted click probabilities, optimised with Adam.
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Let cuDNN benchmark and pick its convolution algorithms.
torch.backends.cudnn.benchmark = True



def ndcg_at_k(y_true, y_pred, k=3):
    """Mean nDCG@k over a batch of rankings.

    Args:
        y_true: Relevance labels, tensor [batch_size, num_candidates].
        y_pred: Predicted scores, tensor [batch_size, num_candidates].
        k: Cut-off rank.  Clamped to ``num_candidates`` so that short candidate
            lists do not raise.

    Returns:
        Scalar tensor with the batch mean.  Rows without any relevant item
        (ideal DCG of zero) contribute an nDCG of 0.
    """
    k = min(k, y_pred.size(-1))

    # Labels of the k highest-scored candidates, kept as [batch_size, k].
    # (Squeezing here would collapse the batch axis whenever k == 1.)
    _, top_k_indices = torch.topk(y_pred, k, dim=-1)
    top_k_labels = torch.gather(y_true, 1, top_k_indices)

    # Calculate DCG
    gain = (2 ** top_k_labels - 1).float()
    discounts = torch.log2(torch.arange(k, dtype=torch.float32, device=y_pred.device) + 2)
    dcg = torch.sum(gain / discounts, dim=-1)

    # Calculate ideal DCG (labels sorted by true relevance)
    sorted_labels, _ = torch.sort(y_true, dim=-1, descending=True)
    ideal_gain = (2 ** sorted_labels[:, :k] - 1).float()
    idcg = torch.sum(ideal_gain / discounts, dim=-1)

    # Calculate nDCG, guarding against division by zero
    ndcg = torch.where(idcg == 0, torch.zeros_like(dcg), dcg / idcg)
    return ndcg.mean()

#########################################

# Gradient scaler for mixed precision; only active when CUDA is available.
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

def train_mixed_precision(model, dataloader, criterion, optimizer, device):
    """Train ``model`` for one epoch using automatic mixed precision.

    Args:
        model: Network called as ``model(candidate_news, clicked_news,
            user_features, candidate_mask, clicked_mask)``.
        dataloader: Yields ``((candidate_news, clicked_news, user_features,
            candidate_mask, clicked_mask), labels)`` batches.
        criterion: Loss applied to ``(outputs, labels)``.
        optimizer: Optimizer stepped once per batch through the module-level
            ``scaler``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean loss per batch, mean nDCG@3 per batch)``.
    """
    model.train()
    amp_enabled = torch.cuda.is_available()
    total_loss = 0
    ndcg_score = 0
    counter_batch = 1
    for batch in dataloader:
        print(f'Starting batch number {counter_batch}')
        counter_batch += 1
        (candidate_news, clicked_news, user_features, candidate_mask, clicked_mask), labels = batch
        candidate_news = candidate_news.to(device)
        clicked_news = clicked_news.to(device)
        user_features = user_features.to(device)
        candidate_mask = candidate_mask.to(device)
        clicked_mask = clicked_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        # Mixed precision forward pass
        with torch.cuda.amp.autocast(enabled=amp_enabled):
            outputs = model(candidate_news, clicked_news, user_features, candidate_mask, clicked_mask)

        # The loss is evaluated outside the autocast region and in float32:
        # PyTorch rejects BCELoss / binary_cross_entropy inside autocast.
        loss = criterion(outputs.float(), labels)

        total_loss += loss.item()
        print(f"Loss: {loss} total loss: {total_loss}")

        # The metric needs no gradients, so it runs on detached outputs.
        ndcg_score += ndcg_at_k(labels, outputs.detach().float(), k=3).item()
        print(f"Ncdg score: {ndcg_score}")

        # Mixed precision backward pass
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    return total_loss / len(dataloader), ndcg_score / len(dataloader)



#########################################


# Training loop remains the same
def train(model, dataloader, criterion, optimizer, device):
    """Run one full-precision training epoch.

    Each batch is moved to ``device``, passed through ``model``, scored with
    ``criterion`` and used for one optimizer step.  Progress, the batch loss
    and the running nDCG@3 sum are printed for every batch.

    Returns:
        Tuple ``(mean loss per batch, mean nDCG@3 per batch)``.
    """
    model.train()
    total_loss = 0
    ndcg_score = 0

    for batch_number, (batch_inputs, labels) in enumerate(dataloader, start=1):
        print(f'Starting batch number {batch_number}')

        # Move the five input tensors and the labels to the target device.
        candidate_news, clicked_news, user_features, candidate_mask, clicked_mask = (
            tensor.to(device) for tensor in batch_inputs
        )
        labels = labels.to(device)

        # Forward pass on fresh gradients.
        optimizer.zero_grad()
        outputs = model(candidate_news, clicked_news, user_features, candidate_mask, clicked_mask)

        # Loss bookkeeping.
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        print(f"Loss: {loss} total loss: {total_loss}")

        # Ranking metric, accumulated over the epoch.
        ndcg_score += ndcg_at_k(labels, outputs, k=3).item()
        print(f"Ncdg score: {ndcg_score}")

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader), ndcg_score / len(dataloader)



def test(model, dataloader, criterion, device, k=3):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    Args:
        model: Network called as ``model(candidate_news, clicked_news,
            user_features, candidate_mask, clicked_mask)``.
        dataloader: Yields ``((candidate_news, clicked_news, user_features,
            candidate_mask, clicked_mask), labels)`` batches.
        criterion: Loss applied to ``(outputs, labels)``.
        device: Device the batch tensors are moved to.
        k: Cut-off rank of the nDCG metric.

    Returns:
        Tuple ``(mean loss per batch, mean nDCG@k per batch)``.
    """
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    ndcg_score = 0

    with torch.no_grad():  # Disable gradient computation
        for batch in dataloader:
            (candidate_news, clicked_news, user_features, candidate_mask, clicked_mask), labels = batch
            candidate_news = candidate_news.to(device)
            clicked_news = clicked_news.to(device)
            user_features = user_features.to(device)
            candidate_mask = candidate_mask.to(device)
            clicked_mask = clicked_mask.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(candidate_news, clicked_news, user_features, candidate_mask, clicked_mask)

            # Compute loss
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            # Compute nDCG@k with the requested cut-off (previously fixed at 3
            # regardless of `k`).  The cut-off is limited to the number of
            # candidates in the batch so that small batches cannot raise.
            effective_k = min(k, outputs.size(-1))
            ndcg_score += ndcg_at_k(labels, outputs, k=effective_k).item()

    avg_loss = total_loss / len(dataloader)
    avg_ndcg = ndcg_score / len(dataloader)

    return avg_loss, avg_ndcg




In [None]:
import torch
import os
import time
# Set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
torch.backends.cudnn.benchmark = True


# Number of epochs and early stopping patience
epochs = 10
patience = 3  # Number of epochs to wait for improvement before stopping
best_val_loss = float('inf')  # Initialize to a very high value
patience_counter = 0  # Counts epochs without improvement
# Timestamp (year-month-day_hour-minute) used to keep checkpoint names unique per run
current_time = time.strftime("%Y%m%d_%H%M", time.localtime())
checkpoint_path = f"model_checkpoint_64batch_{current_time}.pth"  # Checkpoint file path

# Full-precision training with early stopping on the validation loss.
# (The mixed-precision variant `train_mixed_precision` is defined above but not used here.)
try:
    for epoch in range(epochs):
        # Training phase
        train_loss, train_ndcg = train(model, train_loader, criterion, optimizer, device)

        # Validation phase
        val_loss, val_ndcg = test(model, val_loader, criterion, device, k=5)

        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val nDCG@5: {val_ndcg:.4f}")

        # Check for improvement
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0  # Reset patience counter
            # Save the best model as a checkpoint
            torch.save(model.state_dict(), checkpoint_path)
            print(f"Model improved, saving new checkpoint to {checkpoint_path}")
        else:
            patience_counter += 1  # Increment patience counter if no improvement

        # Early stopping if validation loss hasn't improved for 'patience' epochs
        if patience_counter >= patience:
            print("Early stopping triggered. No improvement in validation loss.")
            break

except KeyboardInterrupt:
    print("Training interrupted by user. Saving model checkpoint...")
    torch.save(model.state_dict(), "interrupted_checkpoint.pth")
    print("Checkpoint saved at interrupted_checkpoint.pth")

# Ensure model is saved at the end
final_path = f"final_model_64batch_{current_time}.pth"
torch.save(model.state_dict(), final_path)
print(f"Training complete. Final model saved at {final_path}")

# Load the best model and evaluate on the test set.  A checkpoint only exists
# once at least one epoch has finished with an improved validation loss (for
# example, not after an interrupt during the first epoch), so guard the load.
if os.path.exists(checkpoint_path):
    # map_location keeps the load working when the checkpoint was written on a different device
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
else:
    print(f"No checkpoint found at {checkpoint_path}; evaluating the current model weights instead.")
test_loss, test_ndcg = test(model, test_loader, criterion, device, k=5)
print(f"Test Loss: {test_loss:.4f}, Test nDCG@5: {test_ndcg:.4f}")
