# CICERO -- SUBER with NRMS as the model to be trained

How can we take the NRMS model and use it as input for SUBER? By that I mean: how do we provide the data in the form that the original NRMS model uses?

The NRMS data comes from the `recommenders` package itself, which is available after running `pip install recommenders`.

## Add the data from NRMS first using the recommenders module




In [1]:
import os
import time

# Wall-clock reference point for the notebook run.
start_time = time.time()

# Environment switches the author uses to suppress TensorFlow warnings.
# They are set here, ahead of the tensorflow import further down in this cell.
os.environ.update({
    'TF_TRT_ALLOW_ENGINE_NATIVE_SEGMENT_EXECUTION': '0',
    'TF_CPP_MIN_LOG_LEVEL': '3',
})

import csv
import os
import pickle
import sys
import zipfile
from collections import Counter
from tempfile import TemporaryDirectory

import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.nrms import NRMSModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set
from recommenders.models.newsrec.newsrec_utils import word_tokenize
from recommenders.utils.notebook_utils import store_metadata

# Report the interpreter and TensorFlow versions this run is using.
print(f"System version: {sys.version}")
print(f"Tensorflow version: {tf.__version__}")

System version: 3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0]
Tensorflow version: 2.15.1


In [2]:
import tensorflow as tf

# Report the TensorFlow build in use.
print("TensorFlow version:", tf.__version__)

# Enumerate every physical device TensorFlow can see, one per line.
print("Available devices:")
all_devices = tf.config.list_physical_devices()
for device in all_devices:
    print(device)

# A non-empty GPU list means at least one GPU is visible to TensorFlow.
gpu_devices = tf.config.list_physical_devices('GPU')
gpu_message = (
    "GPU is available and TensorFlow is using it."
    if gpu_devices
    else "GPU is NOT available. TensorFlow is using the CPU."
)
print(gpu_message)


TensorFlow version: 2.15.1
Available devices:
PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')
PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
GPU is available and TensorFlow is using it.


## Prepare Parameters (will need to do something else for SUBER)

In [None]:
# Training hyper-parameters.
epochs, seed, batch_size = 5, 42, 64

# MIND variant to work with. The upstream options are demo, small and large;
# this notebook uses 'MINDsmall' because the data is mounted into the container,
# and the small dataset is no longer accessible for download.
MIND_type = 'MINDsmall'

## Download and load the data


In [None]:
# `tmpdir` is kept so that any later cell referring to it still works, but the
# dataset itself lives in the mounted directory below (the former
# `data_path = tmpdir.name` assignment was immediately overwritten, so it is dropped).
tmpdir = TemporaryDirectory()
data_path = os.path.join("/app/SUBERX/datasets", MIND_type)

# Create the directory if it doesn't exist
os.makedirs(data_path, exist_ok=True)

print(f"Data Path is {data_path}")

## Download Glove embeddings

The original NRMS used GloVe embeddings, so we will too.

In [None]:
from recommenders.datasets.mind import download_and_extract_glove

# Directory that holds the GloVe archive and its extracted contents.
glove_dir = '/app/SUBERX/glove_embeddings'

# The notebook later reads <glove_dir>/glove/glove.6B.300d.txt; when that file is
# already present, skip the download/extract step so re-running the notebook is cheap.
glove_300d_file = os.path.join(glove_dir, "glove", "glove.6B.300d.txt")
if os.path.exists(glove_300d_file):
    print(f"GloVe embeddings already present in: {glove_dir}")
else:
    download_and_extract_glove(glove_dir)
    print(f"GloVe embeddings extracted to: {glove_dir}")


### Function to generate the UID-to-index mapping

In [None]:
def generate_uid2index(behaviors_file, output_file="uid2index.pkl", start_index=0):
    """
    Generate uid2index.pkl mapping user_id to integer indices from behaviors.tsv.

    Users are numbered in order of first appearance in the file; rows with a
    missing user_id are ignored.

    Args:
        behaviors_file (str): Path to the behaviors.tsv file (tab-separated, no header).
        output_file (str): Path to save the uid2index.pkl file. Missing parent
            directories are created.
        start_index (int): Index given to the first user. Defaults to 0, the
            original behaviour.
            NOTE(review): recommenders' MINDIterator appears to use index 0 for
            unknown users; pass start_index=1 to keep 0 free. Confirm before relying on it.

    Returns:
        dict: Mapping of user_id to index.
    """
    # Only the user_id column is needed, so avoid parsing the history and
    # impressions columns, which are by far the largest.
    columns = ["impression_id", "user_id", "time", "history", "impressions"]
    behaviors_df = pd.read_csv(
        behaviors_file, sep="\t", names=columns, usecols=["user_id"]
    )

    # unique() preserves first-appearance order, which keeps the numbering stable.
    user_ids = behaviors_df["user_id"].dropna().unique()
    uid2index = {
        user_id: idx for idx, user_id in enumerate(user_ids, start=start_index)
    }

    # Make sure the destination directory exists before writing the pickle.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_file, "wb") as f:
        pickle.dump(uid2index, f)

    print(f"Created {output_file} with {len(uid2index)} users.")
    return uid2index

In [None]:
### Function to load GloVe embeddings

In [None]:
def load_glove_embeddings(glove_file, embedding_dim=300):
    """
    Load GloVe embeddings into a dictionary.

    Each line is expected to hold a word followed by `embedding_dim`
    whitespace-separated numbers. Lines that do not match (blank lines, lines
    with a different number of fields, non-numeric values) are skipped and counted.

    Args:
        glove_file (str): Path to the GloVe file.
        embedding_dim (int): Dimension of GloVe vectors.

    Returns:
        dict: Mapping of words to embedding vectors (float32 arrays of
        length `embedding_dim`).

    Raises:
        ValueError: If the file contains lines but none of them holds a
            vector of length `embedding_dim` (most likely a wrong dimension).
    """
    embeddings_index = {}
    skipped = 0
    with open(glove_file, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Loading GloVe embeddings"):
            values = line.split()
            # One word plus embedding_dim coefficients; anything else is malformed.
            if len(values) != embedding_dim + 1:
                skipped += 1
                continue
            try:
                coefs = np.asarray(values[1:], dtype="float32")
            except ValueError:
                # A non-numeric coefficient: treat the line as malformed.
                skipped += 1
                continue
            embeddings_index[values[0]] = coefs

    if skipped and not embeddings_index:
        # Fail here with a clear message rather than later with a shape error.
        raise ValueError(
            f"No line in {glove_file} contains a {embedding_dim}-dimensional vector; "
            "check that embedding_dim matches the file."
        )
    if skipped:
        print(f"Skipped {skipped} malformed lines.")
    print(f"Loaded {len(embeddings_index)} word vectors.")
    return embeddings_index



### Function to create word_dict and embeddings



In [None]:
def create_word_dict_and_embeddings(news_file, glove_embeddings, embedding_dim=300, output_dir=".", seed=None):
    """
    Create word_dict.pkl and embedding.npy using GloVe embeddings.

    The vocabulary is every token found in the lower-cased title and abstract
    of each news row, numbered from 1 in order of first appearance. Row 0 of
    the embedding matrix is left as zeros for padding.

    Args:
        news_file (str): Path to news.tsv (tab-separated, no header).
        glove_embeddings (dict): Loaded GloVe embeddings (word -> vector).
        embedding_dim (int): Dimension of GloVe vectors.
        output_dir (str): Directory to save outputs; created if missing.
        seed (int | None): Seed for the random vectors given to words missing
            from `glove_embeddings`. None (the default) draws from NumPy's
            global random state, as before.

    Returns:
        dict, np.ndarray: word_dict and embedding_matrix.
    """
    news_columns = [
        "news_id", "category", "subcategory", "title",
        "abstract", "url", "title_entities", "abstract_entities",
    ]
    # Titles and abstracts are free text and may contain literal double quotes.
    # With the default quoting, a field that starts with '"' swallows the
    # following tabs and newlines and merges rows, so quoting is disabled.
    # dtype=str keeps purely numeric titles as text.
    news_df = pd.read_csv(
        news_file, sep="\t", names=news_columns, quoting=csv.QUOTE_NONE, dtype=str
    )

    all_text = news_df["title"].fillna("") + " " + news_df["abstract"].fillna("")

    # Number tokens as they are first seen (index 0 is reserved for padding).
    word_dict = {}
    for text in all_text:
        for token in word_tokenize(text.lower()):  # recommenders utility function
            if token not in word_dict:
                word_dict[token] = len(word_dict) + 1

    # np.random (global state) and RandomState both expose normal(size=...).
    rng = np.random if seed is None else np.random.RandomState(seed)

    embedding_matrix = np.zeros((len(word_dict) + 1, embedding_dim))  # Extra row for padding (index 0)
    for word, idx in word_dict.items():
        if word in glove_embeddings:
            embedding_matrix[idx] = glove_embeddings[word]
        else:
            embedding_matrix[idx] = rng.normal(size=(embedding_dim,))  # Random vector for unknown words

    # Save word_dict and embedding matrix
    os.makedirs(output_dir, exist_ok=True)
    word_dict_file = os.path.join(output_dir, "word_dict.pkl")
    embedding_file = os.path.join(output_dir, "embedding.npy")
    with open(word_dict_file, "wb") as f:
        pickle.dump(word_dict, f)
    np.save(embedding_file, embedding_matrix)

    print(f"Saved word_dict.pkl and embedding.npy to {output_dir}.")
    return word_dict, embedding_matrix



## Identify the news and behavior files

In [None]:
# Generate uid2index.pkl
# `behaviors_file` is not defined by any earlier cell, so build it here from
# `data_path` (the training split's behaviors.tsv).
behaviors_file = os.path.join(data_path, "train", "behaviors.tsv")
uid2index = generate_uid2index(behaviors_file, output_file="uid2index.pkl")

In [None]:
# Path to GloVe file (e.g., glove.6B.300d.txt)
# The dimension appears in the file name, so derive the name from
# `embedding_dim` to keep the two from drifting apart.
embedding_dim = 300
glove_file = os.path.join(glove_dir, f"glove/glove.6B.{embedding_dim}d.txt")
glove_embeddings = load_glove_embeddings(glove_file, embedding_dim=embedding_dim)


In [None]:
# Generate word_dict.pkl and embedding.npy
# Build the vocabulary from the training split's news.tsv under `data_path`
# (the previous "path/to/news.tsv" value was a placeholder that does not exist).
word_dict, embedding_matrix = create_word_dict_and_embeddings(
    news_file=os.path.join(data_path, "train", "news.tsv"),
    glove_embeddings=glove_embeddings,
    embedding_dim=embedding_dim,
    output_dir="."
)


In [None]:

# Locations of the MIND train/validation splits and the NRMS utility files,
# all under `data_path`.
train_news_file = os.path.join(data_path, 'train', r'news.tsv')
train_behaviors_file = os.path.join(data_path, 'train', r'behaviors.tsv')
valid_news_file = os.path.join(data_path, 'valid', r'news.tsv')
valid_behaviors_file = os.path.join(data_path, 'valid', r'behaviors.tsv')
wordEmb_file = os.path.join(data_path, "utils", "embedding.npy")
userDict_file = os.path.join(data_path, "utils", "uid2index.pkl")
wordDict_file = os.path.join(data_path, "utils", "word_dict.pkl")
yaml_file = os.path.join(data_path, "utils", r'nrms.yaml')

# get_mind_data_set() accepts only 'demo', 'small' or 'large', whereas MIND_type
# also serves as the directory name (e.g. 'MINDsmall'). Strip the 'MIND' prefix
# before the lookup; a bare size name passes through unchanged.
mind_size = MIND_type[len("MIND"):] if MIND_type.startswith("MIND") else MIND_type
mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(mind_size)

# Download only the pieces that are not already on disk (e.g. mounted in).
if not os.path.exists(train_news_file):
    download_deeprec_resources(mind_url, os.path.join(data_path, 'train'), mind_train_dataset)

if not os.path.exists(valid_news_file):
    download_deeprec_resources(mind_url, os.path.join(data_path, 'valid'), mind_dev_dataset)

if not os.path.exists(yaml_file):
    download_deeprec_resources(
        r'https://recodatasets.z20.web.core.windows.net/newsrec/',
        os.path.join(data_path, 'utils'),
        mind_utils,
    )

## Train the NRMS Model


In [None]:
# Bind the MINDIterator class itself (not an instance) to `iterator`.
# presumably passed to the NRMS model constructor in a later cell — TODO confirm
iterator = MINDIterator

In [None]:
# Display the object currently bound to `iterator` as the cell output.
iterator

In [None]:
from tensorflow.python.client import device_lib

# Print the list of local devices reported by TensorFlow's device_lib helper.
local_devices = device_lib.list_local_devices()
print(local_devices)


In [None]:
# Dump every environment variable visible to the kernel's shell.
# NOTE(review): the saved output may expose secrets (tokens, keys) — clear it before sharing.
!env

In [None]:
# Show only the environment entries whose line contains "PATH".
!env |grep PATH

In [None]:
# Show only the environment entries whose line contains "LD_L" (e.g. LD_LIBRARY_PATH).
!env |grep LD_L