# LSTUR: Neural News Recommendation with Long- and Short-term User Representations
LSTUR \[1\] is a news recommendation approach that captures both users' long-term preferences and their short-term interests. The core of LSTUR is a news encoder and a user encoder. In the news encoder, we learn representations of news from their titles. In the user encoder, we propose to learn long-term
user representations from the embeddings of user IDs. In addition, we propose to learn short-term user representations from users' recently browsed news via a GRU network. Finally, we propose two methods to combine
long-term and short-term user representations. The first uses the long-term user representation to initialize the hidden state of the GRU network that produces the short-term user representation. The second concatenates
the long-term and short-term user representations into a unified user vector.

## Global settings and imports

In [1]:
import sys
import os
import numpy as np
import zipfile
from tqdm import tqdm
import scrapbook as sb
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.lstur import LSTURModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set

# Show the Python interpreter and TensorFlow versions used for this run.
print("System version: {}".format(sys.version))
print("Tensorflow version: {}".format(tf.__version__))


System version: 3.9.18 (main, Sep 11 2023, 14:09:26) [MSC v.1916 64 bit (AMD64)]
Tensorflow version: 2.14.0


In [2]:
import pandas as pd
import MMR
import evaluation

## Download and load data

In [4]:
# All data lives in a temporary directory; `tmpdir` is kept at module level
# so the directory object stays referenced for the rest of the notebook.
tmpdir = TemporaryDirectory()
data_path = tmpdir.name
# Options: demo, small, large
MIND_type = 'demo'

# Sub-directories for the three downloaded resources.
train_dir = os.path.join(data_path, 'train')
valid_dir = os.path.join(data_path, 'valid')
utils_dir = os.path.join(data_path, 'utils')

train_news_file = os.path.join(train_dir, 'news.tsv')
train_behaviors_file = os.path.join(train_dir, 'behaviors.tsv')
valid_news_file = os.path.join(valid_dir, 'news.tsv')
valid_behaviors_file = os.path.join(valid_dir, 'behaviors.tsv')
wordEmb_file = os.path.join(utils_dir, 'embedding.npy')
userDict_file = os.path.join(utils_dir, 'uid2index.pkl')
wordDict_file = os.path.join(utils_dir, 'word_dict.pkl')
yaml_file = os.path.join(utils_dir, 'lstur.yaml')

mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)

# Each entry: (file whose presence means "already downloaded",
#              base URL, destination directory, resource name).
pending_downloads = (
    (train_news_file, mind_url, train_dir, mind_train_dataset),
    (valid_news_file, mind_url, valid_dir, mind_dev_dataset),
    (
        yaml_file,
        'https://recodatasets.z20.web.core.windows.net/newsrec/',
        utils_dir,
        mind_utils,
    ),
)
for marker_file, base_url, target_dir, resource_name in pending_downloads:
    # Skip resources that are already on disk.
    if not os.path.exists(marker_file):
        download_deeprec_resources(base_url, target_dir, resource_name)

100%|██████████| 17.0k/17.0k [00:01<00:00, 9.21kKB/s]
100%|██████████| 9.84k/9.84k [00:01<00:00, 5.85kKB/s]
100%|██████████| 95.0k/95.0k [00:27<00:00, 3.47kKB/s]


## Train the model

In [10]:
# Training settings read by train_model().
epochs = 5  # forwarded to prepare_hparams() as `epochs`
seed = 40  # forwarded to LSTURModel as `seed`
batch_size = 32  # forwarded to prepare_hparams() as `batch_size`

def train_model():
    """Build an LSTUR model, print its pre-training evaluation, then fit it.

    Reads the module-level file paths (``yaml_file``, ``wordEmb_file``,
    ``wordDict_file``, ``userDict_file``, the train/valid news and behaviors
    files) and the ``epochs``, ``batch_size`` and ``seed`` settings.

    Returns:
        The ``LSTURModel`` instance after ``fit`` has been called on it.
    """
    hparam_overrides = dict(
        wordEmb_file=wordEmb_file,
        wordDict_file=wordDict_file,
        userDict_file=userDict_file,
        batch_size=batch_size,
        epochs=epochs,
    )
    hyper_params = prepare_hparams(yaml_file, **hparam_overrides)
    print(hyper_params)

    lstur = LSTURModel(hyper_params, MINDIterator, seed=seed)

    # Evaluate once before fitting so the untrained metrics are printed.
    untrained_metrics = lstur.run_eval(valid_news_file, valid_behaviors_file)
    print(untrained_metrics)

    lstur.fit(
        train_news_file,
        train_behaviors_file,
        valid_news_file,
        valid_behaviors_file,
    )
    return lstur


In [11]:
# Keep the trained model in the module-level name `model`; get_df() reads it as a global.
model = train_model()

HParams object with values {'support_quick_scoring': True, 'dropout': 0.2, 'attention_hidden_dim': 200, 'head_num': 4, 'head_dim': 100, 'filter_num': 400, 'window_size': 3, 'vert_emb_dim': 100, 'subvert_emb_dim': 100, 'gru_unit': 400, 'type': 'ini', 'user_emb_dim': 50, 'learning_rate': 0.0001, 'optimizer': 'adam', 'epochs': 5, 'batch_size': 32, 'show_step': 100000, 'title_size': 30, 'his_size': 50, 'data_format': 'news', 'npratio': 4, 'metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'word_emb_dim': 300, 'cnn_activation': 'relu', 'model_type': 'lstur', 'loss': 'cross_entropy_loss', 'wordEmb_file': 'C:\\Users\\Eliza\\AppData\\Local\\Temp\\tmpjylb0wsv\\utils\\embedding.npy', 'wordDict_file': 'C:\\Users\\Eliza\\AppData\\Local\\Temp\\tmpjylb0wsv\\utils\\word_dict.pkl', 'userDict_file': 'C:\\Users\\Eliza\\AppData\\Local\\Temp\\tmpjylb0wsv\\utils\\uid2index.pkl'}
Tensor("conv1d/Relu:0", shape=(None, 30, 400), dtype=float32)
Tensor("att_layer2/Sum_1:0", shape=(None, 400), dtype=float32)


  super().__init__(name, **kwargs)
  updates=self.state_updates,
586it [00:08, 72.63it/s] 
4it [00:02,  1.87it/s]

## Evaluate the model

In [6]:
def eval_model(model):
    """Print the result of ``model.run_eval`` on the validation files.

    Args:
        model: Object exposing ``run_eval(news_file, behaviors_file)``.
    """
    print(model.run_eval(valid_news_file, valid_behaviors_file))


## Save the model

In [7]:
def save_model(model, data_path):
    """Save the wrapped model's weights under ``<data_path>/model``.

    Args:
        model: Object whose ``model`` attribute exposes ``save_weights(path)``.
        data_path: Directory in which the ``model`` sub-directory is created
            (if it does not exist yet).
    """
    checkpoint_dir = os.path.join(data_path, "model")
    checkpoint_prefix = os.path.join(checkpoint_dir, "lstur_ckpt")

    os.makedirs(checkpoint_dir, exist_ok=True)
    model.model.save_weights(checkpoint_prefix)

In [None]:
def write_predictions(model, data_path):
    """Write per-impression candidate ranks to ``prediction.txt`` and zip it.

    Runs ``model.run_fast_eval`` on the validation files, then writes one
    line per impression of the form ``<impression index + 1> [r1,r2,...]``,
    where ``r_j`` is the rank of the j-th candidate (1 = highest predicted
    score). The text file is then stored as ``prediction.txt`` inside
    ``prediction.zip`` in the same directory.

    Args:
        model: Object exposing ``run_fast_eval(news_file, behaviors_file)``
            returning ``(impression indexes, labels, predictions)``.
        data_path: Directory that receives ``prediction.txt`` and
            ``prediction.zip``.
    """
    group_impr_indexes, _, group_preds = model.run_fast_eval(
        valid_news_file, valid_behaviors_file
    )

    prediction_txt = os.path.join(data_path, 'prediction.txt')
    with open(prediction_txt, 'w') as f:
        # The loop must live inside the `with` so the file is still open.
        for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):
            # Double argsort turns scores into ranks; reversing the first
            # argsort makes the highest score rank 1.
            pred_rank = (np.argsort(np.argsort(preds)[::-1]) + 1).tolist()
            rank_text = '[' + ','.join(str(rank) for rank in pred_rank) + ']'
            # Impression ids in the output are 1-based.
            f.write(' '.join([str(impr_index + 1), rank_text]) + '\n')

    prediction_zip = os.path.join(data_path, 'prediction.zip')
    with zipfile.ZipFile(prediction_zip, 'w', zipfile.ZIP_DEFLATED) as archive:
        archive.write(prediction_txt, arcname='prediction.txt')

## Construct dataframes

In [8]:
def get_df():
    """Collect validation predictions into a DataFrame, one row per behaviors line.

    Uses the module-level ``model`` and validation file paths. Each line of
    ``valid_behaviors_file`` becomes one row with:

    * ``user_id``: the user id field of the line,
    * ``news_id``: list of candidate news ids from the impression field,
    * ``label``:   the matching entry of ``group_labels`` from ``run_fast_eval``,
    * ``pred``:    the matching entry of ``group_preds`` from ``run_fast_eval``.

    NOTE(review): rows are matched to ``run_fast_eval`` output by position,
    which assumes the evaluation groups come back in file order -- confirm.

    Returns:
        pandas.DataFrame with columns ``user_id``, ``news_id``, ``label``, ``pred``.
    """
    _, group_labels, group_preds = model.run_fast_eval(valid_news_file, valid_behaviors_file)

    user_ids = []
    news_rec_lists = []
    labels = []
    pred_prob = []
    with open(valid_behaviors_file, 'r') as rd:
        for row, line in enumerate(rd):
            # The last four tab-separated fields are: user id, time,
            # click history, impressions. Only the first and last are used.
            uid, _time, _history, impr = line.strip("\n").split('\t')[-4:]

            # Impression tokens are "<news_id>-<suffix>"; keep the id part.
            user_ids.append(uid)
            news_rec_lists.append([token.split("-")[0] for token in impr.split()])
            labels.append(group_labels[row])
            pred_prob.append(group_preds[row])

    # The original built this frame but never returned it; later cells call
    # `df = get_df()` and also need the `label` column for explode()/agg().
    return pd.DataFrame({
        'user_id': user_ids,
        'news_id': news_rec_lists,
        'label': labels,
        'pred': pred_prob,
    })

In [None]:
def normalize_array(arr):
    """Min-max scale ``arr`` so its smallest value maps to 0 and its largest to 1.

    Args:
        arr: Array-like of numbers.

    Returns:
        ``(arr - min) / (max - min)``. When every value is identical the
        range is zero, so an all-zero float array of the same shape is
        returned instead of dividing by zero (which would yield NaNs).
        An empty input is returned as an empty float array.
    """
    if np.size(arr) == 0:
        return np.asarray(arr, dtype=float)

    lowest = np.min(arr)
    value_range = np.max(arr) - lowest
    if value_range == 0:
        # Constant input: avoid 0/0 -> NaN.
        return np.zeros(np.shape(arr), dtype=float)
    return (arr - lowest) / value_range
    
def normalize_df_for_mmr(df):
    """Return a copy of ``df`` whose ``pred`` entries are passed through ``normalize_array``.

    Args:
        df: DataFrame with a ``pred`` column; each cell is handed to
            ``normalize_array`` on its own.

    Returns:
        A new DataFrame; the input is left untouched.
    """
    scaled = df.copy()
    scaled['pred'] = scaled['pred'].apply(normalize_array)
    return scaled

# Results

### Baseline

In [None]:
# Baseline: evaluate the model's own recommendations before any re-ranking.
# get_df() has to return the per-impression DataFrame for this cell to work.
df = get_df()
# Helpers from the local `MMR` module; their return values are passed to
# evaluation.diversity_eval() and MMR.mmr_all() in this notebook.
news_df = MMR.get_news_df()
glove = MMR.load_glove()
k = 5  # cut-off used for NDCG@k, the baseline slice, and MMR re-ranking

df_at_k = df.copy()
# Keep only the first k news ids of each row.
# NOTE(review): this slices in the stored (impression) order and does not
# sort by `pred` first -- confirm that is the intended baseline top-k.
df_at_k["news_id"] = df["news_id"].apply(lambda x: x[:k])

print(f"NDCG@{k} (baseline): {evaluation.calculate_ndcg_at_k(df_at_k, k)}")
print(f"Diversity (baseline): {evaluation.diversity_eval(glove, news_df, df_at_k)}")

### After re-ranking via MMR
Note that lambda (`lamda` in the code) = 0 means the ranking considers only diversity and ignores relevance, while lambda = 1 means it considers only relevance and ignores diversity.

In [None]:
# Sweep the MMR trade-off parameter and record diversity and NDCG@k for each value.
lamdas = [x/100.0 for x in range(0, 125, 25)]  # 0.0, 0.25, 0.5, 0.75, 1.0
diversities = []
ndcgs = []
# One row per (original row, candidate): the list-valued columns are exploded together.
# NOTE(review): explode() requires `label` to be a column of `df` -- confirm get_df() provides it.
exploded_df = df.copy().reset_index().explode(['pred', 'label', 'news_id']) # split list cells into rows
normalized_df = normalize_df_for_mmr(df)
for i in lamdas:
    print(f"\nReranking with lambda={i}...")
    # Presumably returns a dict keyed by user id (it is loaded with
    # orient="index" and the index is renamed to `user_id` below) -- confirm.
    mmr_rerank_data = MMR.mmr_all(glove, news_df, normalized_df, i, k)
    mmr_rerank_df = pd.DataFrame.from_dict(mmr_rerank_data, orient="index").reset_index()

    diversity = evaluation.diversity_eval(glove, news_df, mmr_rerank_df)
    print(f"Diversity: {diversity}")
    diversities.append(diversity)

    mmr_rerank_df = mmr_rerank_df.rename({"index": "user_id"}, axis=1)
    # Explode every list-valued column so each re-ranked item gets its own row.
    split_df = mmr_rerank_df.set_index(["user_id"]).apply(lambda x: x.explode()).reset_index()
    # Rename so the MMR score does not collide with the model's `pred` in the merge.
    split_df = split_df.rename({"pred": "mmr_pred"}, axis=1)
    # Right join: keep exactly the re-ranked items and attach their labels.
    # NOTE(review): rows are keyed by user_id only; a user with several
    # impressions would have them mixed by this merge/groupby -- confirm.
    mmr_labels = pd.merge(exploded_df, split_df, on=["user_id", "news_id"], how="right")
    # Collapse back to one row per user with list-valued `label` / `mmr_pred`.
    mmr_labels_lists = mmr_labels.groupby("user_id").agg({"label": list, "mmr_pred": list})
    # The NDCG helper is given the MMR scores under the column name `pred`.
    mmr_labels_lists.rename(columns={"mmr_pred": "pred"}, inplace=True)
    ndcg = evaluation.calculate_ndcg_at_k(mmr_labels_lists, k)
    print(f"NDCG@{k}: {ndcg}")
    ndcgs.append(ndcg)

In [None]:
# Graph the NDCG@k values collected per lambda (helper from the local `evaluation` module).
evaluation.graph_ndcg(ndcgs, lamdas, k)

In [None]:
# Graph the diversity values collected per lambda (helper from the local `evaluation` module).
evaluation.graph_diversity(diversities, lamdas)