# News Recommendation using Deep Learning

## Global settings and imports

In [1]:
!pip install scrapbook
!pip install recommenders
!pip install scrapbook
!pip install recommenders
!pip install tensorflow
!pip install zipfile
!pip install tqdm
!pip install tempfile

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scrapbook
  Downloading scrapbook-0.5.0-py3-none-any.whl (34 kB)
Collecting papermill
  Downloading papermill-2.3.4-py3-none-any.whl (37 kB)
Collecting ansiwrap
  Downloading ansiwrap-0.8.4-py2.py3-none-any.whl (8.5 kB)
Collecting jupyter-client>=6.1.5
  Downloading jupyter_client-7.3.4-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 64.8 MB/s 
Collecting traitlets>=4.2
  Downloading traitlets-5.3.0-py3-none-any.whl (106 kB)
[K     |████████████████████████████████| 106 kB 70.0 MB/s 
Collecting tornado>=6.0
  Downloading tornado-6.1-cp37-cp37m-manylinux2010_x86_64.whl (428 kB)
[K     |████████████████████████████████| 428 kB 3.3 MB/s 
Collecting textwrap3>=0.9.2
  Downloading textwrap3-0.9.2-py2.py3-none-any.whl (12 kB)
Installing collected packages: traitlets, tornado, textwrap3, jupyter-client, ansiwrap, papermill, scrapbook
  Attempting uninstal

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting recommenders
  Downloading recommenders-1.1.0-py3-none-manylinux1_x86_64.whl (335 kB)
[K     |████████████████████████████████| 335 kB 34.5 MB/s 
[?25hCollecting memory-profiler<1,>=0.54.0
  Downloading memory_profiler-0.60.0.tar.gz (38 kB)
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Collecting cornac<2,>=1.1.2
  Downloading cornac-1.14.2-cp37-cp37m-manylinux1_x86_64.whl (12.4 MB)
[K     |████████████████████████████████| 12.4 MB 27.3 MB/s 
Collecting lightfm<2,>=1.15
  Downloading lightfm-1.16.tar.gz (310 kB)
[K     |████████████████████████████████| 310 kB 68.6 MB/s 
[?25hCollecting pandera[strategies]>=0.6.5
  Downloading pandera-0.9.0-py3-none-any.whl (197 kB)
[K     |████████████████████████████████| 197 kB 66.3 MB/s 
Collecting scikit-surprise>=1.0.6
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████

In [2]:
import sys
import os
import numpy as np
import zipfile
from tqdm import tqdm
import scrapbook as sb
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.nrms import NRMSModel
from recommenders.models.newsrec.io.mind_iterator import MINDIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set

# Report the Python and TensorFlow versions used for this run.
print(f"System version: {sys.version}")
print(f"Tensorflow version: {tf.__version__}")


System version: 3.7.13 (default, Apr 24 2022, 01:04:09) 
[GCC 7.5.0]
Tensorflow version: 2.8.2


## Prepare parameters

In [3]:
# Number of training epochs; forwarded to prepare_hparams in the hyper-parameter cell.
epochs = 10
# Seed handed to the NRMSModel constructor.
seed = 1
# Batch size; forwarded to prepare_hparams in the hyper-parameter cell.
batch_size = 64

# Which MIND dataset variant to use; passed to get_mind_data_set in the download cell.
# Options: demo, small, large
MIND_type = 'small'

## Download and load data

In [5]:
# Everything downloaded or produced by this notebook lives under a fresh temporary directory.
tmpdir = TemporaryDirectory()
data_path = tmpdir.name

# Sub-directories for the training split, the validation split and the shared utility files.
train_dir = os.path.join(data_path, 'train')
valid_dir = os.path.join(data_path, 'valid')
utils_dir = os.path.join(data_path, 'utils')

train_news_file = os.path.join(train_dir, 'news.tsv')
train_behaviors_file = os.path.join(train_dir, 'behaviors.tsv')
valid_news_file = os.path.join(valid_dir, 'news.tsv')
valid_behaviors_file = os.path.join(valid_dir, 'behaviors.tsv')
wordEmb_file = os.path.join(utils_dir, 'embedding.npy')
userDict_file = os.path.join(utils_dir, 'uid2index.pkl')
wordDict_file = os.path.join(utils_dir, 'word_dict.pkl')
yaml_file = os.path.join(utils_dir, 'nrms.yaml')

mind_url, mind_train_dataset, mind_dev_dataset, mind_utils = get_mind_data_set(MIND_type)

# Each entry: (file whose presence means "already downloaded", base URL, target directory, remote name).
# The entries are processed in order, and a download is skipped when its marker file already exists.
pending_downloads = (
    (train_news_file, mind_url, train_dir, mind_train_dataset),
    (valid_news_file, mind_url, valid_dir, mind_dev_dataset),
    (yaml_file, 'https://recodatasets.z20.web.core.windows.net/newsrec/', utils_dir, mind_utils),
)
for marker_file, base_url, target_dir, remote_name in pending_downloads:
    if not os.path.exists(marker_file):
        download_deeprec_resources(base_url, target_dir, remote_name)

100%|██████████| 51.7k/51.7k [00:09<00:00, 5.74kKB/s]
100%|██████████| 30.2k/30.2k [00:35<00:00, 848KB/s]
100%|██████████| 152k/152k [00:21<00:00, 6.92kKB/s]


## Create hyper-parameters

In [6]:
# Keyword settings handed to prepare_hparams together with the YAML config file.
hparam_overrides = dict(
    wordEmb_file=wordEmb_file,
    wordDict_file=wordDict_file,
    userDict_file=userDict_file,
    batch_size=batch_size,
    epochs=epochs,
    show_step=10,
)
hparams = prepare_hparams(yaml_file, **hparam_overrides)
print(hparams)

HParams object with values {'support_quick_scoring': True, 'dropout': 0.2, 'attention_hidden_dim': 200, 'head_num': 20, 'head_dim': 20, 'filter_num': 200, 'window_size': 3, 'vert_emb_dim': 100, 'subvert_emb_dim': 100, 'gru_unit': 400, 'type': 'ini', 'user_emb_dim': 50, 'learning_rate': 0.0001, 'optimizer': 'adam', 'epochs': 10, 'batch_size': 64, 'show_step': 10, 'title_size': 30, 'his_size': 50, 'data_format': 'news', 'npratio': 4, 'metrics': ['group_auc', 'mean_mrr', 'ndcg@5;10'], 'word_emb_dim': 300, 'model_type': 'nrms', 'loss': 'cross_entropy_loss', 'wordEmb_file': '/tmp/tmpeo313ipt/utils/embedding.npy', 'wordDict_file': '/tmp/tmpeo313ipt/utils/word_dict.pkl', 'userDict_file': '/tmp/tmpeo313ipt/utils/uid2index.pkl'}


## Train the NRMS model

In [7]:
# Bind the iterator class itself (it is not instantiated here); it is passed to NRMSModel in the next cell.
iterator = MINDIterator

In [8]:
# Build the NRMS model from the hyper-parameters, the iterator class and the seed set in the parameters cell.
model = NRMSModel(hparams, iterator, seed=seed)

  super(Adam, self).__init__(name, **kwargs)


In [9]:
# Evaluate on the validation files before any training has run, and print the returned metrics.
print(model.run_eval(valid_news_file, valid_behaviors_file))

  updates=self.state_updates,
663it [00:04, 140.60it/s]
1143it [01:16, 14.94it/s]
73152it [00:08, 8413.92it/s]


{'group_auc': 0.4926, 'mean_mrr': 0.2101, 'ndcg@5': 0.2115, 'ndcg@10': 0.2736}


In [10]:
%%time
# Train on the training files; the validation files are passed along as well.
# NOTE(review): the recorded output shows this run was interrupted during epoch 9 of 10 — confirm that is intended.
model.fit(train_news_file, train_behaviors_file, valid_news_file, valid_behaviors_file)

step 3690 , total_loss: 1.4105, data_loss: 1.2759: : 3693it [19:43,  3.12it/s]
663it [00:01, 533.22it/s]
1143it [01:17, 14.75it/s]
73152it [00:08, 8485.44it/s]


at epoch 1
train info: logloss loss:1.4104488518642857
eval info: group_auc:0.6298, mean_mrr:0.2904, ndcg@10:0.381, ndcg@5:0.3151
at epoch 1 , train time: 1183.7 eval time: 147.6


step 3690 , total_loss: 1.3084, data_loss: 1.4897: : 3693it [19:50,  3.10it/s]
663it [00:01, 537.28it/s]
1143it [01:17, 14.77it/s]
73152it [00:08, 8508.10it/s]


at epoch 2
train info: logloss loss:1.308419421989034
eval info: group_auc:0.6374, mean_mrr:0.2966, ndcg@10:0.389, ndcg@5:0.3227
at epoch 2 , train time: 1191.0 eval time: 146.5


step 3690 , total_loss: 1.2752, data_loss: 1.2860: : 3693it [19:34,  3.14it/s]
663it [00:01, 533.89it/s]
1143it [01:17, 14.80it/s]
73152it [00:08, 8520.96it/s]


at epoch 3
train info: logloss loss:1.2751428933444464
eval info: group_auc:0.6524, mean_mrr:0.3089, ndcg@10:0.403, ndcg@5:0.3375
at epoch 3 , train time: 1174.3 eval time: 148.0


step 3690 , total_loss: 1.2496, data_loss: 1.2016: : 3693it [19:30,  3.15it/s]
663it [00:01, 546.00it/s]
1143it [01:17, 14.77it/s]
73152it [00:08, 8825.41it/s]


at epoch 4
train info: logloss loss:1.2496127263821144
eval info: group_auc:0.6608, mean_mrr:0.3141, ndcg@10:0.4097, ndcg@5:0.345
at epoch 4 , train time: 1170.7 eval time: 146.2


step 3690 , total_loss: 1.2324, data_loss: 1.1904: : 3693it [19:31,  3.15it/s]
663it [00:01, 536.32it/s]
1143it [01:17, 14.73it/s]
73152it [00:08, 8473.84it/s]


at epoch 5
train info: logloss loss:1.2323803440394068
eval info: group_auc:0.6569, mean_mrr:0.3153, ndcg@10:0.4097, ndcg@5:0.3459
at epoch 5 , train time: 1171.3 eval time: 145.4


step 3690 , total_loss: 1.2157, data_loss: 1.1519: : 3693it [19:31,  3.15it/s]
663it [00:01, 538.28it/s]
1143it [01:17, 14.74it/s]
73152it [00:08, 8764.57it/s]


at epoch 6
train info: logloss loss:1.2157036740057454
eval info: group_auc:0.6586, mean_mrr:0.3193, ndcg@10:0.4136, ndcg@5:0.3507
at epoch 6 , train time: 1171.9 eval time: 144.9


step 3690 , total_loss: 1.1982, data_loss: 1.1477: : 3693it [19:31,  3.15it/s]
663it [00:01, 544.43it/s]
1143it [01:17, 14.74it/s]
73152it [00:08, 8807.37it/s]


at epoch 7
train info: logloss loss:1.1982101748508067
eval info: group_auc:0.6616, mean_mrr:0.3197, ndcg@10:0.4141, ndcg@5:0.3511
at epoch 7 , train time: 1171.2 eval time: 145.0


step 3690 , total_loss: 1.1832, data_loss: 1.0481: : 3693it [19:30,  3.15it/s]
663it [00:01, 536.41it/s]
1143it [01:17, 14.73it/s]
73152it [00:08, 8652.74it/s]


at epoch 8
train info: logloss loss:1.1832293422603684
eval info: group_auc:0.6625, mean_mrr:0.3202, ndcg@10:0.4146, ndcg@5:0.3509
at epoch 8 , train time: 1170.9 eval time: 145.6


step 500 , total_loss: 1.1546, data_loss: 1.2591: : 501it [02:39,  3.14it/s]


KeyboardInterrupt: ignored

In [11]:
%%time
# Evaluate the model on the validation files again and keep the metrics in res_syn for the next cell.
res_syn = model.run_eval(valid_news_file, valid_behaviors_file)
print(res_syn)


663it [00:01, 498.37it/s]
1143it [01:17, 14.70it/s]
73152it [00:08, 8603.88it/s]


{'group_auc': 0.6607, 'mean_mrr': 0.3207, 'ndcg@5': 0.352, 'ndcg@10': 0.4144}
CPU times: user 2min 28s, sys: 11.4 s, total: 2min 40s
Wall time: 2min 26s


In [12]:
# Hand the evaluation metrics to scrapbook under the name "res_syn".
sb.glue("res_syn", res_syn)

## Save the model

In [13]:
# Weights are written under <data_path>/model. data_path is the TemporaryDirectory created in the
# download cell, so these files live inside that temporary directory.
model_path = os.path.join(data_path, "model")
os.makedirs(model_path, exist_ok=True)

# "nrms_ckpt" is the path prefix passed to save_weights on the underlying model object.
model.model.save_weights(os.path.join(model_path, "nrms_ckpt"))

## Prediction


In [14]:
# Run the fast evaluation path on the validation files and unpack its three results;
# the indexes and predictions are consumed by the prediction-writing cell below.
group_impr_indexes, group_labels, group_preds = model.run_fast_eval(valid_news_file, valid_behaviors_file)

663it [00:01, 483.29it/s]
1143it [01:18, 14.64it/s]
73152it [00:08, 8709.34it/s]


In [15]:
# Write one line per impression: "<impression index + 1> [r1,r2,...]", where r_k is the
# rank (1 = highest score) of the k-th candidate within that impression.
prediction_txt = os.path.join(data_path, 'prediction.txt')
with open(prediction_txt, 'w') as f:
    for impr_index, preds in tqdm(zip(group_impr_indexes, group_preds)):
        impr_index += 1
        # Candidate positions sorted from highest to lowest score.
        descending_order = np.argsort(preds)[::-1]
        # argsort of that ordering yields each candidate's 0-based position in it; +1 makes ranks start at 1.
        pred_rank = (np.argsort(descending_order) + 1).tolist()
        pred_rank = '[' + ','.join(map(str, pred_rank)) + ']'
        f.write('{} {}\n'.format(impr_index, pred_rank))

73152it [00:01, 54746.82it/s]


In [16]:
# Pack prediction.txt into prediction.zip (deflate-compressed), stored under the bare
# name 'prediction.txt'. The with-block closes the archive even if writing raises,
# which the previous manual open/close did not guarantee.
with zipfile.ZipFile(os.path.join(data_path, 'prediction.zip'), 'w', zipfile.ZIP_DEFLATED) as f:
    f.write(os.path.join(data_path, 'prediction.txt'), arcname='prediction.txt')

## Reference
\[1\] Wu et al. "Neural News Recommendation with Multi-Head Self-Attention." In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), 2019.<br>
\[2\] Wu, Fangzhao, et al. "MIND: A Large-scale Dataset for News Recommendation." In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, 2020. https://msnews.github.io/competition.html <br>
\[3\] GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/projects/glove/ <br>
\[4\] https://github.com/microsoft/recommenders/