In [None]:
import itertools
import os
import tempfile
import urllib
import urllib.request
import zipfile

import pandas as pd

# Scratch folder for the files this notebook downloads and extracts. It lives under the
# system temp directory and is meant to be cleaned up at the end of the notebook.
temp_dir = os.path.join(tempfile.gettempdir(), 'mind')
os.makedirs(temp_dir, exist_ok=True)

# The dataset comes as a training set and a validation set, each in a large and a small
# version. All four archives share the same format.
# For demonstration purposes only the small validation set is used.
base_url = 'https://mind201910small.blob.core.windows.net/release'
training_small_url = base_url + '/MINDsmall_train.zip'
validation_small_url = base_url + '/MINDsmall_dev.zip'
training_large_url = base_url + '/MINDlarge_train.zip'
validation_large_url = base_url + '/MINDlarge_dev.zip'

def download_url(url,
                 destination_filename=None,
                 progress_updater=None,
                 force_download=False,
                 verbose=True):
    """
    Download a URL to a local file and return the path of that file.

    Args:
        url: address of the file to fetch.
        destination_filename: where to store the download. When None, a name
            derived from the URL is used inside the module-level ``temp_dir``.
        progress_updater: optional callback handed to ``urlretrieve`` as its
            report hook; ignored when ``verbose`` is False.
        force_download: fetch the file again even if the destination exists.
        verbose: print progress messages.

    Returns:
        Path of the downloaded (or already present) file.

    Raises:
        OSError: if the download fails (``urllib.error.URLError`` is a subclass).
    """
    if not verbose:
        progress_updater = None

    # This is not intended to guarantee uniqueness, we just know it happens to guarantee
    # uniqueness for this application.
    if destination_filename is None:
        url_as_filename = url.replace('://', '_').replace('/', '_')
        destination_filename = os.path.join(temp_dir, url_as_filename)

    if (not force_download) and os.path.isfile(destination_filename):
        if verbose:
            print('Bypassing download of already-downloaded file {}'.format(
                os.path.basename(url)))
        return destination_filename

    if verbose:
        print('Downloading file {} to {}'.format(os.path.basename(url),
                                                 destination_filename),
              end='')

    # Download next to the destination and move into place only on success, so an
    # interrupted transfer never leaves a truncated file that a later call would
    # mistake for an already-downloaded one.
    partial_filename = destination_filename + '.part'
    try:
        urllib.request.urlretrieve(url, partial_filename, progress_updater)
        os.replace(partial_filename, destination_filename)
    finally:
        # After a successful move the partial file is gone and this is a no-op.
        if os.path.isfile(partial_filename):
            os.remove(partial_filename)

    n_bytes = os.path.getsize(destination_filename)
    if verbose:
        print('...done, {} bytes.'.format(n_bytes))
    return destination_filename

# Fetch the small validation archive (the download is skipped when the file is already
# on disk) and unpack it into the scratch folder.
zip_path = download_url(validation_small_url, verbose=True)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(temp_dir)

os.listdir(temp_dir)

# The file is read without a header row, so the column names are supplied here.
news_path = os.path.join(temp_dir, 'news.tsv')
news_df = pd.read_table(news_path,
              header=None,
              names=[
                  'id', 'category', 'subcategory', 'title', 'abstract', 'url',
                  'title_entities', 'abstract_entities'
              ])

# Label the rows by news id so that lookups such as `nid in news_df.index` and
# `news_df.loc[nid]` work; with the default integer index every news id would be
# reported as missing. The 'id' column is kept, and the index is left unnamed to
# avoid an ambiguous 'id' label on both the index and a column.
news_df = news_df.set_index('id', drop=False).rename_axis(None)

In [None]:
import torch
import torchtext

# Pre-trained GloVe word vectors (the "6B" set, 50 dimensions per word); indexed by
# token, e.g. glove[category], in get_category_similarity.
# NOTE(review): torchtext presumably downloads and caches the vector files on first
# use — confirm before running offline.
glove = torchtext.vocab.GloVe(name="6B", dim=50)

In [None]:
def get_category_similarity(nid_1, nid_2):
    """
    Return the category distance between two news items.

    Despite the name, the result is ``1 - cosine_similarity`` of the GloVe vectors
    of the two items' categories, so larger values mean more dissimilar categories.

    Args:
        nid_1: label of the first news item in ``news_df.index``.
        nid_2: label of the second news item in ``news_df.index``.

    Returns:
        0 if either label is not present in ``news_df.index``; otherwise the
        distance as a Python float.
    """
    known_ids = news_df.index
    if not (nid_1 in known_ids and nid_2 in known_ids):
        return 0

    # NOTE(review): a category string missing from the GloVe vocabulary presumably
    # maps to torchtext's default unknown-token vector — confirm.
    first_vec, second_vec = (
        glove[news_df.loc[nid]["category"]].unsqueeze(0)
        for nid in (nid_1, nid_2)
    )
    cosine = torch.cosine_similarity(first_vec, second_vec)
    return 1 - cosine.item()


def diversity_user(recs):
    """
    Return the diversity of one recommendation list.

    The score is the mean of ``get_category_similarity`` over every unordered pair
    of items in ``recs``.

    Args:
        recs: iterable of news ids recommended to a single user.

    Returns:
        The mean pairwise score, or 0.0 when ``recs`` holds fewer than two items
        (there are no pairs to compare, which previously caused a division by zero).
    """
    pair_scores = [
        get_category_similarity(first, second)
        for first, second in itertools.combinations(recs, 2)
    ]
    if not pair_scores:
        return 0.0
    return sum(pair_scores) / len(pair_scores)
        
# Calculates the average recommendation diversity over all users
# df is a Pandas dataframe with a news_id col; each row's news_id holds the list of
# news ids recommended to one user (other cols such as user or pred are not read)
def diversity_eval(df):
    """
    Return the mean of ``diversity_user`` over all rows of ``df``.

    Args:
        df: dataframe with a ``news_id`` column holding one recommendation list
            per row.

    Returns:
        The average per-row diversity, or 0.0 when ``df`` has no rows (previously
        a division by zero).
    """
    user_scores = [diversity_user(recs) for recs in df['news_id']]
    if not user_scores:
        return 0.0
    return sum(user_scores) / len(user_scores)