In [41]:
import os
import tempfile
import shutil
import urllib
import zipfile
from collections import Counter
import pandas as pd
from surprise import Dataset, NormalPredictor, Reader, SVD, SVDpp, accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold


# Temporary folder for data we need during execution of this notebook
temp_dir = os.path.join(tempfile.gettempdir(), 'mind')
os.makedirs(temp_dir, exist_ok=True)

# The dataset is split into training and validation set, each with a large and small version.
# The format of the four files are the same.
# For demonstration purpose, we will use small version validation set only.
base_url = 'https://mind201910small.blob.core.windows.net/release'
training_small_url = f'{base_url}/MINDsmall_train.zip'
validation_small_url = f'{base_url}/MINDsmall_dev.zip'
training_large_url = f'{base_url}/MINDlarge_train.zip'
validation_large_url = f'{base_url}/MINDlarge_dev.zip'

In [24]:
def download_url(url,
                 destination_filename=None,
                 progress_updater=None,
                 force_download=False,
                 verbose=True):
    """
    Download a URL to a temporary file
    """
    if not verbose:
        progress_updater = None
    # This is not intended to guarantee uniqueness, we just know it happens to guarantee
    # uniqueness for this application.
    if destination_filename is None:
        url_as_filename = url.replace('://', '_').replace('/', '_')
        destination_filename = \
            os.path.join(temp_dir,url_as_filename)
    if (not force_download) and (os.path.isfile(destination_filename)):
        if verbose:
            print('Bypassing download of already-downloaded file {}'.format(
                os.path.basename(url)))
        return destination_filename
    if verbose:
        print('Downloading file {} to {}'.format(os.path.basename(url),
                                                 destination_filename),
              end='')
    urllib.request.urlretrieve(url, destination_filename, progress_updater)
    assert (os.path.isfile(destination_filename))
    nBytes = os.path.getsize(destination_filename)
    if verbose:
        print('...done, {} bytes.'.format(nBytes))
    return destination_filename

In [10]:
# Download the small training data set
zip_path = download_url(training_small_url, force_download=True, verbose=True)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(temp_dir)

os.listdir(temp_dir)

Downloading file MINDsmall_train.zip to /var/folders/yw/dt283m09041fwfpmn957qbt80000gn/T/mind/https_mind201910small.blob.core.windows.net_release_MINDsmall_train.zip...done, 52952752 bytes.


['behaviors.tsv',
 'news.tsv',
 'https_mind201910small.blob.core.windows.net_release_MINDsmall_train.zip',
 'MINDsmall_train.zip',
 'MINDsmall_dev.zip',
 'entity_embedding.vec',
 'relation_embedding.vec']

In [33]:
# The behaviors.tsv file contains the impression logs and users' news click histories. 
# It has 5 columns divided by the tab symbol:
# - Impression ID. The ID of an impression.
# - User ID. The anonymous ID of a user.
# - Time. The impression time with format "MM/DD/YYYY HH:MM:SS AM/PM".
# - History. The news click history (ID list of clicked news) of this user before this impression.
# - Impressions. List of news displayed in this impression and user's click behaviors on them (1 for click and 0 for non-click).
behaviors_path = os.path.join(temp_dir, 'behaviors.tsv')
df = pd.read_table(
    behaviors_path,
    header=None,
    names=['impression_id', 'user_id', 'time', 'history', 'impressions'])

In [34]:
# group the impressions by user id
user_df = df.groupby(['user_id']).agg(lambda x: x.sum())

# separate out the articles in each impression
impression_table = user_df['impressions'].apply(lambda x : pd.Series(x.split(" ")))

# stack the impressions
stacked_imps = pd.DataFrame(impression_table.stack())
stacked_imps = stacked_imps.rename(columns = {0: "item_id"})
data = stacked_imps.reset_index()
data = data.drop("level_1", axis=1)

data

Unnamed: 0,user_id,item_id
0,U100,N61235-0
1,U100,N54489-0
2,U100,N42597-0
3,U100,N7800-1
4,U100,N61408-0
...,...,...
5736474,U9999,N41224-0
5736475,U9999,N27349-0
5736476,U9999,N61022-1
5736477,U9999,N11830-0


In [35]:
data["click"] = data["item_id"].apply(lambda x : x[-1])
data["item_id"] = data["item_id"].apply(lambda x : x[:-2])
# data frame with userid, itemid, click (0 or 1)
data

Unnamed: 0,user_id,item_id,click
0,U100,N61235,0
1,U100,N54489,0
2,U100,N42597,0
3,U100,N7800,1
4,U100,N61408,0
...,...,...,...
5736474,U9999,N41224,0
5736475,U9999,N27349,0
5736476,U9999,N61022,1
5736477,U9999,N11830,0


In [36]:
# load the data into a surprise Dataset structure
reader = Reader(rating_scale=(0, 1))
d = Dataset.load_from_df(data[["user_id", "item_id", "click"]], reader)

In [38]:
kf = KFold(n_splits=3)

algo = SVD()

# for trainset, testset in kf.split(d):

#     # train and test algorithm.
#     algo.fit(trainset)
#     predictions = algo.test(testset)

#     # Compute and print Root Mean Squared Error
#     accuracy.rmse(predictions, verbose=True)

# Run 5-fold cross-validation and print results
cross_validate(algo, d, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1972  0.1976  0.1974  0.1968  0.1966  0.1971  0.0003  
MAE (testset)     0.0828  0.0828  0.0826  0.0825  0.0827  0.0827  0.0001  
Fit time          112.87  109.65  91.58   118.82  132.70  113.13  13.36   
Test time         67.58   57.00   82.77   153.70  177.51  107.71  48.56   


{'test_rmse': array([0.19719213, 0.19756628, 0.19738237, 0.19682735, 0.19662023]),
 'test_mae': array([0.08280729, 0.08281411, 0.08263376, 0.0825046 , 0.0826877 ]),
 'fit_time': (112.87474822998047,
  109.65092611312866,
  91.57648611068726,
  118.8233060836792,
  132.70151209831238),
 'test_time': (67.58389234542847,
  56.99598002433777,
  82.76572227478027,
  153.6970407962799,
  177.50745224952698)}

In [42]:
kf = KFold(n_splits=3)

algo = SVDpp()

for trainset, testset in kf.split(d):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)

KeyboardInterrupt: 