In [1]:
import os
import tempfile
import shutil
import urllib
import zipfile
from collections import Counter
import pandas as pd
from sklearn.metrics import roc_auc_score
from surprise import Dataset, NormalPredictor, Reader, SVD, SVDpp, accuracy
from surprise.model_selection import cross_validate, KFold, train_test_split
from microsoft.evaluation import (rmse, mae, rsquared, exp_var, auc, map_at_k, ndcg_at_k, precision_at_k, recall_at_k, get_top_k_items)
from microsoft.surprise_utils import predict, compute_ranking_predictions
from microsoft.python_splitters import python_random_split


# Temporary folder for data we need during execution of this notebook
temp_dir = os.path.join(tempfile.gettempdir(), 'mind')
os.makedirs(temp_dir, exist_ok=True)

# The dataset is split into training and validation set, each with a large and small version.
# The format of the four files are the same.
# For demonstration purpose, we will use small version validation set only.
base_url = 'https://mind201910small.blob.core.windows.net/release'
training_small_url = f'{base_url}/MINDsmall_train.zip'
validation_small_url = f'{base_url}/MINDsmall_dev.zip'
training_large_url = f'{base_url}/MINDlarge_train.zip'
validation_large_url = f'{base_url}/MINDlarge_dev.zip'

In [None]:
def download_url(url,
                 destination_filename=None,
                 progress_updater=None,
                 force_download=False,
                 verbose=True):
    """
    Download a URL to a temporary file
    """
    if not verbose:
        progress_updater = None
    # This is not intended to guarantee uniqueness, we just know it happens to guarantee
    # uniqueness for this application.
    if destination_filename is None:
        url_as_filename = url.replace('://', '_').replace('/', '_')
        destination_filename = \
            os.path.join(temp_dir,url_as_filename)
    if (not force_download) and (os.path.isfile(destination_filename)):
        if verbose:
            print('Bypassing download of already-downloaded file {}'.format(
                os.path.basename(url)))
        return destination_filename
    if verbose:
        print('Downloading file {} to {}'.format(os.path.basename(url),
                                                 destination_filename),
              end='')
    urllib.request.urlretrieve(url, destination_filename, progress_updater)
    assert (os.path.isfile(destination_filename))
    nBytes = os.path.getsize(destination_filename)
    if verbose:
        print('...done, {} bytes.'.format(nBytes))
    return destination_filename

In [None]:
# Download the small training data set
zip_path = download_url(training_small_url, force_download=True, verbose=True)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(temp_dir)

os.listdir(temp_dir)

In [2]:
# The behaviors.tsv file contains the impression logs and users' news click histories. 
# It has 5 columns divided by the tab symbol:
# - Impression ID. The ID of an impression.
# - User ID. The anonymous ID of a user.
# - Time. The impression time with format "MM/DD/YYYY HH:MM:SS AM/PM".
# - History. The news click history (ID list of clicked news) of this user before this impression.
# - Impressions. List of news displayed in this impression and user's click behaviors on them (1 for click and 0 for non-click).
behaviors_path = os.path.join(temp_dir, 'behaviors.tsv')
df = pd.read_table(
    behaviors_path,
    header=None,
    names=['impression_id', 'user_id', 'time', 'history', 'impressions'])

In [3]:
# group the impressions by user id
user_df = df.groupby(['user_id']).agg(lambda x: x.sum())

# separate out the articles in each impression
impression_table = user_df['impressions'].apply(lambda x : pd.Series(x.split(" ")))

# stack the impressions
stacked_imps = pd.DataFrame(impression_table.stack())
stacked_imps = stacked_imps.rename(columns = {0: "item_id"})
data = stacked_imps.reset_index()
data = data.drop("level_1", axis=1)

data

Unnamed: 0,user_id,item_id
0,U100,N61235-0
1,U100,N54489-0
2,U100,N42597-0
3,U100,N7800-1
4,U100,N61408-0
...,...,...
5736474,U9999,N41224-0
5736475,U9999,N27349-0
5736476,U9999,N61022-1
5736477,U9999,N11830-0


In [4]:
data["click"] = data["item_id"].apply(lambda x : x[-1])
data["item_id"] = data["item_id"].apply(lambda x : x[:-2])
# data frame with userid, itemid, click (0 or 1)
data

Unnamed: 0,user_id,item_id,click
0,U100,N61235,0
1,U100,N54489,0
2,U100,N42597,0
3,U100,N7800,1
4,U100,N61408,0
...,...,...,...
5736474,U9999,N41224,0
5736475,U9999,N27349,0
5736476,U9999,N61022,1
5736477,U9999,N11830,0


In [5]:
train, test = python_random_split(data, 0.75)

In [6]:
# load the data into a surprise Dataset structure
reader = Reader(rating_scale=(0, 1))
trainset = Dataset.load_from_df(train[["user_id", "item_id", "click"]], reader).build_full_trainset()

In [7]:
svd = SVD()

svd.fit(trainset)

# train, test = pd.DataFrame(trainset, columns=["user_id", "item_id", "click"]), pd.DataFrame(testset, columns=["user_id", "item_id", "click"])

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fef26bd6820>

In [8]:
predictions = predict(svd, test, usercol='user_id', itemcol='item_id')
# all_predictions = compute_ranking_predictions(svd, train, usercol='user_id', itemcol='item_id', remove_seen=True)
predictions.head()

Unnamed: 0,user_id,item_id,prediction
0,U49583,N17031,0.042131
1,U63308,N33831,0.013821
2,U19,N55132,0.021394
3,U78991,N47748,0.171044
4,U81537,N4912,0.0


In [9]:
eval_rmse = rmse(test, predictions, col_user="user_id", col_item="item_id", col_rating="click")
eval_mae = mae(test, predictions, col_user="user_id", col_item="item_id", col_rating="click")
eval_rsquared = rsquared(test, predictions, col_user="user_id", col_item="item_id", col_rating="click")
eval_exp_var = exp_var(test, predictions, col_user="user_id", col_item="item_id", col_rating="click")
eval_auc = auc(test, predictions, col_user="user_id", col_item="item_id", col_rating="click")

# k = 10
# eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=k)
# eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=k)
# eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=k)
# eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=k)


print("RMSE:\t\t%f" % eval_rmse,
      "MAE:\t\t%f" % eval_mae,
      "rsquared:\t%f" % eval_rsquared,
      "exp var:\t%f" % eval_exp_var,
      "AUC:\t%f" % eval_auc, sep='\n')

# print('----')

# print("MAP:\t%f" % eval_map,
#       "NDCG:\t%f" % eval_ndcg,
#       "Precision@K:\t%f" % eval_precision,
#       "Recall@K:\t%f" % eval_recall, sep='\n')

RMSE:		0.195397
MAE:		0.082135
rsquared:	-0.034765
exp var:	-0.029838
AUC:	0.674135


In [None]:
kf = KFold(n_splits=3)

algo = SVD()

for trainset, testset in kf.split(d):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    print(auc(testset, predictions))

# Run 5-fold cross-validation and print results
# cross_validate(algo, d, measures=["RMSE", "MAE"], cv=5, verbose=True)

In [None]:
for _ in range(3):
    algo = SVD()

    trainset, testset = train_test_split(d, test_size=0.1)

    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    preds = pd.DataFrame(predictions)
    preds = preds.rename(columns={"uid" : "user_id", "iid" : "item_id", "r_ui" : "click"})
    true = pd.DataFrame(testset, columns=["user_id", "item_id", "click"])
    print("AUC: " + str(auc(true, preds, col_user="user_id", col_item="item_id", col_rating="click", col_prediction="est")))
    print("\n")

# Compute and print Root Mean Squared Error
# roc_auc_score(testset, predictions, )