In [None]:
import os
import json
import pickle
import random

import sys
import pandas as pd
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.datasets import movielens
from recommenders.utils.notebook_utils import is_jupyter
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)

import numpy as np
import pandas as pd
from fastparquet import ParquetFile

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))
print("Tensorflow version: {}".format(tf.__version__))

In [None]:
def evaluation_metrics(model,predicted_ranking, targets):
    """
    Compute mean recall, mean NDCG and the model loss for a set of recommendations.

    Parameters
    ----------
    model : object
        Forwarded unchanged to ``compute_loss_function``.
    predicted_ranking : pandas.DataFrame
        Needs ``user_id`` and ``item_id`` columns. The rows of each user are
        read in frame order and treated as that user's ranked list.
    targets : pandas.DataFrame
        Needs ``user_id`` and ``item_id`` columns; the items each user should
        have been recommended.

    Returns
    -------
    dict
        ``{'recall': ..., 'ndcg': ..., 'loss': ...}`` where recall and ndcg are
        averaged over the users found in ``targets`` and loss is whatever
        ``compute_loss_function(model)`` returns.
    """
    # Index the recommendations by user once, instead of re-scanning the whole
    # frame with a boolean mask for every user.
    ranked_by_user = {
        user_id: ranked_items.values
        for user_id, ranked_items in predicted_ranking.groupby('user_id', sort=False)['item_id']
    }

    recall = list()
    ndcg = list()
    # sort=False keeps users in order of first appearance, like Series.unique().
    for user_id, target_items in targets.groupby('user_id', sort=False)['item_id']:
        # we compute recall and ndcg for each user and then aggregate
        user_targets = target_items.values
        target_set = set(user_targets)
        # Users without any recommendation get an empty ranking (recall = ndcg = 0).
        user_predicted_ranking = ranked_by_user.get(user_id, ())

        # compute recall
        num_hit = len(set(user_predicted_ranking).intersection(target_set))
        recall.append(float(num_hit) / len(user_targets))

        # binary relevance of each recommended item, in ranked order
        recom_relevance = np.array(
            [1. if item_id in target_set else 0. for item_id in user_predicted_ranking]
        )

        # compute ndcg
        if np.sum(recom_relevance)==0.0:
            ndcg.append(0.0)
        else:
            # NOTE(review): the "ideal" ranking is the best re-ordering of the
            # recommended list itself, not of the user's full target set, so a
            # list with a single hit at rank 1 scores 1.0 — confirm this is the
            # intended normalisation.
            ideal_relevance = -np.sort(-recom_relevance)
            discounts = np.log2(1+np.arange(1,len(recom_relevance)+1))
            recom_dcg = np.sum(recom_relevance/discounts)
            ideal_dcg = np.sum(ideal_relevance/discounts)
            ndcg.append(recom_dcg/ideal_dcg)

    recall = np.array(recall).squeeze()
    ndcg = np.array(ndcg).squeeze()

    # compute loss function
    loss = compute_loss_function(model)

    return {'recall':np.mean(recall), 'ndcg':np.mean(ndcg), 'loss':loss}

In [None]:
def get_top_k(pred_df, k, column):
    """
    Get the top-k items for each user according to the prediction score.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Must contain a ``userID`` column and the score column ``column``.
    k : int
        Number of rows to keep per user.
    column : str
        Name of the column holding the score to rank by (higher is better).

    Returns
    -------
    pandas.DataFrame
        All columns of ``pred_df``, at most ``k`` rows per user, ordered by
        ``userID`` and then by descending score, with a fresh 0..n-1 index.
    """
    # Sort + groupby.head replaces groupby.apply(nlargest): apply on a frame
    # that includes the grouping column is deprecated in recent pandas and
    # would eventually drop `userID` from the result. The sort is stable, so
    # tied scores keep their original relative order (same as nlargest's
    # keep='first').
    ranked = pred_df.sort_values(['userID', column], ascending=[True, False], kind='stable')
    top_k_df = ranked.groupby('userID', sort=False).head(k).reset_index(drop=True)
    return top_k_df

In [None]:
def random_recommend(users,training_df,k, candidate_items=None):
    """
    Recommend up to ``k`` randomly chosen, not-yet-seen items to each user.

    Parameters
    ----------
    users : iterable
        User ids to generate recommendations for.
    training_df : pandas.DataFrame
        Needs ``userID`` and ``itemID`` columns; items a user already has in
        this frame are never recommended to that user.
    k : int
        Number of items to draw per user. If fewer than ``k`` unseen items
        exist, all of them are returned instead of raising.
    candidate_items : iterable, optional
        Item ids to sample from. Defaults to the notebook-level ``all_items``.

    Returns
    -------
    pandas.DataFrame
        Columns ``userID``, ``itemID``, ``prediction`` (always 1.0), users in
        the order given. Empty (same columns) when ``users`` is empty.
    """
    columns = ['userID','itemID','prediction']
    if candidate_items is None:
        candidate_items = all_items  # notebook-level catalogue of item ids
    catalogue = set(candidate_items)
    user_list = list(users)

    # Collect each requested user's training items in one pass over the frame.
    seen_rows = training_df[training_df['userID'].isin(user_list)]
    seen_by_user = {
        user_id: set(item_ids)
        for user_id, item_ids in seen_rows.groupby('userID')['itemID']
    }

    per_user_frames = []
    for user_id in user_list:
        items_to_sample = list(catalogue.difference(seen_by_user.get(user_id, ())))
        # random.sample raises ValueError when asked for more than is available.
        items = random.sample(items_to_sample, min(k, len(items_to_sample)))
        if not items:
            continue
        per_user_frames.append(
            pd.DataFrame({'userID': user_id, 'itemID': items, 'prediction': 1.0}, columns=columns)
        )

    if not per_user_frames:
        return pd.DataFrame(columns=columns)
    # Single concat at the end: growing a frame inside the loop is quadratic.
    return pd.concat(per_user_frames)
        

In [None]:
def compute_loss_function(model, dataset=None):
    """
    Average the model's loss over one pass of the training loader.

    Only ``model.loss`` is fetched from the session; no optimisation op is
    requested, so this call is an evaluation, not a training step.

    Parameters
    ----------
    model : object
        Must expose ``batch_size``, ``user2id``, ``item2id``, ``user_input``,
        ``item_input``, ``labels``, ``loss`` and ``sess`` (an NCF instance in
        this notebook).
    dataset : object, optional
        Must expose ``train_loader(batch_size)`` yielding
        ``(user_input, item_input, labels)`` batches. Defaults to the
        notebook-level ``data``.

    Returns
    -------
    float
        Mean of the per-batch losses, as a built-in ``float`` so it can be
        serialised with ``json.dump``.

    Raises
    ------
    ValueError
        If the loader yields no batches.
    """
    if dataset is None:
        dataset = data  # notebook-level dataset the model was fitted on

    batch_losses = []
    for user_input, item_input, labels in dataset.train_loader(model.batch_size):

        # map raw ids to the model's internal indices
        user_input = np.array([model.user2id[x] for x in user_input])
        item_input = np.array([model.item2id[x] for x in item_input])
        labels = np.array(labels)

        # placeholders are fed column vectors, hence the added trailing axis
        feed_dict = {
            model.user_input: user_input[..., None],
            model.item_input: item_input[..., None],
            model.labels: labels[..., None],
        }

        # evaluate the loss for this batch; cast so the running sum is a
        # Python float regardless of NumPy's scalar-promotion rules
        batch_losses.append(float(model.sess.run(model.loss, feed_dict)))

    if not batch_losses:
        raise ValueError("train_loader yielded no batches; cannot compute a mean loss")

    return sum(batch_losses) / len(batch_losses)

### MODEL PARAMETERS

In [None]:
# top k items to recommend
TOP_K = 10

# Model parameters
EPOCHS = 50
BATCH_SIZE = 256

# Seed shared by the dataset loader and the model
SEED = 42

# random_recommend() draws with the stdlib `random` module; seed it as well so
# the random recommendations are reproducible after Restart & Run All.
random.seed(SEED)

### TRAIN ORIGINAL DATASETS

In [None]:
# Load the train/test interaction splits of the original dataset.
train = pd.read_csv('data/original_dataset/train_df.csv')
test = pd.read_csv('data/original_dataset/test_df.csv')

# Every distinct item id present in the training split (read by random_recommend).
all_items = list(train['item_id'].unique())

In [None]:
# Give every interaction a constant rating of 1 in both splits.
for interactions in (train, test):
    interactions['rating'] = 1

In [None]:
# Switch both splits to the userID/itemID column names used in the rest of the notebook.
train, test = (
    split.rename(columns={'user_id':'userID','item_id':'itemID'})
    for split in (train, test)
)

In [None]:
# Order both splits by user id.
train, test = (split.sort_values(by='userID') for split in (train, test))

In [None]:
# CSV copies of the splits; NCFDataset below reads the training data from `train_file`.
train_file = "MS_data/train.csv"
test_file = "MS_data/test.csv"

# Write the files when they are missing so the notebook also runs from a clean
# checkout; files left by an earlier run are reused untouched.
os.makedirs(os.path.dirname(train_file), exist_ok=True)
for csv_path, split_df in ((train_file, train), (test_file, test)):
    if not os.path.exists(csv_path):
        split_df.to_csv(csv_path, index=False)

In [None]:
# Train NCF on the original dataset, recommend TOP_K unseen items per training
# user, then save the model, the recommendations and the evaluation metrics.
privacy_type = 'original_dataset'
output_directory = f'output/{privacy_type}'
rec_filename = f'NCF-{privacy_type}'

# Create the output folders up front so a missing directory cannot fail the
# run after training has already finished.
recommendations_dir = os.path.join(output_directory, 'recommendations')
metrics_dir = os.path.join(output_directory, 'evaluation-metrics')
for folder in (recommendations_dir, metrics_dir):
    os.makedirs(folder, exist_ok=True)

data = NCFDataset(train_file=train_file, seed=SEED)

model = NCF(
    n_users=data.n_users,
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[16,8,4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED
)

with Timer() as train_time:
    model.fit(data)

print("Took {} seconds for training.".format(train_time))

with Timer() as test_time:
    # Score every (training user, training item) pair.
    users, items, preds = [], [], []
    item = list(train.itemID.unique())
    for user_id in train.userID.unique():
        user = [user_id] * len(item)
        users.extend(user)
        items.extend(item)
        preds.extend(list(model.predict(user, item, is_list=True)))

    all_predictions = pd.DataFrame(data={"userID": users, "itemID":items, "prediction":preds})

    # Keep only pairs absent from the training data: after the outer merge
    # those are the rows whose `rating` is null. Only the key and rating
    # columns of `train` take part, so any other column `train` may carry does
    # not end up in the saved recommendations.
    merged = pd.merge(train[['userID', 'itemID', 'rating']], all_predictions,
                      on=["userID", "itemID"], how="outer")
    all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)

print("Took {} seconds for prediction.".format(test_time))

reduced_predictions = get_top_k(all_predictions,TOP_K,'prediction')

# save model
# NOTE(review): this saves relative to the working directory, while the
# privacy-enhanced runs save under <output_directory>/models — confirm which
# location is intended.
output_filename = f'model-{rec_filename}'
model.save(dir_name=output_filename)

# save recommendations
output_filename = f'recommendations-{rec_filename}.parq'
reduced_predictions.to_parquet(os.path.join(recommendations_dir, output_filename),
                               engine='fastparquet')

# evaluation_metrics expects user_id/item_id column names
reduced_predictions = reduced_predictions.rename(columns={'userID':'user_id', 'itemID':'item_id'})
test_to_eval = test.rename(columns={'userID':'user_id', 'itemID':'item_id'})

# evaluation metrics
metrics = evaluation_metrics(model,reduced_predictions, test_to_eval)
output_filename = f'evaluation-metrics-{privacy_type}.json'
with open(os.path.join(metrics_dir, output_filename), "w") as fp:
    json.dump(metrics, fp)

# drop references to the model and the recommendations before the next section
del model
del reduced_predictions

### TRAIN PRIVACY ENHANCED DATASETS

In [None]:
# data privacy
# For every selected privacy-enhanced dataset: rebuild the training set, train
# NCF, recommend TOP_K items per user, and save model, recommendations and
# evaluation metrics.

# (group, theta) settings to train; files for other settings are skipped
groups_to_train = ['1','2','3','5','7','10','11','13','17','19']
thetas_to_train = ['0.2','0.4','0.6','0.8']

for response in ['fixed']:
    privacy_type = f'with_mask/{response}-response'
    print(privacy_type)
    directory = f'data/{privacy_type}'
    output_directory = f'output/privacy_data/{privacy_type}'
    rec_filename = f'NCF-{privacy_type}'

    # Create every folder written to below before the first (slow) training run.
    models_dir = os.path.join(output_directory, 'models')
    recommendations_dir = os.path.join(output_directory, 'recommendations')
    metrics_dir = os.path.join(output_directory, 'evaluation-metrics')
    for folder in ('MS_data', models_dir, recommendations_dir, metrics_dir):
        os.makedirs(folder, exist_ok=True)

    # sorted(): os.listdir order is arbitrary; a fixed order keeps runs comparable
    for filename in sorted(os.listdir(directory)):
        file_name = os.path.join(directory, filename)
        # get the type of data from filename
        data_type = filename.split('.parq')[0]
        name_parts = data_type.split('groups')
        # skip anything that does not carry the group/theta markers instead of
        # failing with an IndexError
        if len(name_parts) < 2 or 'theta' not in name_parts[1]:
            continue
        group = name_parts[1].split('_')[0]
        theta = name_parts[1].split('theta')[1].split('_')[0]
        if group not in groups_to_train or theta not in thetas_to_train:
            continue
        print(group, theta)

        # read parquet, convert to pandas and filter positive interactions
        parq_df = ParquetFile(file_name)
        pd_df = parq_df.to_pandas()
        train_df = pd_df[pd_df['interaction_r']==1][['user_id','item_id','interaction_r']].rename(
            columns={'user_id':'userID','item_id':'itemID','interaction_r':'rating'})
        train_df = train_df.dropna()

        # make sure to not include test interactions in the training set:
        # `tmp` holds the rows shared with test, and the indicator merge then
        # drops exactly those rows from train_df
        tmp = train_df.merge(test[['userID','itemID','rating']], on=['userID','itemID','rating'])
        train_df = (
            pd.merge(train_df, tmp, on=['userID','itemID','rating'], how='outer', indicator=True)
            .query("_merge != 'both'")
            .drop('_merge', axis=1)
            .reset_index(drop=True)
        )
        train_df = train_df.sort_values('userID')
        train_df['userID'] = train_df.userID.astype(int)

        # create train/test files
        train_file = f"MS_data/train-{data_type}.csv"
        test_file = f"MS_data/test-{data_type}.csv"
        train_df.to_csv(train_file, index=False)
        test.to_csv(test_file, index=False)

        data = NCFDataset(train_file=train_file, seed=SEED)
        model = NCF(
            n_users=data.n_users,
            n_items=data.n_items,
            model_type="NeuMF",
            n_factors=4,
            layer_sizes=[16,8,4],
            n_epochs=EPOCHS,
            batch_size=BATCH_SIZE,
            learning_rate=1e-3,
            verbose=10,
            seed=SEED
        )

        # train
        with Timer() as train_time:
            model.fit(data)
        print("Took {} seconds for training.".format(train_time.interval))

        # predict: score every (training user, training item) pair
        with Timer() as test_time:
            users, items, preds = [], [], []
            item = list(train_df.itemID.unique())
            for user_id in train_df.userID.unique():
                user = [user_id] * len(item)
                users.extend(user)
                items.extend(item)
                preds.extend(list(model.predict(user, item, is_list=True)))

            all_predictions = pd.DataFrame(data={"userID": users, "itemID":items, "prediction":preds})

            # keep only pairs absent from the training data (null rating after the outer merge)
            merged = pd.merge(train_df, all_predictions, on=["userID", "itemID"], how="outer")
            all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)

        print("Took {} seconds for prediction.".format(test_time))

        reduced_predictions = get_top_k(all_predictions,TOP_K,'prediction')
        # for other users not present in train_df, give random recommendations
        users_random = list(set(test.userID.unique()).difference(train_df.userID.unique()))
        topk_scores_random = random_recommend(users_random, train_df,TOP_K)
        reduced_predictions = pd.concat([reduced_predictions, topk_scores_random], ignore_index=True, sort=False)
        reduced_predictions = reduced_predictions.sort_values('userID')

        # save model
        output_filename = f'model-NCF-{data_type}'
        model.save(dir_name=os.path.join(models_dir, output_filename))

        # save recommendations
        output_filename = f'recommendations-NCF-{data_type}.parq'
        reduced_predictions.to_parquet(os.path.join(recommendations_dir, output_filename),
                                       engine='fastparquet')

        # evaluation_metrics expects user_id/item_id column names
        reduced_predictions = reduced_predictions.rename(columns={'userID':'user_id', 'itemID':'item_id'})
        test_to_eval = test.rename(columns={'userID':'user_id', 'itemID':'item_id'})

        # evaluation metrics
        metrics = evaluation_metrics(model,reduced_predictions, test_to_eval)
        output_filename = f'evaluation-metrics-NCF-{data_type}.json'
        with open(os.path.join(metrics_dir, output_filename), "w") as fp:
            json.dump(metrics, fp)

        # drop references to the recommendations and the model before the next dataset
        del reduced_predictions
        del model




