In [None]:
import os
import json
import pickle

import sys
import scrapbook as sb
import pandas as pd
import numpy as np
import random
from fastparquet import ParquetFile
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.models.deeprec.deeprec_utils import prepare_hparams

# Report the interpreter and key library versions so runs can be reproduced.
print(
    "System version: {}".format(sys.version),
    "Pandas version: {}".format(pd.__version__),
    "Tensorflow version: {}".format(tf.__version__),
    sep="\n",
)

In [None]:
def evaluation_metrics(model,predicted_ranking, targets):
    """Compute mean recall, mean NDCG and the training loss for a set of recommendations.

    Recall and NDCG are computed per user (for every user that appears in
    ``targets``) and then averaged with equal weight per user.

    Args:
        model: Trained model; it is only forwarded to ``compute_loss_function``.
        predicted_ranking (pd.DataFrame): Recommendations with ``user_id`` and
            ``item_id`` columns. The rows of a user are read in frame order and
            that order is taken as the ranking.
        targets (pd.DataFrame): Held-out interactions with ``user_id`` and
            ``item_id`` columns.

    Returns:
        dict: ``{'recall': float, 'ndcg': float, 'loss': float}`` holding plain
        Python floats so the result can be passed directly to ``json.dump``.
    """
    # Index the recommendations by user once instead of re-filtering the whole
    # frame with a boolean mask for every single user.
    ranking_by_user = {
        user_id: user_items.values
        for user_id, user_items in predicted_ranking.groupby('user_id', sort=False)['item_id']
    }
    no_recommendations = np.array([])

    recall = list()
    ndcg = list()
    # sort=False keeps the users in order of first appearance in `targets`.
    for user_id, user_items in targets.groupby('user_id', sort=False)['item_id']:
        user_targets = user_items.values
        user_predicted_ranking = ranking_by_user.get(user_id, no_recommendations)
        target_set = set(user_targets)

        # recall: share of this user's target rows that were recommended
        num_hit = len(set(user_predicted_ranking).intersection(target_set))
        recall.append(float(num_hit) / len(user_targets))

        # binary relevance of every recommended position (1 = hit, 0 = miss)
        recom_relevance = np.array(
            [1. if item_id in target_set else 0. for item_id in user_predicted_ranking]
        )

        # a user without any hit (or without recommendations) contributes 0
        if np.sum(recom_relevance) == 0.0:
            ndcg.append(0.0)
            continue

        # NOTE(review): the ideal ranking is the same hit list sorted descending,
        # so the normaliser depends on the number of hits, not on the number of
        # targets. Kept as-is to leave reported numbers unchanged.
        ideal_relevance = -np.sort(-recom_relevance)
        discounts = np.log2(1 + np.arange(1, len(recom_relevance) + 1))
        recom_dcg = np.sum(recom_relevance / discounts)
        ideal_dcg = np.sum(ideal_relevance / discounts)
        ndcg.append(recom_dcg / ideal_dcg)

    recall = np.array(recall).squeeze()
    ndcg = np.array(ndcg).squeeze()

    # training loss of the model (no parameter update is requested here)
    loss = compute_loss_function(model)

    # float(...) strips NumPy scalar types, which json cannot always serialise.
    return {'recall': float(np.mean(recall)), 'ndcg': float(np.mean(ndcg)), 'loss': float(loss)}

In [None]:
def compute_loss_function(model):
    """Estimate the model's average training loss without updating its parameters.

    Draws ``n_batch`` batches from ``model.data.train_loader`` and evaluates
    only the ``model.loss`` tensor on each of them; no optimiser op is fetched,
    so the model weights are left untouched.

    Args:
        model: Object exposing ``data.train`` (with ``shape``),
            ``data.train_loader(batch_size)``, ``batch_size``, ``sess``,
            ``loss`` and the placeholders ``users``, ``pos_items`` and
            ``neg_items``.

    Returns:
        float: Mean of the per-batch losses, as a plain Python float so it can
        be written with ``json.dump``.
    """
    # One more batch than the integer division yields, so a trailing partial
    # batch worth of rows is still accounted for.
    n_batch = model.data.train.shape[0] // model.batch_size + 1

    loss = 0.0
    for _ in range(n_batch):
        users, pos_items, neg_items = model.data.train_loader(model.batch_size)

        # Forward pass only: fetch the loss tensor, nothing else.
        batch_loss = model.sess.run(model.loss, feed_dict={
            model.users: users,
            model.pos_items: pos_items,
            model.neg_items: neg_items
        })

        # float(...) keeps the accumulator a Python float even when the session
        # returns a NumPy scalar.
        loss += float(batch_loss) / n_batch

    return loss

In [None]:
def get_top_k(pred_df, k, column):
    """
    Get the top-k items for each user according to the prediction score

    Args:
        pred_df (pd.DataFrame): Predictions containing a ``userID`` column and
            the score column named by ``column``.
        k (int): Number of rows to keep per user.
        column (str): Name of the column to rank by (largest first).

    Returns:
        pd.DataFrame: At most ``k`` rows per user, users in sorted order, rows
        of a user ordered by descending ``column``, with a fresh 0..n-1 index
        and all original columns (including ``userID``).
    """
    # Iterate over the groups instead of using groupby.apply: every group frame
    # obtained this way carries the `userID` column, whereas apply's handling of
    # grouping columns is deprecated and would otherwise drop `userID` once the
    # index is reset.
    per_user_top = [user_rows.nlargest(k, column) for _, user_rows in pred_df.groupby('userID')]
    if not per_user_top:
        # nothing to rank: return an empty frame with the same columns
        return pred_df.head(0).reset_index(drop=True)
    return pd.concat(per_user_top).reset_index(drop=True)

In [None]:
def random_recommend(users,training_df,k, items=None, rng=None):
    """Recommend up to ``k`` randomly chosen, not-yet-seen items to each user.

    Args:
        users (iterable): User ids to create recommendations for.
        training_df (pd.DataFrame): Interactions with ``userID`` and ``itemID``
            columns; items a user has a row for are never recommended to them.
        k (int): Number of items to draw per user. When fewer than ``k``
            candidates remain for a user, all remaining candidates are returned
            instead of raising.
        items (iterable, optional): Item catalogue to sample from. Defaults to
            the notebook-level ``all_items`` list.
        rng (random.Random, optional): Random source exposing ``sample``.
            Defaults to the module-level ``random`` generator.

    Returns:
        pd.DataFrame: Columns ``userID``, ``itemID`` and ``prediction`` (always
        1.0), with the per-user frames stacked without resetting the index.
    """
    columns = ['userID','itemID','prediction']
    if items is None:
        items = all_items  # catalogue defined at notebook level
    sampler = random if rng is None else rng

    users = list(users)
    # De-duplicate the catalogue once, keeping its order so the candidate list
    # (and therefore a seeded draw) does not depend on set iteration order.
    item_pool = list(dict.fromkeys(items))

    # Collect the seen items of the requested users in a single pass.
    relevant_rows = training_df[training_df['userID'].isin(users)]
    seen_by_user = {
        user_id: set(user_items)
        for user_id, user_items in relevant_rows.groupby('userID', sort=False)['itemID']
    }

    per_user_frames = []
    for user_id in users:
        seen = seen_by_user.get(user_id, ())
        candidates = [item for item in item_pool if item not in seen]
        sampled = sampler.sample(candidates, min(k, len(candidates)))
        per_user_frames.append(
            pd.DataFrame({'userID': user_id, 'itemID': sampled, 'prediction': 1.0}, columns=columns)
        )

    if not per_user_frames:
        return pd.DataFrame(columns=columns)
    # Concatenate once at the end rather than growing the frame inside the loop.
    return pd.concat(per_user_frames)
        

### MODEL PARAMETERS

In [None]:
# top k items to recommend
# (used as `top_k` for prepare_hparams and model.recommend_k_items, and as `k`
# for random_recommend in the cells below)
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
# NOTE(review): not referenced by any cell visible in this part of the notebook,
# which reads its data from CSV/parquet files instead — confirm before relying on it.
MOVIELENS_DATA_SIZE = '1m'

# Model parameters
# (forwarded to prepare_hparams as `epochs` and `batch_size`)
EPOCHS = 50
BATCH_SIZE = 1024

# Seed handed to ImplicitCF and LightGCN in the training cells.
SEED = DEFAULT_SEED  # Set None for non-deterministic results

# Hyper-parameter file passed as first argument to prepare_hparams.
yaml_file = "lightgcn.yaml"

### TRAIN ORIGINAL DATASET

In [None]:
# Read the pre-split interaction tables of the original (non-private) dataset.
train = pd.read_csv('data/original_dataset/train_df.csv')
test = pd.read_csv('data/original_dataset/test_df.csv')

# Item catalogue: every distinct item that occurs in the training split
# (read as a global by random_recommend).
all_items = list(train['item_id'].unique())

In [None]:
# Give every interaction in both splits a constant rating of 1.
for _interactions in (train, test):
    _interactions['rating'] = 1

In [None]:
# Number of distinct users in the training split.
train['user_id'].nunique()

In [None]:
# Number of distinct items in the training split.
train['item_id'].nunique()

In [None]:
# Switch both splits to the userID / itemID column names used by the
# training and recommendation cells below.
train, test = (
    split.rename(columns={'user_id':'userID','item_id':'itemID'})
    for split in (train, test)
)

In [None]:
privacy_type = 'original_dataset'
output_directory = f'output/{privacy_type}'
rec_filename = f'LightGCN-{privacy_type}'

# Make sure the output folders exist so the writes below do not fail on a
# fresh checkout.
recommendations_dir = os.path.join(output_directory, 'recommendations')
metrics_dir = os.path.join(output_directory, 'evaluation-metrics')
os.makedirs(recommendations_dir, exist_ok=True)
os.makedirs(metrics_dir, exist_ok=True)

data = ImplicitCF(train=train, test=test, seed=SEED)

hparams = prepare_hparams(yaml_file,
                          n_layers=3,
                          batch_size=BATCH_SIZE,
                          epochs=EPOCHS,
                          learning_rate=0.005,
                          eval_epoch=5,
                          top_k=TOP_K,
                         )

model = LightGCN(hparams, data, seed=SEED)

with Timer() as train_time:
    model.fit()

print("Took {} seconds for training.".format(train_time.interval))

# predict
topk_scores = model.recommend_k_items(test, top_k=TOP_K, remove_seen=True)

# Optional: persist the trained model.
#output_filename = f'model-{rec_filename}'
#model.save(dir_name=os.path.join(output_directory, 'models', output_filename))

# save recommendations
output_filename = f'recommendations-{rec_filename}.parq'
topk_scores.to_parquet(os.path.join(recommendations_dir, output_filename),
                       engine='fastparquet')

# evaluation_metrics expects user_id / item_id column names
topk_scores = topk_scores.rename(columns={'userID':'user_id', 'itemID':'item_id'})
test_to_eval = test.rename(columns={'userID':'user_id', 'itemID':'item_id'})

# evaluation metrics
# The model is a required first argument (it is used to compute the loss), so
# it must still exist here and is only deleted afterwards.
metrics = evaluation_metrics(model, topk_scores, test_to_eval)
output_filename = f'evaluation-metrics-{rec_filename}.json'
with open(os.path.join(metrics_dir, output_filename), "w") as fp:
    # plain floats: NumPy scalar types are not always JSON serialisable
    json.dump({name: float(value) for name, value in metrics.items()}, fp)

del model
del topk_scores

### TRAIN PRIVACY ENHANCED DATASETS

In [None]:
# data privacy
privacy_type = 'with_mask/fixed-response'
directory = f'data/{privacy_type}'
output_directory = f'output/privacy_data/{privacy_type}'
rec_filename = f'LightGCN-{privacy_type}'

# Only datasets whose file name encodes one of these group / theta values are trained.
groups_to_train = ['1','2','3','5','7','10','11','13','17','19']
thetas_to_train = ['0.2','0.4','0.6','0.8']

# Make sure the output folders exist so the writes below do not fail on a
# fresh checkout.
recommendations_dir = os.path.join(output_directory, 'recommendations')
metrics_dir = os.path.join(output_directory, 'evaluation-metrics')
os.makedirs(recommendations_dir, exist_ok=True)
os.makedirs(metrics_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort for a stable run order.
for filename in sorted(os.listdir(directory)):
    file_name = os.path.join(directory, filename)
    # get the type of data from filename
    data_type = filename.split('.parq')[0]
    try:
        settings = data_type.split('groups')[1]
        group = settings.split('_')[0]
        theta = settings.split('theta')[1].split('_')[0]
    except IndexError:
        # entries without a 'groups...theta...' pattern (e.g. hidden files)
        print(f'Skipping {filename}: no group/theta found in file name')
        continue
    print(group, theta)
    if group not in groups_to_train or theta not in thetas_to_train:
        continue

    # load test every time
    test = pd.read_csv('data/original_dataset/test_df.csv')
    test = test.rename(columns={'user_id':'userID','item_id':'itemID'})
    test['rating'] = 1

    # read parquet, convert to pandas and filter positive interactions
    parq_df = ParquetFile(file_name)
    pd_df = parq_df.to_pandas()
    train_df = pd_df[pd_df['interaction_r']==1][['user_id','item_id','interaction_r']].rename(
        columns={'user_id':'userID','item_id':'itemID','interaction_r':'rating'})
    if train_df.isna().sum().sum()!=0:
        train_df.dropna(inplace=True)
    train_df = train_df.reset_index(drop=True)

    # we need to make sure that we have not added interactions that are part of the test set:
    # `tmp` holds the training rows that also occur in test; the outer merge with
    # indicator then marks exactly those rows as 'both' so they can be dropped.
    tmp = train_df.merge(test[['userID','itemID','rating']], on=['userID','itemID','rating'])
    train_df = pd.merge(train_df, tmp, on=['userID','itemID','rating'], how='outer', indicator=True).query("_merge != 'both'").drop('_merge', axis=1).reset_index(drop=True)

    data = ImplicitCF(train=train_df, seed=SEED)
    hparams = prepare_hparams(yaml_file,
                              n_layers=3,
                              batch_size=BATCH_SIZE,
                              epochs=EPOCHS,
                              learning_rate=0.005,
                              eval_epoch=-1,
                              top_k=TOP_K,
                             )
    # create model
    model = LightGCN(hparams, data, seed=SEED)

    # train
    with Timer() as train_time:
        model.fit()
    print("Took {} seconds for training.".format(train_time.interval))

    # predict (only for test users that are present in this training set)
    test_specific_users = test[test.userID.isin(train_df.userID.unique())]
    topk_scores = model.recommend_k_items(test_specific_users, top_k=TOP_K, remove_seen=True)
    # for other users not present in train_df, give random recommendations
    users_random = list(set(test.userID.unique()).difference(train_df.userID.unique()))
    topk_scores_random = random_recommend(users_random, train_df,TOP_K)
    topk_scores = pd.concat([topk_scores, topk_scores_random], ignore_index=True, sort=False)

    # save recommendations
    output_filename = f'recommendations-LightGCN-{data_type}.parq'
    topk_scores.to_parquet(os.path.join(recommendations_dir, output_filename),
                           engine='fastparquet')

    # evaluation_metrics expects user_id / item_id column names
    topk_scores = topk_scores.rename(columns={'userID':'user_id', 'itemID':'item_id'})
    test_to_eval = test.rename(columns={'userID':'user_id', 'itemID':'item_id'})

    # evaluation metrics
    metrics = evaluation_metrics(model,topk_scores, test_to_eval)
    output_filename = f'evaluation-metrics-LightGCN-{data_type}.json'
    with open(os.path.join(metrics_dir, output_filename), "w") as fp:
        # plain floats: NumPy scalar types are not always JSON serialisable
        json.dump({name: float(value) for name, value in metrics.items()}, fp)

    # release the per-dataset objects before the next iteration
    del topk_scores
    del model


        

