In [1]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
from src.array_util import data_to_sparse
from src.data_io import get_dataset
from src.io import load_pickle
from src.models import save_model_multinomials

In [2]:
# Root folder holding one sub-folder per dataset, and root folder holding the
# per-model result files that the cells below read.
# NOTE(review): both are absolute, machine-specific paths — adjust before running elsewhere.
DATA_DIR = '/data/yueliu/Recommendation/data/last_basket_data' 
MODEL_DIR = '/data/yueliu/Recommendation/model'
# Train the models and save them. The function comes from src.models and is called
# without arguments; which models it trains and where it writes them is not visible
# here — presumably under MODEL_DIR, TODO confirm. Runs on every execution of this cell.
save_model_multinomials()

In [3]:
# Dataset to analyse (the folder name really does contain a space), plus the
# lower-case aliases that the helper functions below read as module-level names.
dataset_name = 'MFP47K_last basket'
data_dir, results_dir = DATA_DIR, MODEL_DIR

# Both reference files are read without a header row; column 0 becomes the
# dictionary key and column 1 the value.
file_name = os.path.join(DATA_DIR, dataset_name, "item_ref.csv")
item_index_ref = (
    pd.read_csv(file_name, header=None)
    .set_index(0)
    .to_dict()[1]
)

file_name = os.path.join(DATA_DIR, dataset_name, "user_ref.csv")
user_index_ref = (
    pd.read_csv(file_name, header=None)
    .set_index(0)
    .to_dict()[1]
)

mapping = dict(user=user_index_ref, item=item_index_ref)

In [4]:
def collect_user_results(dataset_name, cols, k=10):
    """Collect the top-k predictions of every model in `cols` for every test row.

    Side effects / hidden inputs:
      * reads the module-level `data_dir`;
      * writes each row's held-out item indices into the module-level
        `user_test[dataset_name]`, which `top_predictions` reads back.
        `user_test` must therefore exist before this function is called.

    Args:
        dataset_name: dataset folder name passed to `get_dataset` and the loaders.
        cols: iterable of model names understood by `load_multinomials`.
        k: cut-off forwarded to `top_predictions`.

    Returns:
        defaultdict mapping row index -> {model name -> output of `top_predictions`}.
    """
    # Only the third element (the test split) of the returned triple is used.
    _, _, test = get_dataset(dataset_name, data_dir)
    dense_test = data_to_sparse(test).todense()

    # Load every model's matrix once, up front, instead of once per user.
    all_multinomials = {}
    for model_type in cols:
        all_multinomials[model_type] = load_multinomials(model_type, dataset_name)

    user_results = defaultdict(dict)
    for user in range(dense_test.shape[0]):
        # Must be stored before the model loop: top_predictions looks it up.
        user_test[dataset_name][user] = get_test(user, dense_test)
        for model_type in cols:
            user_results[user][model_type] = top_predictions(
                model_type, user, all_multinomials, dataset_name, k)
    return user_results

def load_multinomials(model_type, dataset_name):
    """Return the per-user multinomials of `model_type` for `dataset_name`.

    For every model except 'random_model', the object pickled at
    <results_dir>/<model_type>/<dataset_name>/user_multinomials.pkl is returned
    unchanged.

    'random_model' has no file of its own. It borrows the shape of the
    'global_model' file and fills it with uniform random numbers, normalised so
    that every row (one row per user, as indexed by the callers) sums to 1.

    Args:
        model_type: model folder name, or the special value 'random_model'.
        dataset_name: dataset folder name.

    Returns:
        The unpickled object, or a float ndarray for 'random_model'.
    """
    if model_type != 'random_model':
        file_name = os.path.join(results_dir, model_type, dataset_name, 'user_multinomials.pkl')
        return load_pickle(file_name)

    file_name = os.path.join(results_dir, 'global_model', dataset_name, 'user_multinomials.pkl')
    shape = load_pickle(file_name).shape
    # A private generator yields the same draws as np.random.seed(0) followed by
    # np.random.rand, but leaves the global NumPy random state untouched.
    rng = np.random.RandomState(0)
    multi = rng.rand(*shape)
    # Normalise along the last axis so each user's row adds up to 1
    # (summing over axis 0 would normalise the item columns instead).
    return multi / multi.sum(axis=-1, keepdims=True)

def get_test(user, dense_test):
    """Return the column indices where row `user` of `dense_test` is >= 1.

    Args:
        user: row index into `dense_test`.
        dense_test: 2-D dense matrix; both `np.matrix` (as returned by a sparse
            matrix's `.todense()`) and plain `np.ndarray` are accepted.

    Returns:
        1-D integer ndarray of column indices, in ascending order.
    """
    # np.asarray(...).ravel() flattens a (1, n) np.matrix row and is a no-op for
    # an ndarray row, so the function no longer depends on the matrix-only `.A1`.
    row = np.asarray(dense_test[user]).ravel()
    return np.where(row >= 1)[0]

def top_predictions(model_type, user, all_multinomials, dataset_name, k):
    """Return the best-scored items of one model for one user.

    Reads the module-level `user_test[dataset_name][user]` to mark which of the
    returned items are held-out test items.

    Args:
        model_type: key into `all_multinomials`.
        user: row index of the user.
        all_multinomials: dict of model name -> per-user score rows.
        dataset_name: key into the module-level `user_test`.
        k: the k-th largest score is used as the admission threshold.

    Returns:
        List of (item index, is-test-item flag, rank) tuples sorted by rank.
        Rank 1 is the highest score and equal scores share a rank. Every item
        tied with the k-th score is included, so the list can exceed k entries.
    """
    scores = all_multinomials[model_type][user]

    # Everything scoring at least as high as the k-th largest value is kept.
    threshold = np.sort(scores)[-k]
    top_items = np.where(scores >= threshold)[0]

    # Position of each score among the distinct scores, largest first, 1-based.
    _, dense_rank = np.unique(-scores, return_inverse=True)
    dense_rank = dense_rank + 1

    held_out = user_test[dataset_name][user]
    hits = np.isin(top_items, held_out)

    pred = list(zip(top_items, hits, dense_rank[top_items]))
    pred.sort(key=lambda triple: triple[-1])
    return pred


def consolidate_results(dataset_name, user_results, cols):
    """Flatten the nested per-user predictions into one long DataFrame.

    Reads the module-level `data_dir` and loads
    <data_dir>/<dataset_name>/mapping.pkl, using its 'user' and 'item' entries
    (via `.get(key, -1)`) to translate row and item indices.

    Args:
        dataset_name: dataset folder name.
        user_results: dict of user row index -> {model name -> list of
            (item_idx, is_positive, pred_rank) tuples}.
        cols: list of model names to keep; also used as the melted value columns.

    Returns:
        DataFrame with one row per (username, model, predicted item) and the
        columns username, models, pred_rank, item_idx, item, is_positive,
        sorted by those columns in that order, with a fresh 0..n-1 index.
        Users missing from the user mapping are dropped; items missing from the
        item mapping are kept with item == -1.
    """
    mapping = load_pickle(os.path.join(data_dir, dataset_name, "mapping.pkl"))
    # NOTE(review): the earlier note here described the mapping as
    # "original username (str): current index in prediction (int, from 0)",
    # yet both lookups below are keyed by the integer index — confirm which
    # direction the pickled mapping actually has.
    inv_user_mapping = mapping['user']
    inv_item_mapping = mapping['item']

    # One row per user index, one column per model; each cell holds that model's
    # prediction list for the user.
    df = pd.DataFrame.from_dict(user_results, orient='index')[cols]
    df['user'] = df.index
    # -1 marks a user index without a mapping entry; those rows are removed.
    df['username'] = df['user'].apply(lambda s: inv_user_mapping.get(s, -1))
    df = df[df['username']!=-1]

    # Long format: one row per (username, model) with the whole list in 'value'.
    df = df.melt(id_vars=['username'], value_vars=cols)
    df = df.rename(columns={'variable':'models'})
    # Spread every list over one column per position, re-attach username/models
    # by index, then melt again so each prediction tuple gets its own row.
    # dropna() discards the empty slots left where a list was shorter than the
    # longest one.
    df = df['value'].apply(pd.Series) \
        .merge(df, right_index = True, left_index = True) \
        .drop(["value"], axis = 1) \
        .melt(id_vars = ['username', 'models'], value_name = "prediction") \
        .drop("variable", axis = 1) \
        .dropna()

    # Split the 3-tuples into separate columns (same order as the tuple fields).
    df['item_idx'], df['is_positive'], df['pred_rank'] = zip(*df['prediction'])
    # Unlike users, items without a mapping entry are kept, labelled -1.
    df['item'] = df['item_idx'].apply(lambda s: inv_item_mapping.get(s, -1))
    new_cols = ['username','models', 'pred_rank','item_idx','item', 'is_positive']
    df = df[new_cols]

    df = df.sort_values(by=new_cols).reset_index(drop=True)
    return df

In [6]:
# Module-level store shared by the helper functions:
# dataset name -> {user row index -> held-out item indices}.
# collect_user_results fills it and top_predictions reads from it, so it has to
# exist before the call at the end of this cell.
user_test = defaultdict(dict)
user_results = defaultdict(dict)

dataset_name = 'MFP47K_last basket'

# Models to compare; each name is also a folder under the results directory,
# except 'random_model', which load_multinomials generates on the fly.
cols = [
    'random_model',
    'global_model',
    'personal_model',
    'mixture_model',
    'mixture_decay_model',
    'nmf_model',
    'wrmf_model',
    'lda_model',
    'bpr_model',
    'fpmc_model',
    'adaloyal_model',
    'sasrec_model',
]

user_results = collect_user_results(dataset_name, cols, k=10)

In [7]:
# Flatten the collected predictions and export the columns needed for the user study.
df = consolidate_results(dataset_name, user_results, cols)
# Selecting several columns requires a list; a bare comma-separated key is a
# tuple, which pandas looks up as one (non-existent) column and raises KeyError.
df = df[['username', 'models', 'pred_rank', 'item_idx']]
# Assemble the path from its components instead of a backslash literal: inside a
# normal string "\t" is a TAB and "\U" starts a unicode escape, which is a
# SyntaxError on Python 3. os.sep keeps the path rooted on every platform.
path = os.path.join(os.sep, 'data', 'temp', 'UserStudy1', 'prediction for users.csv')
df.to_csv(path, index=False)