# This notebook computes popularity-bias metrics (ARP@10, Agg-Div@10 and per-group UDP) from saved prediction results

In [1]:
import os
import shutil
import sys
import numpy as np
from scipy import sparse
import pandas as pd
import scipy.stats

from support_func import *
from evaluation_func import *

In [23]:
# Raw ratings table; the first CSV row supplies the column names.
raw_data = pd.read_csv('../raw_data/ml-1m/ratings.csv', header=0)

# Headerless CSV whose first column becomes the index.
# NOTE(review): presumably movie ids ordered by popularity -- confirm against
# the preprocessing step that writes this file.
sorted_movieId = pd.read_csv(
    '../processed_data/sorted_movieId.csv',
    header=None,
    index_col=0,
)

# Distinct movie ids, in order of first appearance in the ratings file.
unique_sid = raw_data['movieId'].unique()

In [24]:
# Positional index -> movie id, following the order of unique_sid.
# NOTE(review): assumes the prediction matrix columns use this same ordering -- confirm.
id2show = dict(enumerate(unique_sid))

In [25]:
def top_k_recList(predictions, id2show, k=10):
    """Return the k highest-scoring movie ids for every row of ``predictions``.

    Parameters
    ----------
    predictions : array-like, 2-D
        Score matrix; one row per user, one column per item index.
    id2show : mapping
        Maps an item column index (int) to the movie id to report.
    k : int, default 10
        Number of recommendations per user. Values larger than the number of
        item columns are clamped to that number; values <= 0 yield empty lists.

    Returns
    -------
    list of list
        One list per row, holding movie ids ordered from highest to lowest
        score (position 0 is the top recommendation).

    Raises
    ------
    KeyError
        If a selected column index is missing from ``id2show``.
    """
    scores = np.asarray(predictions)
    n_items = scores.shape[1]
    k = min(k, n_items)
    if k <= 0:
        return [[] for _ in range(scores.shape[0])]

    # Partitioning at k - 1 puts the k largest scores (smallest negated scores)
    # in the first k slots, in arbitrary order, and stays valid when k == n_items.
    candidates = np.argpartition(-scores, k - 1, axis=1)[:, :k]

    # Order only the k candidates by descending score so that list positions
    # are real ranks; this is cheaper than fully sorting every row.
    candidate_scores = np.take_along_axis(scores, candidates, axis=1)
    order = np.argsort(-candidate_scores, axis=1, kind='stable')
    ranked = np.take_along_axis(candidates, order, axis=1)

    return [[id2show[item] for item in row] for row in ranked.tolist()]

In [11]:
# Load the three user-group tables (first row = header, first column = index).
# NOTE(review): per the original note these hold each user's taste distribution;
# the index is later matched against user ids.
G1_user, G2_user, G3_user = (
    pd.read_csv(group_csv, header=0, index_col=0)
    for group_csv in (
        '../processed_data/group_data/G1_user.csv',
        '../processed_data/group_data/G2_user.csv',
        '../processed_data/group_data/G3_user.csv',
    )
)

Set `path2results` below to the directory containing the saved prediction results, then re-run the following cells for each set of results.

In [54]:
# Directory holding one run's saved outputs. Later cells build file paths by
# plain string concatenation, so the trailing '/' must be kept.
path2results = 'results/ml_1m_holdout_user_LT_GAN_100.0_0.0001/'

In [55]:
# Load the saved prediction scores from text. top_k_recList reads this as
# rows = users (shape[0]) and ranks items along axis 1.
pred_all_test = np.loadtxt(path2results +'pred_all_test.txt')

In [56]:
# One row per prediction row, holding that row's 10 recommended movie ids
# (top_k_recList's default k=10); columns are labelled 1..10.
topk_all_test = pd.DataFrame(
    top_k_recList(pred_all_test, id2show),
    columns=range(1, 11),
)

In [57]:
# Overall metrics across every row of topk_all_test.
# ARP and Agg_Div are not defined in this notebook (presumably provided by the
# star imports -- confirm). Run this cell before the 'userId' column is added
# to topk_all_test, since the whole frame is passed here.
ARP_at_10, Agg_Div_at_10 = (
    metric(topk_all_test, sorted_movieId) for metric in (ARP, Agg_Div)
)
print('ARP@10: {0:.4f}, Agg-Div@10: {1:.4f}'.format(ARP_at_10, Agg_Div_at_10))

ARP@10: 0.2394, Agg-Div@10: 0.3160


In [58]:
# User ids saved alongside the predictions; np.loadtxt yields floats by default,
# so the ids are floating-point values from here on.
# NOTE(review): presumably entry i is the user for row i of pred_all_test -- confirm.
unique_uid = np.loadtxt(path2results +'user_li.txt')

In [59]:
# Positional index -> user id, following the order of unique_uid.
id2profile = dict(enumerate(unique_uid))

In [60]:
# Tag every recommendation row with its user id, looked up by the row's index label.
# This adds a column to topk_all_test in place.
topk_all_test['userId'] = [id2profile[row_label] for row_label in topk_all_test.index]

In [61]:
# Split the recommendation table by user group: keep the rows whose userId
# appears in the corresponding group table's index.
G1_topk, G2_topk, G3_topk = (
    topk_all_test[topk_all_test['userId'].isin(group_users.index)]
    for group_users in (G1_user, G2_user, G3_user)
)

In [62]:
# Per-group metrics, printed in the order G1, G2, G3.
# Only the recommendation columns 1..10 are passed on; 'userId' is excluded.
# Note: ARP_at_10 / Agg_Div_at_10 are overwritten on each pass.
rank_cols = range(1, 11)
for topk_rec in (G1_topk, G2_topk, G3_topk):
    ARP_at_10 = ARP(topk_rec[rank_cols], sorted_movieId)
    Agg_Div_at_10 = Agg_Div(topk_rec[rank_cols], sorted_movieId)
    print('ARP@10: {0:.4f}, Agg-Div@10: {1:.4f}'.format(ARP_at_10, Agg_Div_at_10))

ARP@10: 0.2358, Agg-Div@10: 0.2129
ARP@10: 0.2413, Agg-Div@10: 0.2858
ARP@10: 0.2380, Agg-Div@10: 0.2016


In [63]:
# Export each group's recommendation table (with header and index) into the
# results directory.
for group_topk, csv_name in (
    (G1_topk, 'G1_topk.csv'),
    (G2_topk, 'G2_topk.csv'),
    (G3_topk, 'G3_topk.csv'),
):
    group_topk.to_csv(path2results + csv_name, header=True, index=True)

In [64]:
# For each group, pull the group-table rows for the users present in that
# group's recommendation table (same order as the recommendations), as arrays.
G1_user_taste, G2_user_taste, G3_user_taste = (
    group_users.loc[group_topk['userId']].to_numpy()
    for group_users, group_topk in (
        (G1_user, G1_topk),
        (G2_user, G2_topk),
        (G3_user, G3_topk),
    )
)

The user taste of the different groups is computed manually using Excel. The idea of this computation is to find the probability of each category of movies present in the recommendation lists for users. The next cell expects the results as `G*_rec_taste.csv` files in the results directory.

In [65]:
# Load the per-group recommendation-taste tables (headerless CSVs) as arrays.
G1_rec_taste, G2_rec_taste, G3_rec_taste = (
    pd.read_csv(path2results + taste_csv, header=None).to_numpy()
    for taste_csv in (
        'G1_rec_taste.csv',
        'G2_rec_taste.csv',
        'G3_rec_taste.csv',
    )
)

In [66]:
# Report UDP for each group from its user-taste and recommendation-taste arrays.
# group_UDP is not defined in this notebook (presumably from the star imports -- confirm).
for report_fmt, user_taste, rec_taste in (
    ('G1 UDP:{0:.4f}', G1_user_taste, G1_rec_taste),
    ('G2 UDP:{0:.4f}', G2_user_taste, G2_rec_taste),
    ('G3 UDP:{0:.4f}', G3_user_taste, G3_rec_taste),
):
    print(report_fmt.format(group_UDP(user_taste, rec_taste)))

G1 UDP:0.1964
G2 UDP:0.1356
G3 UDP:0.1483
