In [16]:
!pip install --quiet cornac==1.14.1

**Load Dataset**

In [28]:
from cornac.datasets import movielens

ratings = movielens.load_feedback()
# trust = filmtrust.load_trust()

In [29]:
import cornac
from cornac.datasets import movielens
from cornac.eval_methods import RatioSplit

ratio_split = RatioSplit(data=movielens.load_feedback(variant='1M'),
                         test_size=0.2,
                         exclude_unknowns=False,
                         verbose=True)

global_avg = cornac.models.GlobalAvg()
mf = cornac.models.MF(k=10, max_iter=25, learning_rate=0.01, lambda_reg=0.02,
                      use_bias=True, early_stop=True, verbose=True)

mae = cornac.metrics.MAE()
rmse = cornac.metrics.RMSE()

exp = cornac.Experiment(eval_method=ratio_split,
                        models=[global_avg, mf],
                        metrics=[mae, rmse],
                        user_based=True)
exp.run()

rating_threshold = 1.0
exclude_unknowns = False
---
Training data:
Number of users = 6040
Number of items = 3671
Number of ratings = 800167
Max rating = 5.0
Min rating = 1.0
Global mean = 3.6
---
Test data:
Number of users = 6034
Number of items = 3470
Number of ratings = 200042
Number of unknown users = 0
Number of unknown items = 35
---
Total users = 6040
Total items = 3706

[GlobalAvg] Training started!

[GlobalAvg] Evaluation started!


Rating:   0%|          | 0/200042 [00:00<?, ?it/s]


[MF] Training started!


  0%|          | 0/25 [00:00<?, ?it/s]

Optimization finished!

[MF] Evaluation started!


Rating:   0%|          | 0/200042 [00:00<?, ?it/s]


TEST:
...
          |    MAE |   RMSE | Train (s) | Test (s)
--------- + ------ + ------ + --------- + --------
GlobalAvg | 0.9418 | 1.0843 |    0.0001 |   3.0686
MF        | 0.6911 | 0.8489 |    0.4797 |   4.0341



In [46]:
import pandas as pd
pd.options.display.max_rows = 10

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table('./datasets/MovieLens1M/users.dat', sep='::',header=None, names=unames, engine='python')

rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_table('./datasets/MovieLens1M/ratings.dat', sep='::',header=None, names=rnames, engine='python')

mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('./datasets/MovieLens1M/movies.dat', sep='::',header=None, names=mnames, engine='python', encoding='latin-1')

In [47]:
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [48]:
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [49]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


**Load cornac models**

In [None]:
import cornac
from cornac.eval_methods import RatioSplit
from cornac.models import MostPop, UserKNN, ItemKNN, MF, PMF, BPR, NeuMF, WMF, HPF, CVAE, VAECF, NMF
from cornac.metrics import MAE, RMSE, Precision, Recall, NDCG, AUC, MAP

ml_100k = cornac.datasets.movielens.load_feedback(variant="100K")
mf_model = MF(k=10, max_iter=25, learning_rate=0.01, lambda_reg=0.02, use_bias=True, seed=123)
vaecf_model = VAECF(k=10, autoencoder_structure=[20], act_fn="tanh", likelihood="mult",
                    n_epochs=100, batch_size=100, learning_rate=0.001,
                    beta=1.0, seed=123, use_gpu=True, verbose=True)
# convmf = cornac.models.ConvMF(n_epochs=5, verbose=True, seed=123)
hpf = cornac.models.HPF(k=5,seed=123)

Data from http://files.grouplens.org/datasets/movielens/ml-100k/u.data
will be cached into /root/.cornac/ml-100k/u.data


0.00B [00:00, ?B/s]

File cached!


**Fit models on movieLens dataset**

In [None]:
mf_model.fit(cornac.data.Dataset.from_uir(ml_100k))
vaecf_model.fit(cornac.data.Dataset.from_uir(ml_100k))
hpf.fit(cornac.data.Dataset.from_uir(ml_100k))

  0%|          | 0/100 [00:00<?, ?it/s]

Learning...
Learning completed!


<cornac.models.hpf.recom_hpf.HPF at 0x7f99589a92b0>

**Set Default Model to VAECF**

In [None]:
REC_MODEL = hpf

**Add 10 most rated items of each user to users dataframe**

In [None]:
def get_top_rank(model, user_idx):
  return sorted(list(enumerate(list(model.score(user_idx)))), key=lambda entry: entry[1])[-10:]

In [None]:
from collections import OrderedDict

top_ranks = OrderedDict()
for user_idx in range(len(users)):
  top_ranks[user_idx] = get_top_rank(model=REC_MODEL,user_idx=user_idx)

In [None]:
for i in range(10):
  top_rank_i = [top_ranks[i] for user_idx,top_ranks in top_ranks.items()]
  users[f'top_item_rec_{10-i}'] = top_rank_i

In [None]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,top_item_rec_10,top_item_rec_9,top_item_rec_8,top_item_rec_7,top_item_rec_6,top_item_rec_5,top_item_rec_4,top_item_rec_3,top_item_rec_2,top_item_rec_1
0,1,24,M,technician,85711,"(157, 0.7723271263304753)","(140, 0.8083460131843717)","(289, 0.8229038569562933)","(403, 0.8492584991192819)","(189, 0.8784724637141467)","(24, 0.9557791230818324)","(77, 0.9651263988003596)","(52, 0.9972043688387865)","(49, 1.087835370472427)","(357, 1.23732700543932)"
1,2,53,F,other,94043,"(24, 1.734229293430314)","(403, 1.8854881320126007)","(52, 1.9566243149946847)","(289, 2.0108796446376007)","(357, 2.059746434135098)","(60, 2.202347458984853)","(95, 2.304751459943746)","(719, 2.325810598980851)","(652, 2.4027525621185557)","(157, 2.567396864327809)"
2,3,23,M,writer,32067,"(201, 2.348181818848246)","(408, 2.3683929522663227)","(52, 2.4013931428571524)","(156, 2.451994615216075)","(102, 2.4581523459491534)","(357, 2.479380701772362)","(231, 2.566556285976035)","(347, 2.654817646168223)","(161, 2.7931368912319248)","(101, 2.8234194961046883)"
3,4,24,M,technician,43537,"(403, 4.385194023508565)","(24, 4.782620276071388)","(161, 4.850154219111494)","(31, 4.915006350252232)","(209, 4.938918582294495)","(49, 5.161245839690811)","(101, 5.204847192798675)","(216, 5.457915294228182)","(52, 5.724455259087394)","(357, 6.74437225820977)"
4,5,33,F,other,15213,"(280, 1.332012758696001)","(130, 1.3827808208424306)","(491, 1.3922983194069607)","(60, 1.4094121070659302)","(98, 1.4256553927622204)","(289, 1.5773244017850399)","(157, 1.702531196701317)","(1, 1.7050668359749828)","(652, 1.8711352302869544)","(719, 1.9188250646442773)"


**MAE**

In [None]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [None]:
rating_groups = ratings.groupby(by=['user_id'])

In [None]:
def get_rating_dict(user_id):
  user_ratings = rating_groups.get_group(user_id)
  return dict(list(zip(user_ratings['movie_id'] , user_ratings['rating'])))

In [None]:
def get_item_to_rec_score_mapping(rec_model , user_id):
  items, scores = rec_model.rank(user_id)
  return dict(list(zip(items, scores)))

In [None]:
def MAE(rec_model , user_id):
  item_to_rec_score_mapping = get_item_to_rec_score_mapping(rec_model , user_id)
  rating_dict = get_rating_dict(user_id)
  return sum([abs(item_to_rec_score_mapping[item] - rating) for item,rating in rating_dict.items() if item in item_to_rec_score_mapping]) / len(rating_dict)

In [None]:
mae_list = []
for user_id in users['user_id']:
  mae = None
  try:
    mae = MAE(REC_MODEL, user_id)
  except Exception:
    pass
  mae_list.append(mae)

In [None]:
users['MAE'] = mae_list

In [None]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,top_item_rec_10,top_item_rec_9,top_item_rec_8,top_item_rec_7,top_item_rec_6,top_item_rec_5,top_item_rec_4,top_item_rec_3,top_item_rec_2,top_item_rec_1,MAE
0,1,24,M,technician,85711,"(157, 0.7723271263304753)","(140, 0.8083460131843717)","(289, 0.8229038569562933)","(403, 0.8492584991192819)","(189, 0.8784724637141467)","(24, 0.9557791230818324)","(77, 0.9651263988003596)","(52, 0.9972043688387865)","(49, 1.087835370472427)","(357, 1.23732700543932)",3.266474
1,2,53,F,other,94043,"(24, 1.734229293430314)","(403, 1.8854881320126007)","(52, 1.9566243149946847)","(289, 2.0108796446376007)","(357, 2.059746434135098)","(60, 2.202347458984853)","(95, 2.304751459943746)","(719, 2.325810598980851)","(652, 2.4027525621185557)","(157, 2.567396864327809)",3.177943
2,3,23,M,writer,32067,"(201, 2.348181818848246)","(408, 2.3683929522663227)","(52, 2.4013931428571524)","(156, 2.451994615216075)","(102, 2.4581523459491534)","(357, 2.479380701772362)","(231, 2.566556285976035)","(347, 2.654817646168223)","(161, 2.7931368912319248)","(101, 2.8234194961046883)",2.085301
3,4,24,M,technician,43537,"(403, 4.385194023508565)","(24, 4.782620276071388)","(161, 4.850154219111494)","(31, 4.915006350252232)","(209, 4.938918582294495)","(49, 5.161245839690811)","(101, 5.204847192798675)","(216, 5.457915294228182)","(52, 5.724455259087394)","(357, 6.74437225820977)",4.221648
4,5,33,F,other,15213,"(280, 1.332012758696001)","(130, 1.3827808208424306)","(491, 1.3922983194069607)","(60, 1.4094121070659302)","(98, 1.4256553927622204)","(289, 1.5773244017850399)","(157, 1.702531196701317)","(1, 1.7050668359749828)","(652, 1.8711352302869544)","(719, 1.9188250646442773)",2.415287


**Protected Groups**

In [None]:
from enum import Enum

class AgeRange(Enum):
  kid = 'kid'
  young = 'young'
  adult = 'adult'
  old = 'old'

In [None]:
class Gender(Enum):
  male = 'male'
  female = 'female'

**ProtectedUtil**

In [None]:
class ProtectedUtil:
  def __init__(self, df):
    self.df = df

  def get_ids(self, id_key):
    return list(self.df[id_key])

  def filter_by_occupation(self , occupation):
    self.df = self.df[self.df['occupation'] == occupation]

  def filter_by_gender(self, gender):
    filtered_df = None
    if gender == Gender.male:
      filtered_df = self.df[self.df['sex'] == 'M'] 
    elif gender == Gender.female:
      filtered_df = self.df[self.df['sex'] == 'F']

    self.df = filtered_df

  def filter_by_age_range(self, age_range):
    filtered_df = None
    if age_range == AgeRange.kid:
      filtered_df = self.df[self.df['age'] < 18]
    elif age_range == AgeRange.young:
      filtered_df = self.df[(self.df['age'] >= 18) & (self.df['age'] < 25)]
    elif age_range == AgeRange.adult:
      filtered_df = self.df[(self.df['age'] >= 25) & (self.df['age'] < 35)]
    elif age_range == AgeRange.old:
      filtered_df = self.df[self.df['age'] >= 35]
    
    self.df = filtered_df

In [None]:
import itertools

age_ranges = [AgeRange.kid , AgeRange.young , AgeRange.adult , AgeRange.old]
genders = [Gender.male , Gender.female]
occupations = list(pd.unique(users['occupation']))


def gen_protected_groups():
  for age_range,gender,occupation in itertools.product(age_ranges , genders , occupations):
      p_u = ProtectedUtil(users)
      p_u.filter_by_age_range(age_range)
      p_u.filter_by_gender(gender)
      p_u.filter_by_occupation(occupation)
      
      yield {'age_range': age_range,
             'gender': gender,
             'occupation': occupation,
             'user_ids': p_u.get_ids('user_id')}

In [None]:
protected_group_gen = gen_protected_groups()

**Get Average Of MAE On set of users**

In [None]:
def MAE_batch(rec_model , user_ids):
  if len(user_ids) == 0:
    return None
  return sum([MAE(rec_model , user_id) for user_id in user_ids]) / len(user_ids)

**Compute MAE metric for each of protected groups**

In [None]:
protected_group_gen = gen_protected_groups()
mae_data = {}

while True:
  try:
    protected_group_data = next(protected_group_gen)
    
    mae_data[(protected_group_data['age_range'],
              protected_group_data['gender'],
              protected_group_data['occupation'])] = MAE_batch(rec_model=REC_MODEL,
                                                               user_ids = protected_group_data['user_ids'])
  except StopIteration:
    break

**MAX MAE GROUP**

In [None]:
max([entry for entry in mae_data.items() if entry[1] is not None], key=lambda entry: entry[1])

((<AgeRange.young: 'young'>, <Gender.female: 'female'>, 'writer'),
 4.0899388250724105)

**MIN MAE GROUP**

In [None]:
min([entry for entry in mae_data.items() if entry[1] is not None], key=lambda entry: entry[1])

((<AgeRange.young: 'young'>, <Gender.female: 'female'>, 'healthcare'),
 1.4860571413721544)

**RMSE**

In [None]:
import math

def RMSE(rec_model , user_id):
  item_to_rec_score_mapping = get_item_to_rec_score_mapping(rec_model , user_id)
  rating_dict = get_rating_dict(user_id)
  return math.sqrt(sum([(item_to_rec_score_mapping[item] - rating)**2 for item,rating in rating_dict.items() if item in item_to_rec_score_mapping]) / len(rating_dict))

In [None]:
rmse_list = []
for user_id in users['user_id']:
  rmse = None
  try:
    rmse = RMSE(REC_MODEL, user_id)
  except Exception:
    pass
  rmse_list.append(rmse)

In [None]:
users['RMSE'] = rmse_list

In [None]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code,top_item_rec_10,top_item_rec_9,top_item_rec_8,top_item_rec_7,top_item_rec_6,top_item_rec_5,top_item_rec_4,top_item_rec_3,top_item_rec_2,top_item_rec_1,MAE,RMSE
0,1,24,M,technician,85711,"(157, 0.7723271263304753)","(140, 0.8083460131843717)","(289, 0.8229038569562933)","(403, 0.8492584991192819)","(189, 0.8784724637141467)","(24, 0.9557791230818324)","(77, 0.9651263988003596)","(52, 0.9972043688387865)","(49, 1.087835370472427)","(357, 1.23732700543932)",3.266474,3.528124
1,2,53,F,other,94043,"(24, 1.734229293430314)","(403, 1.8854881320126007)","(52, 1.9566243149946847)","(289, 2.0108796446376007)","(357, 2.059746434135098)","(60, 2.202347458984853)","(95, 2.304751459943746)","(719, 2.325810598980851)","(652, 2.4027525621185557)","(157, 2.567396864327809)",3.177943,3.355227
2,3,23,M,writer,32067,"(201, 2.348181818848246)","(408, 2.3683929522663227)","(52, 2.4013931428571524)","(156, 2.451994615216075)","(102, 2.4581523459491534)","(357, 2.479380701772362)","(231, 2.566556285976035)","(347, 2.654817646168223)","(161, 2.7931368912319248)","(101, 2.8234194961046883)",2.085301,2.458474
3,4,24,M,technician,43537,"(403, 4.385194023508565)","(24, 4.782620276071388)","(161, 4.850154219111494)","(31, 4.915006350252232)","(209, 4.938918582294495)","(49, 5.161245839690811)","(101, 5.204847192798675)","(216, 5.457915294228182)","(52, 5.724455259087394)","(357, 6.74437225820977)",4.221648,4.315035
4,5,33,F,other,15213,"(280, 1.332012758696001)","(130, 1.3827808208424306)","(491, 1.3922983194069607)","(60, 1.4094121070659302)","(98, 1.4256553927622204)","(289, 1.5773244017850399)","(157, 1.702531196701317)","(1, 1.7050668359749828)","(652, 1.8711352302869544)","(719, 1.9188250646442773)",2.415287,2.77578


In [None]:
def RMSE_batch(rec_model , user_ids):
  if len(user_ids) == 0:
    return None
  return sum([RMSE(rec_model , user_id) for user_id in user_ids]) / len(user_ids)

In [None]:
protected_group_gen = gen_protected_groups()
rmse_data = {}

while True:
  try:
    protected_group_data = next(protected_group_gen)
    
    rmse_data[(protected_group_data['age_range'],
              protected_group_data['gender'],
              protected_group_data['occupation'])] = RMSE_batch(rec_model=REC_MODEL,
                                                               user_ids = protected_group_data['user_ids'])
  except StopIteration:
    break

**Max RMSE group**

In [None]:
max([entry for entry in rmse_data.items() if entry[1] is not None], key=lambda entry: entry[1])

((<AgeRange.young: 'young'>, <Gender.female: 'female'>, 'writer'),
 4.221165139103791)

**Min RMSE group**

In [None]:
min([entry for entry in rmse_data.items() if entry[1] is not None], key=lambda entry: entry[1])

((<AgeRange.young: 'young'>, <Gender.female: 'female'>, 'healthcare'),
 1.9347284731687096)

**MAP@K**

In [None]:
def get_k_most_rec(rec_model , user_id, k):
  item_to_rec_score_mapping = get_item_to_rec_score_mapping(rec_model , user_id)
  return [item for item,_ in sorted(item_to_rec_score_mapping.items(), key=lambda entry: entry[1])[-k:]]

In [None]:
def MAP_K(rec_model , user_id, k):
  most_rec_items = get_k_most_rec(rec_model , user_id, k)
  return most_rec_items

In [None]:
a = MAP_K(REC_MODEL, 1, 300)

In [None]:
user_ratings = rating_groups.get_group(1)

# len(user_ratings.groupby(by=['rating']).get_group(5))
b = user_ratings[user_ratings['rating']==5]['movie_id']
for item in a:
  if item in b:
    print(item) 
# list()

In [None]:
# from cornac.models import PMF

# # Load the MovieLens 100K dataset
# ml_100k = movielens.load_feedback()
# a
# # Instantiate an evaluation method.
# ratio_split = RatioSplit(data=ml_100k, test_size=0.2, rating_threshold=4.0, exclude_unknowns=False)

# # Instantiate a PMF recommender model.
# pmf = PMF(k=10, max_iter=100, learning_rate=0.001, lamda=0.001)

# # Instantiate evaluation metrics.
# mae = cornac.metrics.MAE()
# rmse = cornac.metrics.RMSE()
# rec_20 = cornac.metrics.Recall(k=20)
# pre_20 = cornac.metrics.Precision(k=20)

# # Instantiate and then run an experiment.
# exp = cornac.Experiment(eval_method=ratio_split,
#                         models=[pmf],
#                         metrics=[mae, rmse, rec_20, pre_20],
#                         user_based=True)
# exp.run()

**NDCG@K**

In [None]:
def NDCG_K(rec_model , user_id , k):
  most_rec_items = get_k_most_rec(rec_model , user_id , k)
  user_rated_movie_ids = set(rating_groups.get_group(user_id)['movie_id'])
  return len([item for item in most_rec_items if item in user_rated_movie_ids]) / len(most_rec_items)
  # user_item_ratings = rating_groups.get_group(user_id).groupby(by=['rating'])
  
  # rating_movies = {
  #     rating: list(user_item_ratings.get_group(rating)['movie_id'])
  #     for rating in range(1,6)
  # }
  
  # return rating_movies

In [None]:
def average(arr):
  return sum(arr) / len(arr)

In [None]:
K=20
average([NDCG_K(REC_MODEL, user_id, K) for user_id in users['user_id']])

0.1060445387062562

In [None]:
NDCG_K(REC_MODEL, 1, 20)

0.4