In [1]:
import pandas as pd
from datetime import datetime
from copy import deepcopy
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
import random
import os
import time

import mf
import ncf
import data
#from gmf import GMFEngine
#from mlp import MLPEngine
#from neumf import NeuMFEngine
#from data import SampleGenerator

In [2]:
# Read the reviews
ratings = pd.read_json('data/reviews.json',
                  lines=True,
                  dtype={'review_id':str,
                         'user_id':str,
                         'business_id':str,
                         'stars':int,
                         'date':str,
                         'useful':int,
                         'funny':int, 
                         'cool':int,
                         'text':str},
                  encoding='utf-8')
ratings = ratings[['business_id', 'user_id', 'stars', 'date']]
ratings.rename(columns={'business_id': 'item_id', 'stars': 'rating'}, inplace=True)
ratings.head()

Unnamed: 0,item_id,user_id,rating,date
0,qUWqjjjfpB2-4P3He5rsKw,TZQSUDDcA4ek5gBd6BzcjA,4,2017-06-06 19:04:10
1,9Eghhu_LzEJgDKNgisf3rg,iFEM16O5BeKvf_AHs6NzUQ,3,2016-08-25 16:25:12
2,4m_hApwQ054v3ue_OxFmGw,Eg_VTAQwK3CxYTORNRWd2g,5,2013-02-21 23:37:18
3,jM6JjZXFAQQANwdlEwbH3g,IIE2OX5haarGNefY23R8BA,5,2016-04-23 03:32:19
4,PMPMHHLe9vFzcR67iCjgLA,RRhco4irDXPk5EJmvuwr2w,5,2016-01-17 05:23:59


In [3]:
# Eliminate duplicate interactions
ratings.drop_duplicates(subset=['item_id', 'user_id'], inplace=True)

In [4]:
print('Number of items is {}'.format(len(ratings.item_id.unique())))
print('Number of users is {}'.format(len(ratings.user_id.unique())))
print('Number of interactions is {}'.format(len(ratings)))
print('Sparsity of the dataset is {}%'.format(100 * (1 - len(ratings) / len((ratings.item_id.unique())) / len(ratings.user_id.unique()))))

Number of items is 5462
Number of users is 90665
Number of interactions is 329018
Sparsity of the dataset is 99.93356020306688%


In [5]:
interact_user = ratings.groupby(['user_id']).size().reset_index(name='user_interactions')
interact_user.head()

Unnamed: 0,user_id,user_interactions
0,--7gjElmOrthETJ8XqzMBw,3
1,--BumyUHiO_7YsHurb9Hkw,38
2,--C93xIlmjtgQfSOIpcQSA,1
3,--DKDJlRHfsvufdGSk_Sdw,1
4,--EVSb3jbKVL3WJ5NUCuCA,1


In [6]:
interact_item = ratings.groupby(['item_id']).size().reset_index(name='item_interactions')
interact_item.sort_values(by=['item_interactions'], inplace = True)
interact_item.head()

Unnamed: 0,item_id,item_interactions
2368,R8OZCdiGzYm2Jq4-lPHbtg,3
887,9jJxXCuwku2l8arVGrBNTA,3
2161,Onnk1dC98D8w7g4UXoXxLQ,3
2162,Oo6VFjqP5c_uQs12l-lcXw,3
1768,Ju6_8n763pSltEiz5Tt-DQ,3


In [7]:
# Eliminate the users with insufficient information
ratings_cleaned = pd.merge(ratings, interact_user[['user_id', 'user_interactions']], on='user_id')
ratings_cleaned = pd.merge(ratings_cleaned, interact_item[['item_id', 'item_interactions']], on='item_id')
ratings_cleaned = ratings_cleaned.loc[(ratings_cleaned['user_interactions'] >= 5) & (ratings_cleaned['item_interactions'] >=1)]
ratings_cleaned = ratings_cleaned[['item_id', 'user_id', 'rating', 'date']]
ratings_cleaned.head()

Unnamed: 0,item_id,user_id,rating,date
0,qUWqjjjfpB2-4P3He5rsKw,TZQSUDDcA4ek5gBd6BzcjA,4,2017-06-06 19:04:10
1,qUWqjjjfpB2-4P3He5rsKw,BwwqlPVsJk1dbUyNwCWB7Q,4,2017-08-27 11:04:38
2,qUWqjjjfpB2-4P3He5rsKw,_t3BJzyGaqr9mcDazYiYAQ,4,2016-08-09 22:19:27
3,qUWqjjjfpB2-4P3He5rsKw,s-QuErK1oRlAzT5qnO6FOQ,4,2017-06-20 14:15:41
4,qUWqjjjfpB2-4P3He5rsKw,jz1_R_ilTsC-Kqc04oa3IA,2,2019-02-03 06:52:09


In [8]:
print('Number of restaurants is {}'.format(len(ratings_cleaned.item_id.unique())))
print('Number of users is {}'.format(len(ratings_cleaned.user_id.unique())))
print('Number of interactions is {}'.format(len(ratings_cleaned)))
print('Sparsity of the dataset is {}%'.format(100 * (1 - len(ratings) / len((ratings_cleaned.item_id.unique())) / len(ratings_cleaned.user_id.unique()))))

Number of restaurants is 5417
Number of users is 13651
Number of interactions is 210467
Sparsity of the dataset is 99.55506521947534%


In [9]:
# Convert business_id and user_id to int for convenience
ratings_cleaned.rename(columns={'item_id': 'item_id_original', 'user_id': 'user_id_original'}, inplace=True)
ratings_cleaned['item_id'] = pd.factorize(ratings_cleaned.item_id_original)[0]
ratings_cleaned['user_id'] = pd.factorize(ratings_cleaned.user_id_original)[0]
ratings_cleaned['date'] = ratings_cleaned['date'].apply(lambda x: pd.Timestamp(x))
# items_idx stores both the factorized indices and the original indices
items_idx = ratings_cleaned[['item_id', 'item_id_original']]
items_idx.drop_duplicates(subset=['item_id', 'item_id_original'], inplace=True)

ratings_cleaned = ratings_cleaned[['item_id', 'user_id', 'rating', 'date']]
ratings_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,item_id,user_id,rating,date
0,0,0,4,2017-06-06 19:04:10
1,0,1,4,2017-08-27 11:04:38
2,0,2,4,2016-08-09 22:19:27
3,0,3,4,2017-06-20 14:15:41
4,0,4,2,2019-02-03 06:52:09


In [73]:
mf_config = {'model': 'MF',
             'num_epoch': 100,
             'batch_size': 256,
             'optimizer': 'adam',
             'adam_lr': 1e-3,
             'num_users': len(ratings_cleaned.user_id.unique()),
             'num_items': len(ratings_cleaned.item_id.unique()),
             'latent_dim': 8,
             'num_negative': 4,
             'l2_regularization': 0.000001, # 0.01
             'use_cuda': True,
             'device_id': 0,
             'load_checkpoint': False}
ncf_config = {'model': 'NCF',
              'num_epoch': 100,
              'batch_size': 256,
              'optimizer': 'adam',
              'adam_lr': 1e-3,
              'num_users': len(ratings_cleaned.user_id.unique()),
              'num_items': len(ratings_cleaned.item_id.unique()),
              'latent_dim_mf': 8,
              'latent_dim_mlp': 8,
              'num_negative': 4,
              'layers': [16,32,16,8],  # layers[0] is the concat of latent user vector & latent item vector
              'l2_regularization': 0.000001,
              'use_cuda': True,
              'device_id': 0,
              'load_checkpoint': False,
              'pretrained_ebdg': False, # weather use pretrained embedding from GMF
              }

In [11]:
sample_generator = data.SampleGenerator(ratings_cleaned)
# Generate test set
evaluate_data = sample_generator.evaluate_data
val_data = sample_generator.validation_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  ratings['rating'][ratings['rating'] > 0] = 1.0


In [None]:
# run classical matric factorization
config = mf_config
engine = mf.MFEngine(config)
# Load pretrained weights
if config['load_checkpoint']: 
    os.makedirs('checkpoints', exist_ok = True)
    checkpoint = torch.load('checkpoints/MF_ckpt.pth')
    engine.model.load_state_dict(checkpoint['model_state_dict'])
    start_epoch = checkpoint['epoch'] + 1
    engine.opt.load_state_dict(checkpoint['optimizer_state_dict'])
    HR_table = pd.read_csv('MF_HR_table.csv')
    best_HR = checkpoint['best_HR']
else: 
    start_epoch = 0
    HR_table = pd.DataFrame(columns = ['epoch', 'HR', 'NDCG'])
    best_HR = 0 # best_HR: 0.4449
    
best_HR = 0
for epoch in range(start_epoch, config['num_epoch']):
    start_time = time.time()
    print('Epoch {} starts !'.format(epoch))
    train_loader = sample_generator.instance_a_train_loader(config['num_negative'], config['batch_size'])
    engine.train_an_epoch(train_loader, epoch_id=epoch)
    hit_ratio, ndcg = engine.evaluate(val_data, epoch_id=epoch)
    
    if epoch % 1 == 0:
        torch.save({'epoch': epoch,
                    'best_HR': best_HR, 
                    'model_state_dict': engine.model.state_dict(),
                    'optimizer_state_dict': engine.opt.state_dict(),
                    'loss': engine.loss},
                    f"checkpoints/MF_ckpt.pth")
        
    if hit_ratio > best_HR: 
        torch.save({'epoch': epoch,
                    'model_state_dict': engine.model.state_dict(),
                    'optimizer_state_dict': engine.opt.state_dict(),
                    'loss': engine.loss},
                    f"checkpoints/MF_best_ckpt.pth")
        best_HR = hit_ratio
    
    HR_table = HR_table.append({'epoch': epoch, 'HR': hit_ratio, 'NDCG': ndcg}, ignore_index=True)
    HR_table.to_csv('MF_HR_table.csv', index=False)
    print('Epoch {} ends, time: {}s, best HR: {}'.format(epoch, time.time()-start_time, best_HR))
    print('-' * 80)

In [None]:
# Run NCF 
config = ncf_config
engine = ncf.NCFEngine(config)
# Load pretrained weights
if config['load_checkpoint']:  
    os.makedirs('checkpoints', exist_ok = True)
    checkpoint = torch.load('checkpoints/NeuMF_ckpt.pth')
    engine.model.load_state_dict(checkpoint['model_state_dict'])
    start_epoch = checkpoint['epoch'] + 1
    engine.opt.load_state_dict(checkpoint['optimizer_state_dict'])
    HR_table = pd.read_csv('NeuMF_HR_table.csv')
    best_HR = checkpoint['best_HR']
else: 
    start_epoch = 0
    HR_table = pd.DataFrame(columns = ['epoch', 'HR', 'NDCG'])
    best_HR = 0
    
for epoch in range(start_epoch, config['num_epoch']):
    start_time = time.time()
    print('Epoch {} starts !'.format(epoch))
    train_loader = sample_generator.instance_a_train_loader(config['num_negative'], config['batch_size'])
    engine.train_an_epoch(train_loader, epoch_id=epoch)
    hit_ratio, ndcg = engine.evaluate(val_data, epoch_id=epoch)
    
    if epoch % 1 == 0:
        torch.save({'epoch': epoch,
                    'best_HR': best_HR, 
                    'model_state_dict': engine.model.state_dict(),
                    'optimizer_state_dict': engine.opt.state_dict(),
                    'loss': engine.loss},
                    f"checkpoints/NeuMF_ckpt.pth")
    if hit_ratio > best_HR: 
        torch.save({'epoch': epoch,
                    'model_state_dict': engine.model.state_dict(),
                    'optimizer_state_dict': engine.opt.state_dict(),
                    'loss': engine.loss},
                    f"checkpoints/NeuMF_best_ckpt.pth")
        best_HR = hit_ratio
        
    HR_table = HR_table.append({'epoch': epoch, 'HR': hit_ratio, 'NDCG': ndcg}, ignore_index=True)
    HR_table.to_csv('NeuMF_HR_table.csv', index=False)
    print('Epoch {} ends, time: {}s, best HR: {}'.format(epoch, time.time()-start_time, best_HR))
    print('-' * 80)

In [75]:
# Load the best weights to GMF model *** run this cell or run the next cell, don't run both
config = mf_config
engine = mf.MFEngine(config)
best_checkpoint = torch.load('checkpoints/MF_best_ckpt.pth')
engine.model.load_state_dict(best_checkpoint['model_state_dict'])
engine.opt.load_state_dict(best_checkpoint['optimizer_state_dict'])

In [63]:
# Load the best weights to NCF model
config = ncf_config
engine = ncf.NCFEngine(config)
best_checkpoint = torch.load('checkpoints/NeuMF_best_ckpt.pth')
engine.model.load_state_dict(best_checkpoint['model_state_dict'])
engine.opt.load_state_dict(best_checkpoint['optimizer_state_dict'])

NCF(
  (embedding_user_mlp): Embedding(13651, 8)
  (embedding_item_mlp): Embedding(5417, 8)
  (embedding_user_mf): Embedding(13651, 8)
  (embedding_item_mf): Embedding(5417, 8)
  (fc_layers): ModuleList(
    (0): Linear(in_features=16, out_features=32, bias=True)
    (1): Linear(in_features=32, out_features=16, bias=True)
    (2): Linear(in_features=16, out_features=8, bias=True)
  )
  (affine_output): Linear(in_features=16, out_features=1, bias=True)
  (logistic): Sigmoid()
)


In [76]:
# Evaluate the performance of the best MF or NCF model on test set
_, _ = engine.evaluate(evaluate_data, epoch_id=epoch)

[Evluating Epoch 99] HR = 0.5177, NDCG = 0.2962


In [18]:
# Load the items, they will be used in demo
items_list = pd.read_json('data/business.json',
                  lines=True,
                  encoding='utf-8')
items_list.rename(columns={'business_id': 'item_id_original'}, inplace=True)
items_list = pd.merge(items_idx, items_list, on='item_id_original')
items_list = items_list[['item_id', 'name', 'categories']]
items_list.head()

Unnamed: 0,item_id,name,categories
0,0,Levetto,"Pizza, Italian, Salad, Restaurants"
1,1,Café La Gaffe,"French, Restaurants, Breakfast & Brunch"
2,2,Pizzeria Via Mercanti,"Pizza, Restaurants"
3,3,Niuda Hand-Pulled Noodles,"Restaurants, Chinese, Noodles"
4,4,Light Cafe,"Cafes, Restaurants"


In [66]:
test_users, test_items = evaluate_data[0], evaluate_data[1]
negative_users, negative_items = evaluate_data[2], evaluate_data[3]
if config['use_cuda'] is True:
    test_users = test_users.cuda()
    test_items = test_items.cuda()
    negative_users = negative_users.cuda()
    negative_items = negative_items.cuda()
test_scores = engine.model(test_users, test_items)
negative_scores = engine.model(negative_users, negative_items)
if config['use_cuda'] is True:
    test_users = test_users.cpu()
    test_items = test_items.cpu()
    test_scores = test_scores.cpu()
    negative_users = negative_users.cpu()
    negative_items = negative_items.cpu()
    negative_scores = negative_scores.cpu()
# Rank the scores of the test items of each user
engine._metron.subjects = [test_users.data.view(-1).tolist(),
                        test_items.data.view(-1).tolist(),
                        test_scores.data.view(-1).tolist(),
                        negative_users.data.view(-1).tolist(),
                        negative_items.data.view(-1).tolist(),
                        negative_scores.data.view(-1).tolist()]
scores = engine._metron._subjects
# Keep the top 10 recommendations only
scores = scores[scores['rank']<=10]
scores.rename(columns={'item': 'item_id'}, inplace=True)

In [21]:
pd.set_option('display.max_colwidth', -1)
user_id = 1
# get the last 10 reviews of selected user
user_review = ratings_cleaned[ratings_cleaned['user_id'] == user_id]
user_review.sort_values(by=['date'], inplace = True, ascending=False)
user_review = user_review.iloc[0:10]
user_review = pd.merge(user_review, items_list, on='item_id')[['item_id', 'name', 'categories']]
user_review

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,item_id,name,categories
0,18,Assembly Chef's Hall,"Japanese, Pizza, Coffee & Tea, Ramen, Barbeque, Food, Cafes, Food Court, Restaurants, Nightlife, Public Markets, Shopping, Vegan, Diners, Sushi Bars, Breakfast & Brunch, Bars"
1,108,Joy Thai Restaurant,"Seafood, Restaurants, Thai, Noodles"
2,106,Rudy,"Fast Food, Sandwiches, Burgers, Restaurants"
3,151,Little Sister,"Restaurants, Nightlife, Indonesian, Bars"
4,156,GB Hand-pulled Noodles,"Noodles, Chinese, Restaurants"
5,100,Siam Square Hut,"Thai, Asian Fusion, Caterers, Event Planning & Services, Restaurants"
6,155,The Parkdale Drink,"Nightlife, Lounges, Restaurants, Bars, Sushi Bars"
7,160,The Dime,"Nightlife, Burgers, Pubs, Restaurants, American (Traditional), Comfort Food, Bars"
8,96,Pho Cuu Long Mien Tay,"Restaurants, Vietnamese"
9,97,Ramen Isshin,"Noodles, Tapas/Small Plates, Food, Ramen, Japanese, Restaurants"


In [67]:
# get the top 10 recommendations of selected user
scores_user = scores[scores['user'] == user_id]
scores_user = pd.merge(scores_user, items_list, on='item_id')[['item_id', 'name', 'categories', 'score']]
scores_user

Unnamed: 0,item_id,name,categories,score
0,18,Assembly Chef's Hall,"Japanese, Pizza, Coffee & Tea, Ramen, Barbeque, Food, Cafes, Food Court, Restaurants, Nightlife, Public Markets, Shopping, Vegan, Diners, Sushi Bars, Breakfast & Brunch, Bars",0.830849
1,251,Salad King Restaurant,"Thai, Restaurants",0.778795
2,1866,Tabülè Middle Eastern Cuisine,"Middle Eastern, Restaurants",0.685031
3,1018,The Good Son,"Restaurants, Pizza, Bars, Canadian (New), Cocktail Bars, Nightlife",0.648689
4,1024,Luma,"Cocktail Bars, Religious Organizations, Lounges, Bars, Nightlife, American (New), Restaurants, Canadian (New), Wine Bars",0.632588
5,1088,Sushi On Bloor,"Restaurants, Sushi Bars",0.586555
6,444,Aji Sai Japanese Restaurant,"Japanese, Restaurants, Food, Buffets",0.543902
7,731,Peter Pan Bistro,"Mediterranean, Cocktail Bars, American (New), Canadian (New), French, Restaurants, Wine Bars, Nightlife, Bars, Bistros",0.526528
8,206,Rodney's Oyster House,"Seafood, Restaurants",0.515795
9,647,St Lawrence Market,"Farmers Market, Sandwiches, Restaurants, Grocery, Food",0.515549
