In [1]:
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
from model import NeuMF

from ast import arg
import os
import argparse
import pandas as pd
import numpy as np

In [3]:
parser = argparse.ArgumentParser()
parser.add_argument('--gpu_id',
    type=int,
    default=0 if torch.cuda.is_available() else -1,
    help="gpu id")
parser.add_argument("--seed", 
    type=int, 
    default=42, 
    help="Seed")
parser.add_argument("--lr", 
    type=float, 
    default=0.001, 
    help="learning rate")
parser.add_argument("--dropout", 
    type=float,
    default=0.2,  
    help="dropout rate")
parser.add_argument("--batch_size", 
    type=int, 
    default=256, 
    help="batch size for training")
parser.add_argument("--top_k", 
    type=int, 
    default=10, 
    help="compute metrics@top_k")
parser.add_argument("--factor_num", 
    type=int,
    default=32, 
    help="predictive factors numbers in the model")
parser.add_argument("--layers",
    nargs='+', 
    default=[64,32,16,8],
    help="MLP layers. Note that the first layer is the concatenation of user and item embeddings. So layers[0]/2 is the embedding size.")
parser.add_argument("--num_ng", 
    type=int,
    default=4, 
    help="Number of negative samples for training set")
parser.add_argument("--num_ng_test", 
    type=int,
    default=100, 
    help="Number of negative samples for test set")
    
parser.add_argument("--DATA_PATH", 
    default='/opt/ml/input/data/train/train_ratings.csv',
    help="Data path")
parser.add_argument("--MODEL_PATH", 
    default='/opt/ml/NCF/models/',
    help="Model path")
parser.add_argument("--OUT_PATH", 
    default='/opt/ml/NCF/out_csv/',
    help="output csv file path")
parser.add_argument("--OUT_CSV_FILE", 
    default='output',
    help="output csv file path")
parser.add_argument("--MODEL", 
    default='PRE_NCF3',
    help="Model name")


args = parser.parse_args([])

In [4]:
from util import seed_everything

device = torch.device('cpu') if args.gpu_id < 0 else torch.device('cuda:%d' % args.gpu_id)   

# seed for Reproducibility
seed_everything(args.seed)

In [5]:
train_df = pd.read_csv(args.DATA_PATH)
train_df.columns = ['user_id', 'item_id', 'timestamp']

In [21]:
import random

class NCF_Data(object):
	"""
	Construct Dataset for NCF
	"""
	def __init__(self, args, ratings):
		self.ratings = ratings
		self.num_ng = args.num_ng
		self.num_ng_test = args.num_ng_test
		self.batch_size = args.batch_size

		self.preprocess_ratings = self._reindex(self.ratings)

		self.user_pool = set(self.ratings['user_id'].unique())
		self.item_pool = set(self.ratings['item_id'].unique())

		self.train_ratings, self.test_ratings = self._leave_one_out(self.preprocess_ratings)
		self.negatives = self._negative_sampling(self.preprocess_ratings)
		random.seed(args.seed)
	
	def _reindex(self, ratings):
		"""
		Process dataset to reindex userID and itemID, also set rating as binary feedback
		"""
		user_list = list(ratings['user_id'].drop_duplicates())
		user2id = {w: i for i, w in enumerate(user_list)}

		item_list = list(ratings['item_id'].drop_duplicates())
		item2id = {w: i for i, w in enumerate(item_list)}

		ratings['user_id'] = ratings['user_id'].apply(lambda x: user2id[x])
		ratings['item_id'] = ratings['item_id'].apply(lambda x: item2id[x])
		ratings['rating'] = ratings['timestamp'].apply(lambda x: float(x > 0))

		return ratings

	def _leave_one_out(self, ratings):
		"""
		leave-one-out evaluation protocol in paper https://www.comp.nus.edu.sg/~xiangnan/papers/ncf.pdf
		"""
		ratings['rank_latest'] = ratings.groupby(['user_id'])['timestamp'].rank(method='first', ascending=False)
		test = ratings.loc[ratings['rank_latest'] == 1]
		train = ratings.loc[ratings['rank_latest'] > 1]

		assert train['user_id'].nunique()==test['user_id'].nunique(), 'Not Match Train User with Test User'
		
		return train[['user_id', 'item_id', 'rating']], test[['user_id', 'item_id', 'rating']]
	

	def _negative_sampling(self, ratings):
		interact_status = (
			ratings.groupby('user_id')['item_id']
			.apply(set)
			.reset_index()
			.rename(columns={'item_id': 'interacted_items'}))
		interact_status['negative_items'] = interact_status['interacted_items'].apply(lambda x: self.item_pool - x)
		interact_status['negative_samples'] = interact_status['negative_items'].apply(lambda x: random.sample(x, self.num_ng_test))

		return interact_status[['user_id', 'negative_items', 'negative_samples']]

In [22]:
dataprocess = NCF_Data(args, train_df)

In [23]:
out_reindex=dataprocess._reindex(train_df)

In [24]:
user_pool = set(train_df['user_id'].unique())
item_pool = set(train_df['item_id'].unique())

In [25]:
train_ratings, test_ratings = dataprocess._leave_one_out(out_reindex)

In [26]:
negatives = dataprocess._negative_sampling(out_reindex)

In [12]:
negatives

Unnamed: 0,user_id,negative_items,negative_samples
0,0,"{376, 377, 378, 379, 380, 381, 382, 383, 384, ...","[5614, 1288, 580, 6450, 2629, 2382, 2204, 1519..."
1,1,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[3966, 3288, 2391, 5422, 5817, 4742, 1979, 578..."
2,2,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[6455, 4421, 4, 4983, 2732, 4079, 161, 993, 30..."
3,3,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[897, 540, 3389, 6056, 2870, 6649, 986, 2127, ..."
4,4,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[2727, 2109, 2329, 3396, 1226, 5656, 5441, 261..."
...,...,...,...
31355,31355,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[113, 2501, 6442, 6738, 2652, 6060, 5772, 3427..."
31356,31356,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[2316, 2213, 6741, 4895, 1269, 4279, 2297, 424..."
31357,31357,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[2417, 5686, 52, 2438, 5635, 6164, 1556, 1769,..."
31358,31358,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[1, 5272, 5130, 5689, 5154, 4211, 6217, 3341, ..."


In [13]:
from util import seed_everything

device = torch.device('cpu') if args.gpu_id < 0 else torch.device('cuda:%d' % args.gpu_id)   

# seed for Reproducibility
seed_everything(args.seed)

train_df = pd.read_csv(args.DATA_PATH)
train_df.columns = ['user_id', 'item_id', 'timestamp']

# set the num_users, items
num_users = train_df['user_id'].nunique()
num_items = train_df['item_id'].nunique()

user_list = list(train_df['user_id'].drop_duplicates())
user2id = {w: i for i, w in enumerate(user_list)}
id2user = {i: w for i, w in enumerate(user_list)}

item_list = list(train_df['item_id'].drop_duplicates())
item2id = {w: i for i, w in enumerate(item_list)}
id2item = {i: w for i, w in enumerate(item_list)}

In [14]:
train_df['user_id'] = train_df['user_id'].apply(lambda x: user2id[x])
train_df['item_id'] = train_df['item_id'].apply(lambda x: item2id[x])

In [15]:
train_df

Unnamed: 0,user_id,item_id,timestamp
0,0,0,1230782529
1,0,1,1230782534
2,0,2,1230782539
3,0,3,1230782542
4,0,4,1230782563
...,...,...,...
5154466,31359,423,1260209449
5154467,31359,1491,1260209482
5154468,31359,331,1260209720
5154469,31359,733,1260209726


In [16]:
train_df['rating'] = train_df['timestamp'].apply(lambda x: float(x > 0))  

In [19]:
result = np.zeros((2, args.top_k * num_users), dtype=np.int64) 

interact_status = (
    train_df.groupby('user_id')['item_id']
    .apply(list)
    .reset_index()
    .rename(columns={'item_id': 'interacted_items'}))

In [20]:
eval_user = torch.ones(num_items, dtype=torch.long)
eval_items = torch.arange(0, num_items, dtype=torch.long)

In [59]:
# set model
input_path = os.path.join(args.MODEL_PATH, args.MODEL)
model = torch.load(input_path).to(device)

#model = NeuMF(args, num_users, num_items)
#model.load_state_dict(torch.load('{}{}.pth'.format(args.MODEL_PATH, args.MODEL)))
#model = model.to(device)

model.eval()

NeuMF(
  (embedding_user_mlp): Embedding(31360, 32)
  (embedding_item_mlp): Embedding(6807, 32)
  (embedding_user_mf): Embedding(31360, 32)
  (embedding_item_mf): Embedding(6807, 32)
  (fc_layers): ModuleList(
    (0): Linear(in_features=64, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=8, bias=True)
    (5): ReLU()
  )
  (affine_output): Linear(in_features=40, out_features=1, bias=True)
  (logistic): Sigmoid()
)

In [60]:
with torch.no_grad():
    for user in range(num_users):
        users = (user * eval_user).to(device)
        items = eval_items.to(device)

        prediction = model(users, items)
        sorted_items = prediction.argsort(descending=True)
        positive_samples = interact_status.iloc[user]['interacted_items']

        rec_items = np.setdiff1d(np.array(sorted_items.to('cpu')), positive_samples)

        result[0, 10*user:10*(user+1)] = user
        result[1, 10*user:10*(user+1)] = rec_items[:args.top_k]

In [61]:
test_df = pd.DataFrame()
test_df['user_id'] = result[0].T
test_df['item_id'] = result[1].T

In [62]:
test_df

Unnamed: 0,user_id,item_id
0,0,376
1,0,377
2,0,378
3,0,379
4,0,380
...,...,...
313595,31359,7
313596,31359,8
313597,31359,9
313598,31359,10


In [65]:

test_df['user'] = test_df['user_id'].apply(lambda x: id2user[x])
test_df['item'] = test_df['item_id'].apply(lambda x: id2item[x])

del test_df['item_id'], test_df['user_id'] 

test_df.to_csv('{}{}.csv'.format(args.OUT_PATH, args.OUT_CSV_FILE), index=False)

In [66]:
test_df

Unnamed: 0,user,item
0,11,8961
1,11,1396
2,11,471
3,11,1042
4,11,1947
...,...,...
313595,138493,2688
313596,138493,2428
313597,138493,3113
313598,138493,1591


In [None]:
from util import seed_everything

device = torch.device('cpu') if args.gpu_id < 0 else torch.device('cuda:%d' % args.gpu_id)   

# seed for Reproducibility
seed_everything(args.seed)

train_df = pd.read_csv(args.DATA_PATH)
train_df.columns = ['user_id', 'item_id', 'timestamp']

# set the num_users, items
num_users = train_df['user_id'].nunique()
num_items = train_df['item_id'].nunique()

user_list = list(train_df['user_id'].drop_duplicates())
user2id = {w: i for i, w in enumerate(user_list)}

item_list = list(train_df['item_id'].drop_duplicates())
item2id = {w: i for i, w in enumerate(item_list)}

train_df['user_id'] = train_df['user_id'].apply(lambda x: user2id[x])
train_df['item_id'] = train_df['item_id'].apply(lambda x: item2id[x])
train_df['rating'] = train_df['timestamp'].apply(lambda x: float(x > 0))  

result = np.zeros((2, args.top_k * num_users), dtype=np.int64) 

interact_status = (
    train_df.groupby('user_id')['item_id']
    .apply(list)
    .reset_index()
    .rename(columns={'item_id': 'interacted_items'}))

eval_user = torch.ones(num_items)
eval_items = torch.arange(0, num_items, dtype=torch.float32)

# set model
input_path = os.path.join(args.MODEL_PATH, args.MODEL)
model = torch.load(input_path).to(device)

#model = NeuMF(args, num_users, num_items)
#model.load_state_dict(torch.load('{}{}.pth'.format(args.MODEL_PATH, args.MODEL)))
#model = model.to(device)

model.eval()

with torch.no_grad():
    for user in range(num_users):
        users = (user * eval_user).to(device)
        items = eval_items.to(device)

        prediction = model(users, items)
        sorted_items = prediction.argsort(descending=True)
        positive_samples = interact_status.iloc[user]['interacted_items']

        rec_items = np.setdiff1d(np.array(sorted_items.to('cpu')), positive_samples)

        result[0, 10*user:10*(user+1)] = user
        result[1, 10*user:10*(user+1)] = rec_items[:args.top_k]

test_df = pd.DataFrame()
test_df['user_id'] = result[0].T
test_df['item_id'] = result[1].T

test_df['user'] = test_df['user_id'].apply(lambda x: user2id[user2id==x].keys()[0])
test_df['item'] = test_df['item_id'].apply(lambda x: item2id[item2id==x].keys()[0])

del test_df['item_id'], test_df['user_id'] 

test_df.to_csv('{}{}.csv'.format(args.OUT_PATH, args.OUT_CSV_FILE), index=False)