In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from scipy.sparse import dok_matrix
from tqdm import tqdm

In [2]:
class BPR(nn.Module):
	def __init__(self, user_num, item_num, factor_num):
		super(BPR, self).__init__()
		"""
		user_num: number of users;
		item_num: number of items;
		factor_num: number of predictive factors.
		"""		
		self.embed_user = nn.Embedding(user_num, factor_num)
		self.embed_item = nn.Embedding(item_num, factor_num)

		nn.init.normal_(self.embed_user.weight, std=0.01)
		nn.init.normal_(self.embed_item.weight, std=0.01)

	def forward(self, user, item_i, item_j):
		user = self.embed_user(user)
		item_i = self.embed_item(item_i)
		item_j = self.embed_item(item_j)

		prediction_i = (user * item_i).sum(dim=-1)
		prediction_j = (user * item_j).sum(dim=-1)
		return prediction_i, prediction_j

In [3]:
class BPRDataset(Dataset):
	def __init__(self, data_path, num_negative=5, is_training=True):
		super(BPRDataset, self).__init__()
		
		self.train_df = pd.read_csv(data_path)
		self.train_df = self.train_df[['user', 'item']].sort_values(by=['user'])

		self.zero_based_mapping()
		self.get_sparse_matrix()

		self.num_negative = num_negative
		self.is_training = is_training
		self.features = self.train_df.values

	def negative_sampling(self):
		assert self.is_training, 'no need to sampling when testing'

		negative_samples = []
		
		for u, i in tqdm(self.train_df.values):
			for _ in range(self.num_negative):
				j = np.random.randint(self.n_item)
				while (u, j) in self.train_matrix:
					j = np.random.randint(self.n_item)
				negative_samples.append([u, i, j])
		
		self.features = negative_samples

	def __len__(self):
		return self.num_negative * len(self.train_df) if \
				self.is_training else len(self.train_df)

	def __getitem__(self, idx):
		user = self.features[idx][0]
		item_i = self.features[idx][1]
		item_j = self.features[idx][2] if \
				self.is_training else self.features[idx][1]
		return user, item_i, item_j 
	
	def zero_based_mapping(self) :
		users = list(set(self.train_df.loc[:,'user']))
		items =  list(set(self.train_df.loc[:, 'item']))

		self.n_user = len(users)
		self.n_item = len(items)

		# user, item을 zero-based index로 mapping
		if self.n_user-1 != max(users):
			users_dict = {users[i]: i for i in range(len(users))}
			self.train_df['user']  = self.train_df['user'].map(lambda x : users_dict[x])

		if self.n_item-1 != max(items):
			items_dict = {items[i]: i for i in range(len(items))}
			self.train_df['item']  = self.train_df['item'].map(lambda x : items_dict[x])
	
	def get_sparse_matrix(self):
		train_matrix = dok_matrix((self.n_user, self.n_item), dtype=np.float32)
		for u, i in tqdm(self.train_df.values):
			train_matrix[u, i] = 1.0
		self.train_matrix = train_matrix

In [4]:
data_dir = '/opt/ml/movie-recommendation/data/train/'
train_dataset = BPRDataset(data_dir + 'train_ratings.csv')
test_dataset = BPRDataset(data_dir + 'train_ratings.csv', is_training=False)

100%|██████████| 5154471/5154471 [00:56<00:00, 90957.07it/s]
100%|██████████| 5154471/5154471 [00:56<00:00, 91138.10it/s]


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BPR(train_dataset.n_user, train_dataset.n_item, 7).to(device)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True, num_workers=4)
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [6]:
for epoch in range(1):
	model.train() 
	train_loader.dataset.negative_sampling()

	for user, item_i, item_j in train_loader:
		user = user.to(device)
		item_i = item_i.to(device)
		item_j = item_j.to(device)

		model.zero_grad()
		prediction_i, prediction_j = model(user, item_i, item_j)
		loss =- (prediction_i - prediction_j).sigmoid().log().sum()
		loss.backward()
		optimizer.step()

100%|██████████| 5154471/5154471 [02:05<00:00, 41145.06it/s]
