In [1]:
import warnings

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from scipy.sparse import dok_matrix
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [2]:
# Directory holding the raw training interactions.
data_dir = '/opt/ml/movie-recommendation/data/train/'
# Read the ratings log and keep only the (user, item) columns.
data = pd.read_csv(f'{data_dir}train_ratings.csv').loc[:, ['user', 'item']]

In [3]:
# Group the item column by user; iterating yields (user, Series of that user's items).
user_group_dfs = data.groupby('user')['item']

In [4]:
# Seed the global NumPy RNG so the hold-out split and the sampled negatives
# below are drawn from a fixed random stream on every run.
SEED = 42
np.random.seed(SEED)

train_dfs = []
test_dfs = []

# Every item id that appears anywhere in the interaction log.
items = set(data.loc[:, 'item'])

for u, u_items in tqdm(user_group_dfs):
    item_values = u_items.values
    num_data = len(item_values)
    num_test = int(num_data * 0.2)

    # Randomly pick the positions kept for training; the remaining positions
    # (in ascending order) form the held-out positives. setdiff1d replaces a
    # per-index `in` scan over the NumPy array, which was quadratic per user.
    train_idx = np.random.choice(num_data, num_data - num_test, replace=False)
    test_idx = np.setdiff1d(np.arange(num_data), train_idx, assume_unique=True)

    user_train_df = pd.DataFrame({'user': [u] * len(train_idx), 'item': item_values[train_idx]})
    user_test_df = pd.DataFrame({
        'user': [u] * len(test_idx),
        'item': item_values[test_idx],
        'rating': [1] * len(test_idx),
    })

    # Negatives for evaluation: items this user never interacted with.
    # Twice the number of held-out positives, with a floor of 10, capped at the
    # number of available candidates so sampling without replacement cannot fail.
    candidates = list(items - set(u_items))
    num_negs = len(test_idx) * 2 if len(test_idx) >= 10 else 10
    num_negs = min(num_negs, len(candidates))
    neg_items = np.random.choice(candidates, num_negs, replace=False)
    user_neg_df = pd.DataFrame({'user': [u] * num_negs, 'item': neg_items, 'rating': [0] * num_negs})

    train_dfs.append(user_train_df)
    test_dfs.extend([user_test_df, user_neg_df])

# ignore_index gives each combined frame a single 0..n-1 index instead of
# repeating every per-user frame's own 0..k index.
train_df = pd.concat(train_dfs, ignore_index=True)
test_df = pd.concat(test_dfs, ignore_index=True)

100%|██████████| 31360/31360 [01:41<00:00, 309.15it/s]


In [5]:
# Directory that receives the BPR train/validation splits.
data_dir = '/opt/ml/movie-recommendation/data/train/bpr/'
# index=False: the row index carries no information here and would otherwise be
# written as an extra unnamed column that shows up again when the CSV is read back.
train_df.to_csv(data_dir + 'train.csv', index=False)
test_df.to_csv(data_dir + 'valid.csv', index=False)

In [6]:
# Read both splits back from disk (train first, then validation).
train_df, valid_df = (pd.read_csv(data_dir + name) for name in ('train.csv', 'valid.csv'))

In [7]:
class BPRDataset(Dataset):
	"""Interaction dataset for BPR training and evaluation, read from a CSV file.

	In training mode the CSV must provide `user` and `item` columns and, once
	`negative_sampling()` has been called, each sample is a
	(user, positive item, negative item) triple. In evaluation mode the CSV must
	also provide `rating`, and each sample is (user, item, item).

	Args:
		data_path: path of the CSV file to load.
		num_negative: negatives drawn per interaction by `negative_sampling()`.
		is_training: selects training or evaluation mode (see above).
		train_matrix: optional user-by-item matrix of known interactions; when
			omitted it is built from this dataset's own rows.
		user_map: optional dict from raw user id to zero-based index. When omitted
			it is derived from this dataset's own rows (ids sorted ascending).
		item_map: same as `user_map`, for item ids.

	When a dataset is scored by a model trained on another dataset, pass that
	dataset's `user_map` / `item_map` here; otherwise the two datasets number
	their ids independently and the indices will generally not agree.
	"""

	def __init__(self, data_path, num_negative=5, is_training=True, train_matrix=None, user_map=None, item_map=None):
		super(BPRDataset, self).__init__()

		self.data = pd.read_csv(data_path)

		# Evaluation rows additionally carry the 0/1 relevance label.
		columns = ['user', 'item'] if is_training else ['user', 'item', 'rating']
		self.data = self.data[columns].sort_values(by=['user'])

		self.train_matrix = train_matrix
		self.user_map = user_map
		self.item_map = item_map

		self.zero_based_mapping()
		# Identity test on purpose: `== None` on a sparse matrix would invoke the
		# matrix's overloaded comparison instead of checking for a missing argument.
		if self.train_matrix is None:
			self.get_sparse_matrix()

		self.num_negative = num_negative
		self.is_training = is_training
		self.features = self.data.values

	def negative_sampling(self):
		"""Rebuild `self.features` as [user, positive item, negative item] triples.

		For every interaction row, `num_negative` items are drawn uniformly at
		random and redrawn while the (user, item) pair is present in
		`self.train_matrix`.
		"""
		assert self.is_training, 'no need to sampling when testing'

		negative_samples = []

		for u, i in tqdm(self.data[['user', 'item']].values):
			for _ in range(self.num_negative):
				j = np.random.randint(self.n_item)
				while (u, j) in self.train_matrix:
					j = np.random.randint(self.n_item)
				negative_samples.append([u, i, j])

		self.features = negative_samples

	def __len__(self):
		# Report the rows that can actually be indexed: the raw rows until
		# negative_sampling() has run, the sampled triples afterwards.
		return len(self.features)

	def __getitem__(self, idx):
		"""Return (user, item_i, item_j); in evaluation mode item_j repeats item_i."""
		row = self.features[idx]
		user = row[0]
		item_i = row[1]
		item_j = row[2] if self.is_training else row[1]
		return user, item_i, item_j

	def zero_based_mapping(self):
		"""Replace raw user/item ids in `self.data` with zero-based indices.

		Uses `self.user_map` / `self.item_map` when they were supplied, otherwise
		derives them from the ids present in `self.data`, sorted ascending so the
		numbering is deterministic. Sets `self.n_user` and `self.n_item` to the
		sizes of the mappings. Rows whose user or item is missing from a supplied
		mapping are dropped with a warning.
		"""
		# Map user and item ids to zero-based indices.
		if self.user_map is None:
			raw_users = sorted(self.data['user'].unique().tolist())
			self.user_map = {raw: idx for idx, raw in enumerate(raw_users)}
		if self.item_map is None:
			raw_items = sorted(self.data['item'].unique().tolist())
			self.item_map = {raw: idx for idx, raw in enumerate(raw_items)}

		self.n_user = len(self.user_map)
		self.n_item = len(self.item_map)

		# Series.map with a dict yields NaN for ids that are not in the mapping.
		mapped_users = self.data['user'].map(self.user_map)
		mapped_items = self.data['item'].map(self.item_map)
		known = mapped_users.notna() & mapped_items.notna()
		if not known.all():
			warnings.warn(
				'dropping %d row(s) whose user or item is absent from the supplied id mapping'
				% int((~known).sum())
			)

		self.data = self.data.loc[known].copy()
		self.data['user'] = mapped_users[known].astype(np.int64)
		self.data['item'] = mapped_items[known].astype(np.int64)

	def get_sparse_matrix(self):
		"""Build `self.train_matrix`: an (n_user, n_item) DOK matrix with 1.0 at every observed pair."""
		train_matrix = dok_matrix((self.n_user, self.n_item), dtype=np.float32)
		# Select the id columns explicitly so evaluation frames (which also carry
		# `rating`) can be unpacked as pairs too.
		for u, i in tqdm(self.data[['user', 'item']].values):
			train_matrix[u, i] = 1.0
		self.train_matrix = train_matrix

In [8]:
class BPR(nn.Module):
	"""Scores (user, item) pairs as the dot product of learned embeddings.

	Args:
		user_num: number of rows in the user embedding table.
		item_num: number of rows in the item embedding table.
		factor_num: dimensionality of every embedding vector.
	"""

	def __init__(self, user_num, item_num, factor_num):
		super().__init__()
		self.embed_user = nn.Embedding(num_embeddings=user_num, embedding_dim=factor_num)
		self.embed_item = nn.Embedding(num_embeddings=item_num, embedding_dim=factor_num)

		# Re-draw both tables from a zero-mean normal with std 0.01 (users first, then items).
		for table in (self.embed_user, self.embed_item):
			nn.init.normal_(table.weight, std=0.01)

	def forward(self, user, item_i, item_j):
		"""Return the scores of `item_i` and of `item_j` for `user`, in that order."""
		user_vec = self.embed_user(user)
		vec_i = self.embed_item(item_i)
		vec_j = self.embed_item(item_j)

		# Dot product over the last (factor) dimension.
		score_i = user_vec.mul(vec_i).sum(dim=-1)
		score_j = user_vec.mul(vec_j).sum(dim=-1)
		return score_i, score_j

In [9]:
data_dir = '/opt/ml/movie-recommendation/data/train/bpr/'

# Training split: builds its own interaction matrix from train.csv.
train_dataset = BPRDataset(data_path=data_dir + 'train.csv')

# Validation split: evaluation mode, reusing the training interaction matrix.
# NOTE(review): this dataset derives its zero-based ids from valid.csv on its own,
# so its indices are not guaranteed to match the ones used for training - confirm.
valid_dataset = BPRDataset(
    data_path=data_dir + 'valid.csv',
    train_matrix=train_dataset.train_matrix,
    is_training=False,
)

100%|██████████| 4136075/4136075 [00:47<00:00, 87190.27it/s]


In [11]:
# Use CUDA when it is available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BPR model sized to the training dataset, with 7 latent factors.
model = BPR(user_num=train_dataset.n_user, item_num=train_dataset.n_item, factor_num=7)
model = model.to(device)

# Training batches are shuffled; validation batches keep the dataset order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=1024, num_workers=4)
valid_loader = DataLoader(dataset=valid_dataset, shuffle=False, batch_size=1024, num_workers=4)

# Plain SGD over all model parameters.
optimizer = optim.SGD(params=model.parameters(), lr=1e-3)

In [12]:
def recall_at_k(actual, predicted, topk):
    """Mean recall@k over the users that have at least one relevant item.

    Args:
        actual: per-user collections of relevant items.
        predicted: per-user ranked recommendations, aligned with `actual` by position.
        topk: number of leading recommendations considered for each user.

    Returns:
        The average, over users with a non-empty `actual` entry, of
        |relevant & top-k recommended| / |relevant|. Returns 0.0 when no user
        has any relevant item (including empty input) instead of dividing by zero.
    """
    sum_recall = 0.0
    true_users = 0
    for i in range(len(predicted)):
        act_set = set(actual[i])
        if not act_set:
            # Users without ground truth do not contribute to the average.
            continue
        pred_set = set(predicted[i][:topk])
        sum_recall += len(act_set & pred_set) / float(len(act_set))
        true_users += 1

    if true_users == 0:
        return 0.0
    return sum_recall / true_users

In [31]:
for epoch in range(1):
	# ---- training ----
	model.train()
	# Draw the negatives for this epoch before the loader starts iterating.
	train_loader.dataset.negative_sampling()

	for user, item_i, item_j in train_loader:
		user = user.to(device)
		item_i = item_i.to(device)
		item_j = item_j.to(device)

		optimizer.zero_grad()
		prediction_i, prediction_j = model(user, item_i, item_j)
		# BPR loss: -sum(log(sigmoid(score_i - score_j))). logsigmoid evaluates
		# log(sigmoid(x)) directly, so a large negative difference does not
		# underflow to log(0) = -inf the way sigmoid().log() does.
		loss = -nn.functional.logsigmoid(prediction_i - prediction_j).sum()
		loss.backward()
		optimizer.step()

	# ---- validation ----
	model.eval()

	all_preds = []

	# No gradients are needed for scoring; without no_grad every batch would
	# keep its autograd graph alive inside all_preds.
	with torch.no_grad():
		for user, item_i, _ in valid_loader:
			user = user.to(device)
			item_i = item_i.to(device)

			# Evaluation samples repeat the item as the "negative"; only the first score is used.
			prediction_i, _ = model(user, item_i, item_i)

			all_preds.append(prediction_i.cpu())

	all_preds = torch.cat(all_preds).numpy()
	# The loader is not shuffled, so predictions line up with the dataset rows by position.
	valid_dataset.data['preds'] = all_preds

	# Separate name so the per-user grouping created for the train/test split
	# (user_group_dfs) is not overwritten by this loop.
	valid_user_groups = valid_dataset.data.groupby(by='user')

	top_k = 10
	predicted = []
	actual = []
	for _, user_df in valid_user_groups:
		# Recommend the top_k highest-scored candidates for this user.
		recommends = np.array(user_df.nlargest(top_k, ['preds'])['item'])
		predicted.append(recommends)
		# Ground truth: the candidates labelled rating == 1.
		ground_truth = np.array(user_df[user_df['rating'] == 1]['item'])
		actual.append(ground_truth)
	print(recall_at_k(actual, predicted, top_k))


0.17522156432571986
