In [1]:
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd
import numpy as np
from scipy.sparse import dok_matrix
from scipy import sparse
import torch.optim as optim
from tqdm import tqdm
import os
from importlib import import_module

In [2]:
def preprocess(df):
    """Sort the interaction log and replace raw user/item values by dense ids.

    Args:
        df: DataFrame holding at least the columns ``user``, ``item`` and
            ``time``. The caller's frame is not modified.

    Returns:
        A 5-tuple ``(df, user_to_id, id_to_user, movie_to_id, id_to_movie)``.
        ``df`` is a copy sorted by ``user`` then ``time`` whose ``user`` and
        ``item`` columns contain ids starting at 0; the dicts translate between
        the raw values and those ids in both directions.
    """
    print("preprocessing..")
    df = df.sort_values(['user', 'time'], ascending = [True, True])

    def build_vocab(column):
        # Ids are handed out in order of first appearance in the sorted frame.
        forward = {raw: idx for idx, raw in enumerate(df[column].unique())}
        backward = {idx: raw for raw, idx in forward.items()}
        return forward, backward

    # Both vocabularies are built before either column is overwritten.
    user_to_id, id_to_user = build_vocab('user')
    movie_to_id, id_to_movie = build_vocab('item')

    df['user'] = df['user'].map(user_to_id)
    df['item'] = df['item'].map(movie_to_id)

    return df, user_to_id, id_to_user, movie_to_id, id_to_movie

In [3]:
# Directory that holds the raw training files.
data_dir = '/opt/ml/movie-recommendation/data/train/'

# Read the interaction log and immediately re-index users/items to dense ids.
df, user_to_id, id_to_user, movie_to_id, id_to_movie = preprocess(
    pd.read_csv(f'{data_dir}train_ratings.csv')
)
df

preprocessing..


Unnamed: 0,user,item,time
0,0,0,1230782529
1,0,1,1230782534
2,0,2,1230782539
3,0,3,1230782542
4,0,4,1230782563
...,...,...,...
5154466,31359,423,1260209449
5154467,31359,1491,1260209482
5154468,31359,331,1260209720
5154469,31359,733,1260209726


In [4]:
class FPMC(nn.Module):
    """FPMC-style scorer: user-item affinity plus averaged previous-item affinity.

    The score of an item for a user is
    ``<embed_UI[user], embed_IU[item]>`` plus the sum over the items in
    ``item_seq`` of ``<embed_LI[prev], embed_IL[item]>`` divided by ``seq_len``.

    Args:
        user_num: number of rows of the user embedding table.
        item_num: number of rows of the item embedding tables.
        factor_num: embedding dimension.

    NOTE(review): ``embed_LI`` reserves index 0 for padding, but the datasets
    in this notebook appear to feed 0-based item ids, so item 0 and padding
    share a row. Shifting sequence ids by one on the data side would separate
    them - confirm before changing, existing checkpoints depend on the layout.
    """

    def __init__(self, user_num, item_num, factor_num):
        super(FPMC, self).__init__()
        self.embed_UI = nn.Embedding(user_num, factor_num)
        self.embed_IU = nn.Embedding(item_num, factor_num)
        self.embed_LI = nn.Embedding(item_num+1, factor_num, padding_idx=0)
        self.embed_IL = nn.Embedding(item_num, factor_num)

        # Same initialisation order as before so seeded runs draw the same values.
        for table in (self.embed_UI, self.embed_IU, self.embed_LI, self.embed_IL):
            nn.init.normal_(table.weight, std=0.01)

        # normal_ also fills the padding row; it receives no gradient, so
        # without this reset padded positions would add fixed noise to scores.
        with torch.no_grad():
            self.embed_LI.weight[self.embed_LI.padding_idx].zero_()

    def forward(self, user, item, item_seq, seq_len):
        """Score one item per sample.

        Args:
            user: (batch_size,) user ids.
            item: (batch_size,) item ids to score.
            item_seq: (batch_size, sequence_len) zero-padded context item ids.
            seq_len: (batch_size,) divisor applied to the context term.

        Returns:
            (batch_size,) tensor of scores.
        """
        VUI = self.embed_UI(user) # (batch_size, factor_num)
        VIU = self.embed_IU(item) # (batch_size, factor_num)
        VLI = self.embed_LI(item_seq) # (batch_size, sequence_len, factor_num)
        VIL = self.embed_IL(item) # (batch_size, factor_num)

        VUI_m_VIU = torch.sum(VUI * VIU, dim=1) # (batch_size,)
        # (batch_size, sequence_len, 1) summed over the sequence -> (batch_size, 1)
        VLI_m_VIL = torch.bmm(VLI, VIL.unsqueeze(2)).sum(dim=1) / seq_len.unsqueeze(1)

        # Squeeze only the trailing singleton so a batch of one keeps its batch axis.
        return VUI_m_VIU + VLI_m_VIL.squeeze(1)

    def predict(self, user, items, item_seq, seq_len):
        """Score several candidate items per sample.

        Args:
            user: (batch_size,) user ids.
            items: (batch_size, item_num) candidate item ids.
            item_seq: (batch_size, sequence_len) zero-padded context item ids.
            seq_len: (batch_size,) divisor applied to the context term.

        Returns:
            (batch_size, item_num) tensor of scores.
        """
        VUI = self.embed_UI(user) # (batch_size, factor_num)
        VIU = self.embed_IU(items) # (batch_size, item_num, factor_num)
        VLI = self.embed_LI(item_seq) # (batch_size, sequence_len, factor_num)
        VIL = self.embed_IL(items) # (batch_size, item_num, factor_num)

        # Explicit dim: a bare squeeze() would also drop a singleton item axis
        # and make the addition below broadcast to (batch_size, batch_size).
        VUI_m_VIU = torch.bmm(VIU, VUI.unsqueeze(2)).squeeze(2) # (batch_size, item_num)
        VLI_m_VIL = torch.bmm(VIL, VLI.transpose(1, 2)).sum(dim=2) / seq_len.unsqueeze(1)

        return VUI_m_VIU + VLI_m_VIL

In [6]:
class SequentialDataset(Dataset):
    """Per-user interaction dataset with a training and an evaluation mode.

    Training mode (``is_training=True``): one sample per entry of
    ``self.features``, which ``negative_sampling()`` must fill first. Each
    sample pairs an observed (user, item) with one sampled negative item and
    uses the user's other items as context.

    Evaluation mode: one sample per user, containing the user's full item
    list and every item id in ``[0, n_item)`` as scoring candidates.

    Args:
        data: DataFrame with ``user`` and ``item`` columns.
        num_negative: negatives drawn per observed interaction.
        is_training: selects the mode described above.
    """

    def __init__(self, data, num_negative=10, is_training=False):
        self.data = data[['user', 'item']]
        self.n_user = self.data['user'].nunique()
        self.n_item = self.data['item'].nunique()
        self.num_negative = num_negative
        self.is_training = is_training

        # Item list per user, in the row order of `data`.
        self.user2seq = {
            user: list(item_seq)
            for user, item_seq in self.data.groupby(by='user')['item']
        }
        # Set view of the same lists for constant-time membership tests.
        self._user2items = {user: set(seq) for user, seq in self.user2seq.items()}

        if not self.is_training:
            self.users = list(self.user2seq.keys())
            self.item_seqs = list(self.user2seq.values())

    def negative_sampling(self):
        """Draw ``num_negative`` unseen items per interaction into ``self.features``.

        Each entry is ``[user, positive_item, negative_item]``. Items are drawn
        uniformly from ``[0, n_item)`` and redrawn until the user has not
        interacted with them, so a user who has seen every item would loop
        forever.
        """
        assert self.is_training, 'no need to sampling when testing'
        negative_samples = []

        for u, i in self.data.values:
            seen = self._user2items[u]
            for _ in range(self.num_negative):
                j = np.random.randint(self.n_item)
                while j in seen:
                    j = np.random.randint(self.n_item)
                negative_samples.append([u, i, j])
        self.features = negative_samples

    def __len__(self):
        if self.is_training:
            return self.num_negative * len(self.data)
        return self.n_user

    def __getitem__(self, idx):
        """Return the sample dict for ``idx`` (keys: user, pos_item, neg_item, item_seq, seq_len)."""
        if self.is_training:
            user, pos_item, neg_item = self.features[idx]
            # Context = the user's distinct items without the positive one.
            context = list(self._user2items[user] - {pos_item})
            return {"user": torch.tensor(user),
                    "pos_item": torch.tensor(pos_item),
                    "neg_item": torch.tensor(neg_item),
                    # Explicit dtype: an empty list would otherwise become a
                    # float tensor, which cannot be used as embedding indices.
                    "item_seq": torch.tensor(context, dtype=torch.long),
                    "seq_len": torch.tensor(len(self.user2seq[user]) - 1),}

        history = self.item_seqs[idx]
        return {"user": torch.tensor(self.users[idx]),
                "pos_item": torch.arange(0, self.n_item),
                "neg_item": torch.tensor([0]),
                "item_seq": torch.tensor(history, dtype=torch.long),
                "seq_len": torch.tensor(len(history)),}

In [5]:
def make_batch(samples):
    """Collate per-sample dicts into one batch dict of tensors.

    ``user``, ``pos_item`` and ``neg_item`` are stacked, ``item_seq`` is
    right-padded with zeros to the longest sequence in the batch, and every
    ``seq_len`` is incremented by one before stacking.

    The key order of the returned dict (user, pos_item, neg_item, item_seq,
    seq_len) is relied upon by callers that unpack ``batch.items()``
    positionally.
    """
    batch = {}
    for field in ('user', 'pos_item', 'neg_item'):
        batch[field] = torch.stack([entry[field] for entry in samples]).contiguous()

    batch['item_seq'] = torch.nn.utils.rnn.pad_sequence(
        [entry['item_seq'] for entry in samples], batch_first=True
    ).contiguous()
    # NOTE(review): the +1 presumably keeps the divisor in the model non-zero
    # for empty sequences - confirm.
    batch['seq_len'] = torch.stack([entry['seq_len'] + 1 for entry in samples]).contiguous()
    return batch

In [8]:
# Training view of the interactions: num_negative=10 negatives per observed pair.
train_dataset = SequentialDataset(df, num_negative=10, is_training=True)
train_loader = DataLoader(
    train_dataset, batch_size=1024, shuffle=True, drop_last=True,
    collate_fn=make_batch, pin_memory=True, num_workers=4,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 10 latent factors; the model is moved to `device` and then wrapped in DataParallel.
model = nn.DataParallel(
    FPMC(train_dataset.n_user, train_dataset.n_item, 10).to(device)
)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [11]:
epochs = 1
log_interval = 500

for epoch in range(epochs):
    model.train()
    loss_value = 0

    # Negatives are re-drawn at the start of every epoch.
    print('Negative Sampling ...', end=' ')
    train_loader.dataset.negative_sampling()
    print('Done!!')

    for idx, batch in enumerate(train_loader):
        # Look fields up by name rather than relying on the dict's key order.
        user = batch['user'].to(device)
        pos_item = batch['pos_item'].to(device)
        neg_item = batch['neg_item'].to(device)
        item_seq = batch['item_seq'].to(device)
        seq_len = batch['seq_len'].to(device)

        model.zero_grad()
        prediction_i = model(user, pos_item, item_seq, seq_len)
        prediction_j = model(user, neg_item, item_seq, seq_len)
        # Pairwise ranking loss -log(sigmoid(pos - neg)), summed over the batch.
        # logsigmoid stays finite where sigmoid().log() underflows to -inf.
        loss = -nn.functional.logsigmoid(prediction_i - prediction_j).sum()
        loss.backward()
        optimizer.step()

        loss_value += loss.item()

        if (idx + 1) % log_interval == 0:
            # Mean of the per-batch summed loss over the last log_interval steps.
            train_loss = loss_value / log_interval
            print(
                f"Epoch[{epoch}/{epochs}]({idx + 1}/{len(train_loader)}) || "
                f"training loss {train_loss:4.7} ||"
            )
            loss_value = 0


Negative Sampling ... Done!!
Epoch[0/10](500/50336) || training loss 498.909 ||
Epoch[0/10](1000/50336) || training loss 310.1986 ||
Epoch[0/10](1500/50336) || training loss 296.0072 ||
Epoch[0/10](2000/50336) || training loss 291.3797 ||
Epoch[0/10](2500/50336) || training loss 283.443 ||
Epoch[0/10](3000/50336) || training loss 277.427 ||
Epoch[0/10](3500/50336) || training loss 267.8326 ||
Epoch[0/10](4000/50336) || training loss 260.818 ||
Epoch[0/10](4500/50336) || training loss 255.1404 ||
Epoch[0/10](5000/50336) || training loss 247.598 ||
Epoch[0/10](5500/50336) || training loss 242.8434 ||
Epoch[0/10](6000/50336) || training loss 237.694 ||
Epoch[0/10](6500/50336) || training loss 233.9127 ||
Epoch[0/10](7000/50336) || training loss 229.0778 ||
Epoch[0/10](7500/50336) || training loss 225.8099 ||
Epoch[0/10](8000/50336) || training loss 223.1073 ||
Epoch[0/10](8500/50336) || training loss 218.594 ||
Epoch[0/10](9000/50336) || training loss 215.8828 ||
Epoch[0/10](9500/50336) |

KeyboardInterrupt: 

In [65]:
# Evaluation view (is_training defaults to False): one sample per user.
test_dataset = SequentialDataset(df)
# batch_size=1 keeps every user's sequence unpadded.
test_loader = DataLoader(
    test_dataset, batch_size=1, shuffle=False, drop_last=False, collate_fn=make_batch,
)

In [67]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fresh, unwrapped FPMC with the same sizes as the training data and 10 factors.
model = FPMC(train_loader.dataset.n_user, train_loader.dataset.n_item, 10).to(device)

# Restore the saved weights onto the selected device; the cell displays the load result.
model.load_state_dict(torch.load('./best.pth', map_location=device))

<All keys matched successfully>

In [77]:
sub_u = []
sub_i = []

print("Calculating inference results...", end=' ')

model.eval()
with torch.no_grad():
    for batch in tqdm(test_loader):
        # test_loader uses batch_size=1, so each batch is exactly one user.
        user = batch['user'].to(device)
        pos_item = batch['pos_item'].to(device)
        item_seq = batch['item_seq'].to(device)
        seq_len = batch['seq_len'].to(device)

        prediction = model.predict(user, pos_item, item_seq, seq_len).squeeze()

        # Exclude already-seen items by masking their scores, instead of
        # testing every ranked item against the sequence tensor one by one.
        prediction[item_seq.flatten()] = float('-inf')
        scores, ranking = torch.topk(prediction, min(10, prediction.numel()))
        # If fewer than 10 unseen items exist, drop the masked ones that slipped in.
        ranking = ranking[scores != float('-inf')]

        u = id_to_user[int(user)]
        for item_id in ranking.tolist():
            sub_u.append(u)
            sub_i.append(id_to_movie[item_id])

print('Done!!')

Calculating inference results... 

 73%|███████▎  | 22771/31360 [03:31<01:14, 114.93it/s]

In [None]:
# One row per recommended (user, item) pair, in the order they were produced.
submission = dict(user=sub_u, item=sub_i)
submission_df = pd.DataFrame(submission)
submission_df.to_csv('/opt/ml/movie-recommendation/BPR/output/fpmc.csv', index=False)

In [6]:
class SequentialDatasetv2(Dataset):
    """Windowed variant: the context is the items around a position.

    Training mode: one sample per entry of ``self.features`` (filled by
    ``negative_sampling()``); the context is up to ``window_size // 2`` items
    before and after the positive item in the user's list.

    Evaluation mode: only the first user found in ``data`` is used. There is
    one sample per position of that user's list, with the window around (and
    including) that position as context and all ``n_total_items`` item ids as
    candidates.

    Args:
        data: DataFrame with ``user`` and ``item`` columns.
        num_negative: negatives drawn per observed interaction.
        window_size: total window width; half of it is taken on each side.
        is_training: selects the mode described above.
        n_total_items: number of candidate ids scored in evaluation mode.
            Needed because ``data`` may hold a single user, so ``n_item`` is
            not the catalogue size. The default is the value that used to be
            hard-coded.
    """

    def __init__(self, data, num_negative=10, window_size=10, is_training=False,
                 n_total_items=6807):
        self.data = data[['user', 'item']]
        self.n_user = self.data['user'].nunique()
        self.n_item = self.data['item'].nunique()
        self.num_negative = num_negative
        self.is_training = is_training
        self.window_size = window_size//2
        self.n_total_items = n_total_items

        # Item list per user, in the row order of `data`.
        self.user2seq = {
            user: list(item_seq)
            for user, item_seq in self.data.groupby(by='user')['item']
        }
        # Set view of the same lists for constant-time membership tests.
        self._user2items = {user: set(seq) for user, seq in self.user2seq.items()}

        if not self.is_training:
            self.user_id = list(self.user2seq.keys())[0]
            self.item_seq = list(self.user2seq.values())[0]

    def negative_sampling(self):
        """Draw ``num_negative`` unseen items per interaction into ``self.features``.

        Each entry is ``[user, positive_item, negative_item]``; a user who has
        interacted with every item would make the redraw loop spin forever.
        """
        assert self.is_training, 'no need to sampling when testing'
        negative_samples = []

        for u, i in self.data.values:
            seen = self._user2items[u]
            for _ in range(self.num_negative):
                j = np.random.randint(self.n_item)
                while j in seen:
                    j = np.random.randint(self.n_item)
                negative_samples.append([u, i, j])
        self.features = negative_samples

    def __len__(self):
        if self.is_training:
            return self.num_negative * len(self.data)
        return len(self.item_seq)

    def __getitem__(self, idx):
        """Return the sample dict for ``idx`` (keys: user, pos_item, neg_item, item_seq, seq_len)."""
        half = self.window_size

        if self.is_training:
            user, pos_item, neg_item = self.features[idx]
            user_seq = self.user2seq[user]
            index = self.find_index(user_seq, pos_item)

            # Up to `half` items on each side of the positive, excluding it.
            item_seq = user_seq[max(0, index - half):index] + user_seq[index + 1:index + 1 + half]

            return {"user": torch.tensor(user),
                    "pos_item": torch.tensor(pos_item),
                    "neg_item": torch.tensor(neg_item),
                    # Explicit dtype keeps an empty window usable as embedding indices.
                    "item_seq": torch.tensor(item_seq, dtype=torch.long),
                    "seq_len": torch.tensor(len(item_seq)),}

        # Evaluation window is centred on idx and includes the item at idx.
        item_seq = self.item_seq[max(0, idx - half):idx + 1 + half]

        return {"user": torch.tensor(self.user_id),
                "pos_item": torch.arange(0, self.n_total_items),
                "neg_item": torch.tensor([0]),
                "item_seq": torch.tensor(item_seq, dtype=torch.long),
                "seq_len": torch.tensor(len(item_seq)),}

    def find_index(self, seq, item):
        """Return the position of the first ``item`` in ``seq``, or ``len(seq)`` if absent."""
        for position, candidate in enumerate(seq):
            if candidate == item:
                return position
        return len(seq)

In [6]:
class SequentialDatasetv3(Dataset):
    """Prefix variant: the context is everything before the positive item.

    Training mode: one sample per entry of ``self.features`` (filled by
    ``negative_sampling()``); the context is the user's items that precede the
    positive item. When the positive is the user's first item, the user's
    second item is used instead, if there is one.

    Evaluation mode: only the first user found in ``data`` is used. There is
    one sample per position ``idx`` of that user's list with the prefix
    ``[:idx+1]`` as context (``[:2]`` for ``idx == 0``) and all
    ``n_total_items`` item ids as candidates.

    Args:
        data: DataFrame with ``user`` and ``item`` columns.
        num_negative: negatives drawn per observed interaction.
        is_training: selects the mode described above.
        n_total_items: number of candidate ids scored in evaluation mode; the
            default is the value that used to be hard-coded.
    """

    def __init__(self, data, num_negative=10, is_training=False, n_total_items=6807):
        self.data = data[['user', 'item']]
        self.n_user = self.data['user'].nunique()
        self.n_item = self.data['item'].nunique()
        self.num_negative = num_negative
        self.is_training = is_training
        self.n_total_items = n_total_items

        # Item list per user, in the row order of `data`.
        self.user2seq = {
            user: list(item_seq)
            for user, item_seq in self.data.groupby(by='user')['item']
        }
        # Set view of the same lists for constant-time membership tests.
        self._user2items = {user: set(seq) for user, seq in self.user2seq.items()}

        if not self.is_training:
            self.user = list(self.user2seq.keys())[0]
            self.item_seq = list(self.user2seq.values())[0]

    def negative_sampling(self):
        """Draw ``num_negative`` unseen items per interaction into ``self.features``.

        Each entry is ``[user, positive_item, negative_item]``; a user who has
        interacted with every item would make the redraw loop spin forever.
        """
        assert self.is_training, 'no need to sampling when testing'
        negative_samples = []

        for u, i in self.data.values:
            seen = self._user2items[u]
            for _ in range(self.num_negative):
                j = np.random.randint(self.n_item)
                while j in seen:
                    j = np.random.randint(self.n_item)
                negative_samples.append([u, i, j])
        self.features = negative_samples

    def __len__(self):
        if self.is_training:
            return self.num_negative * len(self.data)
        return len(self.item_seq)

    def __getitem__(self, idx):
        """Return the sample dict for ``idx`` (keys: user, pos_item, neg_item, item_seq, seq_len)."""
        if self.is_training:
            user, pos_item, neg_item = self.features[idx]
            user_seq = self.user2seq[user]

            # Everything that comes before the first occurrence of the positive.
            item_seq = []
            for item in user_seq:
                if item == pos_item:
                    break
                item_seq.append(item)

            # Positive is the first item: borrow the next one as context.
            # Guarded because a single-item user has no second item to borrow.
            if not item_seq and len(user_seq) > 1:
                item_seq.append(user_seq[1])

            return {"user": torch.tensor(user),
                    "pos_item": torch.tensor(pos_item),
                    "neg_item": torch.tensor(neg_item),
                    # Explicit dtype keeps an empty context usable as embedding indices.
                    "item_seq": torch.tensor(item_seq, dtype=torch.long),
                    "seq_len": torch.tensor(len(item_seq)),}

        prefix = self.item_seq[:idx+1] if idx > 0 else self.item_seq[:2]
        return {"user": torch.tensor(self.user),
                "pos_item": torch.arange(0, self.n_total_items),
                "neg_item": torch.tensor([0]),
                "item_seq": torch.tensor(prefix, dtype=torch.long),
                # Actual prefix length, so a single-item user reports 1 rather than 2.
                "seq_len": torch.tensor(len(prefix)),}

In [7]:
data_dir = '/opt/ml/movie-recommendation/data/train/'

# The training-mode dataset is only used here for its user/item counts.
train_dataset = SequentialDatasetv3(df, num_negative=10, is_training=True)
# One sub-frame per user, consumed by the inference loop.
user_item_dfs = df.groupby('user')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Unwrapped FPMC with 32 latent factors.
model = FPMC(train_dataset.n_user, train_dataset.n_item, 32).to(device)

# Restore the saved epoch-9 weights onto the selected device.
model_dir = '/opt/ml/movie-recommendation/BPR/model/exp/epoch-9.pth'
model.load_state_dict(torch.load(model_dir, map_location=device))
batch_size = 512

In [11]:
sub_u = []
sub_i = []

print("Calculating inference results...", end=' ')

model.eval()
for user_id, item_df in tqdm(user_item_dfs):
    # Evaluation-mode dataset over this single user's interactions.
    test_dataset = SequentialDatasetv2(item_df)
    test_loader = DataLoader(
        test_dataset,
        batch_size=256,
        shuffle=False,
        drop_last=False,
        collate_fn=make_batch,
    )
    with torch.no_grad():
        # Per-item scores summed over every window of the user's history.
        # Starting from the first batch avoids hard-coding the item count.
        prediction = None

        for batch in test_loader:
            user = batch['user'].to(device)
            pos_item = batch['pos_item'].to(device)
            item_seq = batch['item_seq'].to(device)
            seq_len = batch['seq_len'].to(device)

            output = model.predict(user, pos_item, item_seq, seq_len)
            batch_scores = output.sum(axis=0)
            prediction = batch_scores if prediction is None else prediction + batch_scores

        # Exclude already-seen items by masking their scores, instead of
        # testing every ranked item against the Python list one by one.
        seen = torch.tensor(test_dataset.item_seq, dtype=torch.long, device=device)
        prediction[seen] = float('-inf')
        scores, ranking = torch.topk(prediction, min(10, prediction.numel()))
        # If fewer than 10 unseen items exist, drop the masked ones that slipped in.
        ranking = ranking[scores != float('-inf')]

    u = id_to_user[int(user_id)]
    for item_id in ranking.tolist():
        sub_u.append(u)
        sub_i.append(id_to_movie[item_id])

print('Done!!')

Calculating inference results... 

100%|██████████| 31360/31360 [1:16:03<00:00,  6.87it/s]  

Done!!





In [12]:
# One row per recommended (user, item) pair, in the order they were produced.
submission = dict(user=sub_u, item=sub_i)
submission_df = pd.DataFrame(submission)
submission_df.to_csv('/opt/ml/movie-recommendation/BPR/output/fpmcV3.csv', index=False)