In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split, Subset
from scipy.sparse import dok_matrix
import json
from typing import Tuple

In [73]:
def zero_based_mapping(data) :
    with open('/opt/ml/movie-recommendation/data/train/zero_mapping.json', 'r') as f:
        dict_data= json.load(f)

    data['user']  = data['user'].map(lambda x : dict_data['user'][str(x)])
    data['item']  = data['item'].map(lambda x : dict_data['item'][str(x)])
    
    return data

In [84]:
class FMDataset(Dataset):
    def __init__(self, data_path):
        self.data = pd.read_csv(data_path)

        self.data = zero_based_mapping(self.data)
        self.attributes = self.get_item_attributes()

        self.X = torch.tensor(np.array(self.data.loc[:, ['user', 'item']])).long()
        self.y = torch.tensor(np.array(self.data.loc[:, 'rating'])).long()

    def __getitem__(self, index):
        item_i = self.X[index, 1]
        X = torch.cat([self.X[index], self.attributes[item_i]])
        return X, self.y[index]

    def __len__(self):
        return len(self.data)

    def split_dataset(self, train_ratio=0.9) -> Tuple[Subset, Subset]:
        train_size = int(train_ratio * len(self.data))
        test_size = len(self.data) - train_size
        train_dataset, test_dataset = random_split(self, [train_size, test_size])
        
        return train_dataset, test_dataset
    
    def get_item_attributes(self):
        data_dir = '/opt/ml/movie-recommendation/data/train/'

        with open(data_dir+'item2attributes.json', 'r') as f:
            item2attributes = json.load(f)

        attributes = []

        for item in range(6807):    
            attribute = [0] * 18
            now_attribute = item2attributes[str(item)]
            for a in now_attribute[1:]:
                attribute[a] = 1
            attributes.append([now_attribute[0]]+attribute)
        
        return torch.tensor(attributes)

In [122]:
class EmbeddingLayer(nn.Module):
    def __init__(self, input_dim, embedding_dim, field_num, offsets):
        super(EmbeddingLayer, self).__init__()

        self.field_num = field_num
        self.offsets = torch.tensor(offsets, device='cuda')
        self.embedding = nn.Embedding(input_dim+1, embedding_dim, padding_idx=self.offsets[-1])

    def forward(self, x):
        one_hot_x = x[:,:self.field_num-1]
        multi_hot_x = x[:,self.field_num-1:].clone()

        embed_x = self.embedding(one_hot_x + self.offsets[:-1])

        sum_embed = []

        indices = multi_hot_x.nonzero()
        multi_hot_x[indices[:,0], indices[:,1]] = indices[:,1]+1
        embed = self.embedding(multi_hot_x + self.offsets[-1])
        sum_embed = torch.sum(embed, axis=1)

        embed_x= torch.cat([embed_x, sum_embed.unsqueeze(1)], axis=1)

        return embed_x

In [123]:
class FM(nn.Module):
    def __init__(self, input_dims, embedding_dim):
        super(FM, self).__init__()
        self.field_num = len(input_dims)
        total_input_dim = int(sum(input_dims))
        self.offsets = [0]+input_dims[:-1]

        self.bias = nn.Parameter(torch.zeros((1,)))
        self.fc = EmbeddingLayer(total_input_dim+1, 1, self.field_num, self.offsets)
        
        self.embedding = EmbeddingLayer(total_input_dim+1, embedding_dim, self.field_num, self.offsets)
        self.embedding_dim = self.field_num * embedding_dim

    def fm(self, x, embed_x):
        fm_y = self.bias + torch.sum(self.fc(x), dim=1)
        square_of_sum = torch.sum(embed_x, dim=1) ** 2         
        sum_of_square = torch.sum(embed_x ** 2, dim=1)
        fm_y += 0.5 * torch.sum(square_of_sum - sum_of_square, dim=1, keepdim=True)
        return fm_y

    def forward(self, x):
        #embedding component
        embed_x = self.embedding(x)
        #fm component
        fm_y = self.fm(x, embed_x).squeeze(1)

        y = torch.sigmoid(fm_y)
        return y

In [124]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

dataset = FMDataset('/opt/ml/movie-recommendation/data/train/fm/Negative Sampled Ratings.csv')

train_set, valid_set = dataset.split_dataset()

train_loader = DataLoader(
    train_set,
    batch_size=1024,
    num_workers=4,
    shuffle=True,
    drop_last=True,
)

valid_loader = DataLoader(
    valid_set,
    batch_size=1024,
    num_workers=4,
    shuffle=False,
    drop_last=True,
)

model = FM(
    input_dims=[31360,6807,12,18],
    embedding_dim=10
).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

In [125]:
epochs = 100

for epoch in range(epochs):
	model.train() 

	for X,y in train_loader:
		X = X.to(device)
		y = y.to(device)

		model.zero_grad()
		outs = model(X)
		loss = criterion(outs, y.float())
		loss.backward()
		optimizer.step()
	
	print(f"Calculating validation results... {epoch}/{epochs}")
	
	with torch.no_grad():
		model.eval()

		val_acc_items = []
		for X,y in valid_loader:
			X = X.to(device)
			y = y.to(device)

			outs = model(X)
			pred = torch.round(outs)

			acc_item = (y == pred).sum().item()
			val_acc_items.append(acc_item)
		
		val_acc = np.sum(val_acc_items) / len(valid_set)

	print(f"[Val] accuracy: {val_acc:4.4%}")

Calculating validation results... 0/100
[Val] accuracy: 76.4695%
Calculating validation results... 1/100
[Val] accuracy: 81.9613%
Calculating validation results... 2/100
[Val] accuracy: 84.6191%
Calculating validation results... 3/100
[Val] accuracy: 85.9275%
Calculating validation results... 4/100
[Val] accuracy: 86.6377%
Calculating validation results... 5/100
[Val] accuracy: 87.2488%
Calculating validation results... 6/100
[Val] accuracy: 87.7996%
Calculating validation results... 7/100
[Val] accuracy: 88.2393%
Calculating validation results... 8/100
[Val] accuracy: 88.6426%
Calculating validation results... 9/100
[Val] accuracy: 88.9504%
Calculating validation results... 10/100
[Val] accuracy: 89.1771%
Calculating validation results... 11/100
[Val] accuracy: 89.3829%
Calculating validation results... 12/100
[Val] accuracy: 89.5556%
Calculating validation results... 13/100
[Val] accuracy: 89.7112%
Calculating validation results... 14/100
[Val] accuracy: 89.7971%
Calculating validati

KeyboardInterrupt: 

In [126]:
model.fc

EmbeddingLayer(
  (embedding): Embedding(38199, 1, padding_idx=12)
)

In [127]:
model.embedding

EmbeddingLayer(
  (embedding): Embedding(38199, 10, padding_idx=12)
)

In [128]:
model.bias

Parameter containing:
tensor([-3.3509], device='cuda:0', requires_grad=True)

In [49]:
x,y = train_loader.dataset[0]
x=x.to(device)
x.view(1,-1).size()

torch.Size([1, 21])

In [50]:
embed_x = model.embedding(x.view(1,-1))
fm_y = model.bias + torch.sum(model.fc(x.view(1,-1)), dim=1)
square_of_sum = torch.sum(embed_x, dim=1) ** 2         
sum_of_square = torch.sum(embed_x ** 2, dim=1)
(square_of_sum - sum_of_square).size()

torch.Size([1, 100])

In [59]:
((square_of_sum - sum_of_square) + fm_y).size()

torch.Size([1, 100])