In [2]:
import os
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import pickle

## Model Define

In [3]:
class FFMLayer(nn.Module):
    def __init__(self, field_dims, factor_dim):
        '''
        Parameter
            field_dims: List of field dimensions 
                        The sum become the entire dimension of input (in sparse feature)
                        The length become the number of fields
            factor_dim: Factorization dimension
        '''
        super().__init__()
        self.num_fields = len(field_dims)
        self.input_dim = sum(field_dims) #.to(device)
        self.embedding = nn.ModuleList([
            # FILL HERE : Fill in the places `None` with                                      #
            #             either `factorization_dim`, `self.num_fields`, or `self.input_dim`. #
            nn.Embedding(
                self.input_dim, factor_dim
            ) for _ in range(self.num_fields)
        ])

    def forward(self, x):
        '''
        Parameter
            x: Long tensor of size "(batch_size, num_fields)"
               Each value of variable is an index calculated including the dimensions up to the previous variable.
               for instance, [gender:male, age:20, is_student:True] 
                             -> [1,0, 0,1,0,0,0,0, 0,1] in one-hot encoding
                             -> x = [0,3,9].
        Return
            y: Float tensor of size "(batch_size)"
        '''
        
        xv = [self.embedding[f](x) for f in range(self.num_fields)]
        
        y = list()
        for f in range(self.num_fields):
            for g in range(f + 1, self.num_fields):
                y.append(xv[f][:, g] *  xv[g][:, f])
        y = torch.stack(y, dim=1)
        
        return torch.sum(y, dim=(2,1))

class FieldAwareFM(nn.Module):
    def __init__(self, field_dims, factor_dim):
        '''
        Parameter
            field_dims: List of field dimensions
            factor_dim: Factorization dimension
        '''
        super().__init__()
        self.input_dim = sum(field_dims)
        self.encoding_dims = np.concatenate([[0], np.cumsum(field_dims)[:-1]])
        self.linear = nn.Linear(self.input_dim, 1, bias=True) # FILL HERE : Fill in the places `None` #
        self.ffm = FFMLayer(field_dims, factor_dim) # FILL HERE : Fill in the places `None` #
        
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Embedding):
                nn.init.xavier_uniform_(m.weight)
            elif isinstance(m, FFMLayer):
                nn.init.normal_(m.v, 0, 0.01)

    def forward(self, x):
        '''
        Parameter
            x: Long tensor of size "(batch_size, num_fields)"
                x_multihot: Multi-hot coding of x. size "(batch_size, self.input_dim)"
        
        Return
            y: Float tensor of size "(batch_size)"
        '''
        dims = torch.tensor(self.input_dim).to(device)
        x = x + x.new_tensor(self.encoding_dims).unsqueeze(0)
        x_multihot = torch.zeros(x.size(0), dims).to(device).scatter_(1, x, 1.)
        
        y = self.linear(x_multihot).squeeze(1) + self.ffm(x) # FILL HERE : Use `self.linear()` and `self.ffm()` #

        return y

### Data Load

In [4]:
inference_df = pd.read_csv("inference_base.csv")

# load data
with open('user_dict.pickle', 'rb') as fr:
    user_dict = pickle.load(fr)

# load data
with open('item_dict.pickle', 'rb') as fr:
    item_dict = pickle.load(fr)
print("users :", len(user_dict)) #31360
print("items :", len(item_dict)) #6807

inference_df.sort_values(by="user",axis = 0, inplace = True)

users : 31360
items : 6807


In [10]:
a = list(user_dict.values())
assert a[0] == 1

AssertionError: 

In [1]:
# cuda setting
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)

batch_size = 1024

inference_dataset = TensorDataset(torch.LongTensor(np.array(inference_df)))
inference_dataloader = DataLoader(inference_dataset,
                                  batch_size=batch_size,
                                  pin_memory=use_cuda,
                                  drop_last=False,
                                  shuffle=False,
                                  num_workers = 4,
                                  )

print("dataset length :", len(inference_dataloader))

NameError: name 'torch' is not defined

## Model load

In [None]:
model = torch.load(f"FFM.pth").to(device)
model.eval()

## Inference

In [None]:
user_list = list()
score_list = list()
item_list = list()

with torch.no_grad():
    cnt = 0
    for batch in tqdm(inference_dataloader):
        x = batch[0].to(device) 
        # print ("[DEBUG] model input x-----")
        
        # print ("--------------------------")
        output = model(x) #[B] ///x 에 대한 점수
        #idx = torch.where(output >= 1)[0] # 점수가 1 이상인 index
        
        info = x.cpu()
        #scores = output.index_select(0,idx).cpu().tolist()
        scores = output.cpu().tolist()
        users = info[:,0].tolist()
        items = info[:,1].tolist()

        user_list += users
        item_list += items
        score_list += scores

np_user_list = np.array(user_list)
np_item_list = np.array(item_list)
np_score_list = np.array(score_list)

## Top 10 추출

In [None]:
users = list()
items = list()
for key, user_code in tqdm(user_dict.items()):
    u_code = int(user_code)

    idx = np.where(np_user_list == u_code)[0].tolist()
    
    item_score = np_score_list.take(idx) #user code 에 해당하는 item_score
    item_ = np_item_list.take(idx) # user code에 해당하는 item
    top10_idx = np.argpartition(item_score, -10)[-10:] # 상위 10개 index 추출

    top10_item = [int(item_dict[code]) for code in item_.take(top10_idx)] #top 10(item code -> item id)
    user_id = [user_dict[u_code]] * 10

    users += user_id
    items += top10_item

result = np.vstack((users,items)).T

### submission 생성

In [None]:
info = pd.DataFrame(result, columns=['user','item'])
info.to_csv("FFM_submission.csv",index=False)

### Recall@10 테스트

In [None]:
import json

print("testing recall@10...")
# 학습에 사용된 user만 uniq_user에 저장
uniq_user = list(user_dict.values())
print (f"Number of users : {len(uniq_user)}")

with open("/opt/ml/input/workspace/BERT4Rec/data/answers.json", "r") as json_file: #answer.json 경로 지정
    answer = json.load(json_file)

# movielens-20m과 submission을 비교하여 Recall@10 값 계산
submission_df = pd.read_csv(f"submission.csv")
recall_result = []

# 각 유저마다 recall@10 계산하여 list에 저장
for user in tqdm(uniq_user):
    submission_by_user = submission_df[submission_df['user'] == user]['item']

    hit = 0
    for item in submission_by_user:
        if item in answer[str(user)]:
            hit += 1

        recall_result.append(hit / 10)

# 전체 유저의 Recall@10의 평균 출력
print (f"Predicted submission result of Recall@10 = {np.average(recall_result)}")