In [2]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import pickle

In [3]:
###############################################################################
# 주어진 결과와 정확히 비교하기 위한 random seed 고정
###############################################################################

seed = 42  # 바꾸지 마시오!
random.seed(seed)
np.random.seed(seed)

In [4]:
base_path = os.path.join(os.curdir, 'data/train/')
data_path = os.path.join(base_path, 'rating.csv')
genre_path = os.path.join(base_path, "genres.tsv")
writer_path = os.path.join(base_path, "writers.tsv")
director_path = os.path.join(base_path, "directors.tsv")

director_path

'./data/train/directors.tsv'

In [7]:
# 평점 데이터
rating_df = pd.read_csv(data_path,index_col = 0)
rating_df.rating = 1
rating_df

Unnamed: 0,user,item,rating
0,11,4643,1
1,11,170,1
2,11,531,1
3,11,616,1
4,11,2140,1
...,...,...,...
5154466,138493,44022,1
5154467,138493,4958,1
5154468,138493,68319,1
5154469,138493,40819,1


In [6]:
rating_df.user = rating_df.user.astype('category').cat.codes
rating_df

Unnamed: 0,user,item,rating
0,0,4643,1
1,0,170,1
2,0,531,1
3,0,616,1
4,0,2140,1
...,...,...,...
5154466,31359,44022,1
5154467,31359,4958,1
5154468,31359,68319,1
5154469,31359,40819,1


In [47]:
assert rating_user_dict == user_dict

In [50]:
def get_popular_item(rating_df):
    df = rating_df.pivot_table(index=['item'], aggfunc='size').sort_values(ascending=True)
    df = df[df > 1000]
    ret = df.index.to_numpy()
    return set(ret.flatten())

popular_item = get_popular_item(rating_df)

{0,
 1,
 4098,
 4099,
 6148,
 2053,
 5,
 4101,
 2056,
 9,
 10,
 4107,
 2060,
 4108,
 6158,
 6159,
 16,
 4113,
 18,
 4115,
 2068,
 20,
 4116,
 2071,
 24,
 2069,
 2067,
 6171,
 28,
 6173,
 6172,
 6175,
 2080,
 33,
 34,
 32,
 31,
 2086,
 41,
 42,
 43,
 2093,
 46,
 6190,
 49,
 53,
 69,
 72,
 2121,
 73,
 6219,
 2124,
 2125,
 15,
 77,
 76,
 87,
 2135,
 2137,
 89,
 2138,
 2140,
 92,
 94,
 6239,
 101,
 103,
 105,
 2156,
 109,
 2157,
 6257,
 116,
 2164,
 2166,
 118,
 2170,
 2176,
 6275,
 133,
 136,
 137,
 2186,
 2187,
 140,
 2192,
 6290,
 2195,
 146,
 149,
 2199,
 2200,
 154,
 2202,
 2204,
 2205,
 6300,
 158,
 6304,
 2208,
 162,
 2211,
 2212,
 165,
 161,
 2217,
 6315,
 6317,
 6318,
 6319,
 174,
 177,
 178,
 179,
 181,
 6327,
 6329,
 185,
 187,
 186,
 191,
 192,
 193,
 194,
 2243,
 4296,
 2249,
 201,
 203,
 6348,
 4301,
 2251,
 207,
 208,
 2257,
 210,
 6352,
 205,
 209,
 2262,
 215,
 6360,
 2264,
 214,
 218,
 220,
 221,
 4321,
 226,
 229,
 230,
 231,
 237,
 239,
 6388,
 249,
 2301,
 2304,
 260,


In [76]:
def get_inference_df(rating_df):
    user_group_dfs = list(rating_df.groupby('user')['item'])
    popular_item = get_popular_item(rating_df)
    dfs = np.array([]).reshape(0, 2)
    user_list = list()
    target_list = list()
    for user, user_seen_list in tqdm(user_group_dfs):

        user_seen_set = set(user_seen_list)
        
        targets = list(popular_item - user_seen_set)

        target_len = len(targets)

        users = [user] * target_len
        
        user_list += users
        target_list += targets

    dfs = np.vstack((user_list,target_list)).T
    inference_df = pd.DataFrame(data=dfs, columns=["user", "item"])
    return inference_df

inf_df = get_inference_df(rating_df)


100%|██████████| 31360/31360 [00:04<00:00, 7502.93it/s]


In [81]:
inf_df.to_csv("inf_df.csv",index=False)

# Negative sample

In [8]:
items = set(rating_df.loc[:, 'item'])
user_group_dfs = list(rating_df.groupby('user')['item'])
    
user_neg_dfs = np.array([]).reshape(0, 3)
user_group_dfs

num_negative = 100

for u, user_seen_list in tqdm(user_group_dfs):
        
    #-- User가 시청한 영화 집합
    user_seen_set = set(user_seen_list)
        
    #-- 시청한 영화를 제외한 num_negative개의 영화 선택
    i_user_neg_item = np.random.choice(list(items - user_seen_set), num_negative, replace=False)
        
        #-- negative sample item's rating = 0
    neg_users = np.full(num_negative, u)
    neg_ratings = np.zeros(num_negative)
        
    #-- user u 에 대한 negative sample 결과 생성 : ["neg_user", "neg_item", "neg_rate"]
    neg_results = np.vstack((neg_users, i_user_neg_item, neg_ratings)).T
    user_neg_dfs = np.vstack((user_neg_dfs, neg_results))
    
neg_rating_df = pd.DataFrame(data=user_neg_dfs, columns=["user", "item", "rating"])
rating_df = pd.concat([rating_df, neg_rating_df], axis=0, sort=False)

100%|██████████| 31360/31360 [10:38<00:00, 49.10it/s]


In [9]:
rating_df.to_csv('rating_negative_samples_100.csv', index= False)

In [11]:
print(len(rating_df)) #before 6722471 
rating_df.head(20)

8290471


Unnamed: 0,user,item,rating
0,11.0,4643.0,1.0
1,11.0,170.0,1.0
2,11.0,531.0,1.0
3,11.0,616.0,1.0
4,11.0,2140.0,1.0
5,11.0,2722.0,1.0
6,11.0,2313.0,1.0
7,11.0,2688.0,1.0
8,11.0,2428.0,1.0
9,11.0,3113.0,1.0


In [248]:
len(rating_df)
rating_df

Unnamed: 0,user,item,rating
0,11.0,4643.0,1.0
1,11.0,170.0,1.0
2,11.0,531.0,1.0
3,11.0,616.0,1.0
4,11.0,2140.0,1.0
...,...,...,...
1567995,138493.0,39715.0,0.0
1567996,138493.0,718.0,0.0
1567997,138493.0,6436.0,0.0
1567998,138493.0,3189.0,0.0


In [12]:
# 장르 데이터
gener_df = pd.read_csv(genre_path,delimiter="\t")

# 작가 데이터
writer_df = pd.read_csv(writer_path, delimiter="\t")

# 감독 데이터
director_df = pd.read_csv(director_path, delimiter="\t")

In [13]:
writer_df = writer_df.drop_duplicates(subset=['item'])
writer_np = np.unique(writer_df.writer.to_numpy())
len(writer_np)

2027

In [14]:
gener_df["genre"] = gener_df["genre"].astype("category")
gener_df

Unnamed: 0,item,genre
0,318,Crime
1,318,Drama
2,2571,Action
3,2571,Sci-Fi
4,2571,Thriller
...,...,...
15928,109850,Drama
15929,8605,Action
15930,8605,Comedy
15931,3689,Comedy


In [15]:
temp = pd.get_dummies(gener_df)

0           318
1           318
2          2571
3          2571
4          2571
          ...  
15928    109850
15929      8605
15930      8605
15931      3689
15932      8130
Name: item, Length: 15933, dtype: int64

In [16]:
df = temp.groupby(by=["item"], as_index = False).agg("sum")

In [17]:
df.set_index('item')
names = ['item'] + list(range(1,19))
df.columns = names

In [18]:
df.sort_values('item')

Unnamed: 0,item,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6802,118700,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
6803,118900,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
6804,118997,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0
6805,119141,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
gener_writer_df = pd.merge(left=df, right=writer_df, how='outer', on='item')

In [21]:
gener_writer_df.writer = gener_writer_df.writer.astype('category').cat.codes

In [22]:
tt = pd.merge(left= rating_df, right=gener_writer_df, how='inner', on='item' )
print(len(tt))
tt

8290471


Unnamed: 0,user,item,rating,1,2,3,4,5,6,7,...,10,11,12,13,14,15,16,17,18,writer
0,11.0,4643.0,1.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,536
1,189.0,4643.0,1.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,536
2,294.0,4643.0,1.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,536
3,383.0,4643.0,1.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,536
4,421.0,4643.0,1.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,536
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8290466,138209.0,102880.0,0.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1713
8290467,138273.0,102880.0,0.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1713
8290468,138295.0,102880.0,0.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1713
8290469,138348.0,102880.0,0.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1713


In [24]:
gener_writer_df.item = gener_writer_df.item.astype('category').cat.codes

In [25]:
gener_writer_df

Unnamed: 0,item,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,writer
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,285
1,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1102
2,2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1189
3,3,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,437
4,4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6802,6802,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,-1
6803,6803,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1136
6804,6804,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1294
6805,6805,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1634


In [19]:
inf_df = pd.read_csv("inf_df.csv")
inf_df

Unnamed: 0,user,item
0,0,1
1,0,4098
2,0,4099
3,0,6148
4,0,2053
...,...,...
32761673,31359,2035
32761674,31359,2036
32761675,31359,2044
32761676,31359,4093


In [26]:
inference_df = pd.merge(left= inf_df, right=gener_writer_df, how='inner', on='item' )
inference_df

Unnamed: 0,user,item,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,writer
0,0,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1102
1,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1102
2,2,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1102
3,3,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1102
4,4,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32761673,31352,220,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,1,0,1650
32761674,31353,220,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,1,0,1650
32761675,31355,220,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,1,0,1650
32761676,31356,220,0,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,1,0,1650


In [27]:
inference_df[inference_df.writer == -1]

Unnamed: 0,user,item,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,writer
56253,0,4099,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,-1
56254,1,4099,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,-1
56255,2,4099,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,-1
56256,3,4099,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,-1
56257,4,4099,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31724235,31355,3479,1,0,0,0,0,0,0,1,...,0,1,0,0,0,0,1,0,0,-1
31724236,31356,3479,1,0,0,0,0,0,0,1,...,0,1,0,0,0,0,1,0,0,-1
31724237,31357,3479,1,0,0,0,0,0,0,1,...,0,1,0,0,0,0,1,0,0,-1
31724238,31358,3479,1,0,0,0,0,0,0,1,...,0,1,0,0,0,0,1,0,0,-1


In [28]:
idx_list = inference_df[inference_df.writer == -1].index
for idx in idx_list:
    inference_df.at[idx,"writer"] = random.randrange(0,2027)

Unnamed: 0,user,item,1,2,3,4,5,6,7,8,...,10,11,12,13,14,15,16,17,18,writer


In [29]:
inference_df.to_csv("inference_base.csv",index=False)

In [23]:
idx_list = tt[tt.writer == -1].index
for idx in idx_list:
    tt.at[idx,"writer"] = random.randrange(0,2027)

tt.loc[1122]

user       116.0
item      4643.0
rating       0.0
1            1.0
2            1.0
3            0.0
4            0.0
5            0.0
6            0.0
7            0.0
8            1.0
9            0.0
10           0.0
11           0.0
12           0.0
13           0.0
14           0.0
15           1.0
16           0.0
17           0.0
18           0.0
writer     536.0
Name: 1122, dtype: float64

In [24]:
tt.to_csv('rating_gener_writer_df_100.csv',index=False)

In [30]:
ffm_df = pd.read_csv('rating_gener_writer_df.csv')

In [4]:
ffm_df.user.astype('category').cat.categories

Float64Index([    11.0,     14.0,     18.0,     25.0,     31.0,     35.0,
                  43.0,     50.0,     58.0,     60.0,
              ...
              138459.0, 138461.0, 138470.0, 138471.0, 138472.0, 138473.0,
              138475.0, 138486.0, 138492.0, 138493.0],
             dtype='float64', length=31360)

In [33]:
user_cat = ffm_df.user.astype('category').cat
user_dict = dict(enumerate(user_cat.categories))
# save data
with open('user_dict.pickle','wb') as fw:
    pickle.dump(user_dict, fw)

In [34]:
item_cat = ffm_df.item.astype('category').cat
item_dict = dict(enumerate(item_cat.categories))
# save data
with open('item_dict.pickle','wb') as fw:
    pickle.dump(item_dict, fw)

In [43]:
# load data
with open('user_dict.pickle', 'rb') as fr:
    user_dict = pickle.load(fr)
    # load data
with open('item_dict.pickle', 'rb') as fr:
    item_dict = pickle.load(fr)
print("users :", len(user_dict)) #31360
print("items :", len(item_dict)) #6807

users : 31360
items : 6807


In [12]:
ffm_df

Unnamed: 0,user,item,rating,1,2,3,4,5,6,7,...,10,11,12,13,14,15,16,17,18,writer
0,11.0,4643.0,1.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,536
1,189.0,4643.0,1.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,536
2,294.0,4643.0,1.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,536
3,383.0,4643.0,1.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,536
4,421.0,4643.0,1.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,536
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6722466,135123.0,102880.0,0.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1713
6722467,135718.0,102880.0,0.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1713
6722468,136042.0,102880.0,0.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1713
6722469,138115.0,102880.0,0.0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1713


### 0.7. FFM 모델에 사용할 데이터셋 만들기

In [31]:
col_len = list() #[31360, 6807, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2027]

for col in ffm_df.columns:
    if col == "rating":
        continue
    category = ffm_df[f"{col}"].astype('category')
    col_len.append(len(ffm_df[f"{col}"].astype('category').cat.categories))
    ffm_df[f"{col}"] = category.cat.codes
    
col_len

[31360, 6807, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2027]

In [32]:
ffm_df = ffm_df

train_X, test_X, train_y, test_y = train_test_split(
    ffm_df.loc[:, ffm_df.columns != 'rating'], ffm_df['rating'], test_size=0.2, random_state=seed)
print('학습 데이터 크기:', train_X.shape, train_y.shape)
print('테스트 데이터 크기:', test_X.shape, test_y.shape)

학습 데이터 크기: (5377976, 21) (5377976,)
테스트 데이터 크기: (1344495, 21) (1344495,)


- `task = 'reg'` : 회귀 문제
    - 학습 : MSE를 손실함수로 사용
    - 검증 : MAE 사용


- `task = 'clf'` : 이진 분류 문제
    - 학습 : Cross Entropy를 손실 함수로 사용 ( 모델의 리턴 값이 logit이므로 `nn.BCEWithLogitsLoss()`를 사용 )
    - 검증 : AUC 사용

In [37]:
class FFMLayer(nn.Module):
    def __init__(self, field_dims, factor_dim):
        '''
        Parameter
            field_dims: List of field dimensions 
                        The sum become the entire dimension of input (in sparse feature)
                        The length become the number of fields
            factor_dim: Factorization dimension
        '''
        super().__init__()
        self.num_fields = len(field_dims)
        self.input_dim = sum(field_dims) #.to(device)
        self.embedding = nn.ModuleList([
            # FILL HERE : Fill in the places `None` with                                      #
            #             either `factorization_dim`, `self.num_fields`, or `self.input_dim`. #
            nn.Embedding(
                self.input_dim, factor_dim
            ) for _ in range(self.num_fields)
        ])

    def forward(self, x):
        '''
        Parameter
            x: Long tensor of size "(batch_size, num_fields)"
               Each value of variable is an index calculated including the dimensions up to the previous variable.
               for instance, [gender:male, age:20, is_student:True] 
                             -> [1,0, 0,1,0,0,0,0, 0,1] in one-hot encoding
                             -> x = [0,3,9].
        Return
            y: Float tensor of size "(batch_size)"
        '''
        
        xv = [self.embedding[f](x) for f in range(self.num_fields)]
        
        y = list()
        for f in range(self.num_fields):
            for g in range(f + 1, self.num_fields):
                y.append(xv[f][:, g] *  xv[g][:, f])
        y = torch.stack(y, dim=1)
        
        return torch.sum(y, dim=(2,1))

class FieldAwareFM(nn.Module):
    def __init__(self, field_dims, factor_dim):
        '''
        Parameter
            field_dims: List of field dimensions
            factor_dim: Factorization dimension
        '''
        super().__init__()
        self.input_dim = sum(field_dims)
        self.encoding_dims = np.concatenate([[0], np.cumsum(field_dims)[:-1]])
        self.linear = nn.Linear(self.input_dim, 1, bias=True) # FILL HERE : Fill in the places `None` #
        self.ffm = FFMLayer(field_dims, factor_dim) # FILL HERE : Fill in the places `None` #
        
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Embedding):
                nn.init.xavier_uniform_(m.weight)
            elif isinstance(m, FFMLayer):
                nn.init.normal_(m.v, 0, 0.01)

    def forward(self, x):
        '''
        Parameter
            x: Long tensor of size "(batch_size, num_fields)"
                x_multihot: Multi-hot coding of x. size "(batch_size, self.input_dim)"
        
        Return
            y: Float tensor of size "(batch_size)"
        '''
        dims = torch.tensor(self.input_dim).to(device)
        x = x + x.new_tensor(self.encoding_dims).unsqueeze(0)
        x_multihot = torch.zeros(x.size(0), dims).to(device).scatter_(1, x, 1.)
        
        y = self.linear(x_multihot).squeeze(1) + self.ffm(x) # FILL HERE : Use `self.linear()` and `self.ffm()` #

        return y


def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    train_loss = 0
    
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)
        train_loss += loss.item()

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 1000 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
    train_loss /= num_batches
    
    return train_loss


def test_loop(dataloader, model, loss_fn, task):
    num_batches = len(dataloader)
    test_loss, y_all, pred_all = 0, list(), list()

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item() / num_batches
            y_all.append(y)
            pred_all.append(pred)
    
    y_all = torch.cat(y_all).cpu()
    pred_all = torch.cat(pred_all).cpu()
    
    if task == 'reg':
        err = abs(pred_all - y_all).type(torch.float).mean().item()
        print(f"Test Error: \n  MAE: {(err):>8f} \n  Avg loss: {test_loss:>8f}")
    else:
        err = roc_auc_score(y_all, torch.sigmoid(pred_all)).item()
        print(f"Test Error: \n  AUC: {err:>8f} \n  Avg loss: {test_loss:>8f}")
    
    return err, test_loss

def train_and_test(train_dataloader, test_dataloader, model, loss_fn, optimizer, epochs, task):
    train_loss, test_err, test_loss = list(), list(), list()
    
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train_loss.append(train_loop(train_dataloader, model, loss_fn, optimizer))
        test_result = test_loop(test_dataloader, model, loss_fn, task)
        test_err.append(test_result[0])
        test_loss.append(test_result[1])
        print("-------------------------------\n")
    print("Done!")
    
    return train_loss, test_err, test_loss


# PyTorch의 DataLoader에서 사용할 수 있도록 변환 
train_dataset_ffm = TensorDataset(torch.LongTensor(np.array(train_X)), torch.Tensor(np.array(train_y)))
test_dataset_ffm = TensorDataset(torch.LongTensor(np.array(test_X)), torch.Tensor(np.array(test_y)))

######## Hyperparameter ########

batch_size = 64
data_shuffle = True
task = 'clf'
factorization_dim = 8
epochs = 1
learning_rate = 0.001
gpu_idx = 0

################################
# torch.cuda.empty_cache() # if necessary
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

# cuda setting
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

train_dataloader_ffm = DataLoader(train_dataset_ffm,num_workers = 4, batch_size=batch_size, shuffle=data_shuffle)
test_dataloader_ffm = DataLoader(test_dataset_ffm,num_workers = 4, batch_size=batch_size, shuffle=data_shuffle)
# train_dataloader_ffm = DataLoader(train_dataset_ffm, batch_size=batch_size, shuffle=data_shuffle)
# test_dataloader_ffm = DataLoader(test_dataset_ffm, batch_size=batch_size, shuffle=data_shuffle)

field_dims = col_len # 각 col의 길이
model = FieldAwareFM(field_dims, factorization_dim).to(device)

loss_fn = nn.MSELoss().to(device) if (task == 'clf') else nn.BCEWithLogitsLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.001, amsgrad=True)

train_loss, test_err, test_loss = train_and_test(train_dataloader_ffm, test_dataloader_ffm, 
                                                 model, loss_fn, optimizer, epochs, task)

cuda
Epoch 1
-------------------------------
loss: 1030.102539  [    0/5377976]
loss: 105.327217  [64000/5377976]
loss: 82.347389  [128000/5377976]
loss: 28.503357  [192000/5377976]
loss: 20.612480  [256000/5377976]
loss: 16.333706  [320000/5377976]
loss: 10.996138  [384000/5377976]
loss: 9.602505  [448000/5377976]
loss: 13.287176  [512000/5377976]
loss: 8.214600  [576000/5377976]
loss: 7.492981  [640000/5377976]
loss: 2.553311  [704000/5377976]
loss: 6.406963  [768000/5377976]
loss: 1.968645  [832000/5377976]
loss: 2.047839  [896000/5377976]
loss: 1.574688  [960000/5377976]
loss: 2.384252  [1024000/5377976]
loss: 2.269759  [1088000/5377976]
loss: 1.786258  [1152000/5377976]
loss: 1.434981  [1216000/5377976]
loss: 1.772189  [1280000/5377976]
loss: 1.268059  [1344000/5377976]
loss: 1.151474  [1408000/5377976]
loss: 1.062665  [1472000/5377976]
loss: 1.412100  [1536000/5377976]
loss: 3.220829  [1600000/5377976]
loss: 0.875425  [1664000/5377976]
loss: 0.939457  [1728000/5377976]
loss: 0.77

In [38]:
torch.save(model, "FFM.pth")

### Inference

In [3]:
class FFMLayer(nn.Module):
    def __init__(self, field_dims, factor_dim):
        '''
        Parameter
            field_dims: List of field dimensions 
                        The sum become the entire dimension of input (in sparse feature)
                        The length become the number of fields
            factor_dim: Factorization dimension
        '''
        super().__init__()
        self.num_fields = len(field_dims)
        self.input_dim = sum(field_dims) #.to(device)
        self.embedding = nn.ModuleList([
            # FILL HERE : Fill in the places `None` with                                      #
            #             either `factorization_dim`, `self.num_fields`, or `self.input_dim`. #
            nn.Embedding(
                self.input_dim, factor_dim
            ) for _ in range(self.num_fields)
        ])

    def forward(self, x):
        '''
        Parameter
            x: Long tensor of size "(batch_size, num_fields)"
               Each value of variable is an index calculated including the dimensions up to the previous variable.
               for instance, [gender:male, age:20, is_student:True] 
                             -> [1,0, 0,1,0,0,0,0, 0,1] in one-hot encoding
                             -> x = [0,3,9].
        Return
            y: Float tensor of size "(batch_size)"
        '''
        
        xv = [self.embedding[f](x) for f in range(self.num_fields)]
        
        y = list()
        for f in range(self.num_fields):
            for g in range(f + 1, self.num_fields):
                y.append(xv[f][:, g] *  xv[g][:, f])
        y = torch.stack(y, dim=1)
        
        return torch.sum(y, dim=(2,1))

class FieldAwareFM(nn.Module):
    def __init__(self, field_dims, factor_dim):
        '''
        Parameter
            field_dims: List of field dimensions
            factor_dim: Factorization dimension
        '''
        super().__init__()
        self.input_dim = sum(field_dims)
        self.encoding_dims = np.concatenate([[0], np.cumsum(field_dims)[:-1]])
        self.linear = nn.Linear(self.input_dim, 1, bias=True) # FILL HERE : Fill in the places `None` #
        self.ffm = FFMLayer(field_dims, factor_dim) # FILL HERE : Fill in the places `None` #
        
    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Embedding):
                nn.init.xavier_uniform_(m.weight)
            elif isinstance(m, FFMLayer):
                nn.init.normal_(m.v, 0, 0.01)

    def forward(self, x):
        '''
        Parameter
            x: Long tensor of size "(batch_size, num_fields)"
                x_multihot: Multi-hot coding of x. size "(batch_size, self.input_dim)"
        
        Return
            y: Float tensor of size "(batch_size)"
        '''
        dims = torch.tensor(self.input_dim).to(device)
        x = x + x.new_tensor(self.encoding_dims).unsqueeze(0)
        x_multihot = torch.zeros(x.size(0), dims).to(device).scatter_(1, x, 1.)
        
        y = self.linear(x_multihot).squeeze(1) + self.ffm(x) # FILL HERE : Use `self.linear()` and `self.ffm()` #

        return y

- Inference Data Load

In [4]:
inference_df = pd.read_csv("inference_base.csv")

# load data
with open('user_dict.pickle', 'rb') as fr:
    user_dict = pickle.load(fr)

# load data
with open('item_dict.pickle', 'rb') as fr:
    item_dict = pickle.load(fr)
print("users :", len(user_dict)) #31360
print("items :", len(item_dict)) #6807


users : 31360
items : 6807


In [5]:
inference_df.sort_values(by="user",axis = 0,inplace = True)


- DataLoader setting

In [6]:
# cuda setting
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)

cuda


In [80]:
batch_size = 2048

inference_dataset = TensorDataset(torch.LongTensor(np.array(inference_df)))
inference_dataloader = DataLoader(inference_dataset,
                                  batch_size=batch_size,
                                  pin_memory=use_cuda,
                                  drop_last=False,
                                  shuffle=False,
                                  num_workers = 4,
                                  )

- Load Model

In [77]:
model = torch.load(f"FFM.pth").to(device)
model.eval()

FieldAwareFM(
  (linear): Linear(in_features=40230, out_features=1, bias=True)
  (ffm): FFMLayer(
    (embedding): ModuleList(
      (0): Embedding(40230, 8)
      (1): Embedding(40230, 8)
      (2): Embedding(40230, 8)
      (3): Embedding(40230, 8)
      (4): Embedding(40230, 8)
      (5): Embedding(40230, 8)
      (6): Embedding(40230, 8)
      (7): Embedding(40230, 8)
      (8): Embedding(40230, 8)
      (9): Embedding(40230, 8)
      (10): Embedding(40230, 8)
      (11): Embedding(40230, 8)
      (12): Embedding(40230, 8)
      (13): Embedding(40230, 8)
      (14): Embedding(40230, 8)
      (15): Embedding(40230, 8)
      (16): Embedding(40230, 8)
      (17): Embedding(40230, 8)
      (18): Embedding(40230, 8)
      (19): Embedding(40230, 8)
      (20): Embedding(40230, 8)
    )
  )
)

In [92]:
user_list = list()
score_list = list()
item_list = list()

with torch.no_grad():
    cnt = 0
    for batch in tqdm(inference_dataloader):
        x = batch[0].to(device) 
        # print ("[DEBUG] model input x-----")
        
        # print ("--------------------------")
        output = model(x) #[B] ///x 에 대한 점수
        #idx = torch.where(output >= 1)[0] # 점수가 1 이상인 index
        
        info = x.cpu()
        #scores = output.index_select(0,idx).cpu().tolist()
        scores = output.cpu().tolist()
        users = info[:,0].tolist()
        items = info[:,1].tolist()

        user_list += users
        item_list += items
        score_list += scores

np_user_list = np.array(user_list)
np_item_list = np.array(item_list)
np_score_list = np.array(score_list)

100%|██████████| 15997/15997 [22:44<00:00, 11.72it/s]


### TOP 10

In [93]:
users = list()
items = list()
for user_code, user_id in tqdm(user_dict.items()):
    u_id = int(user_id)

    idx = np.where(np_user_list == user_code)[0].tolist()
    
    item_score = np_score_list.take(idx) #user code 에 해당하는 item_score
    item_ = np_item_list.take(idx) # user code에 해당하는 item
    top10_idx = np.argpartition(item_score, -10)[-10:] # 상위 10개 index 추출

    top10_item = [int(item_dict[code]) for code in item_.take(top10_idx)] #top 10(item code -> item id)
    user_id = [u_id] * 10

    users += user_id
    items += top10_item

  5%|▌         | 1662/31360 [01:18<23:38, 20.93it/s]

In [88]:
result = np.vstack((users,items)).T

0 11.0


### SAVE submission.csv

In [None]:
info = pd.DataFrame(result, columns=['user','item'])
info.to_csv("FFM_submission.csv",index=False)

#### Recall@10

In [None]:
import json

print("testing recall@10...")
# 학습에 사용된 user만 uniq_user에 저장
uniq_user = list(user_dict.values())
print (f"Number of users : {len(uniq_user)}")

with open("/opt/ml/input/workspace/BERT4Rec/data/answers.json", "r") as json_file: #answer.json 경로 지정
    answer = json.load(json_file)

# movielens-20m과 submission을 비교하여 Recall@10 값 계산
submission_df = pd.read_csv(f"submission.csv")
recall_result = []

# 각 유저마다 recall@10 계산하여 list에 저장
for user in tqdm(uniq_user):
    submission_by_user = submission_df[submission_df['user'] == user]['item']

    hit = 0
    for item in submission_by_user:
        if item in answer[str(user)]:
            hit += 1

        recall_result.append(hit / 10)

# 전체 유저의 Recall@10의 평균 출력
print (f"Predicted submission result of Recall@10 = {np.average(recall_result)}")

#### 1.3.4. 학습 곡선 도식화

---

## 2. PyTorch로 FFM 구현하기

In [78]:
x = torch.tensor([[5,1,2]])
field_dims = [10,10,10]
input_dim = sum(field_dims)
x_multihot = torch.zeros(1, input_dim).scatter_(1,x,1.)
x_multihot

tensor([[0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [98]:
np.cumsum(field_dims)[:-1]

array([10, 20])

In [89]:
encoding_dims = np.concatenate([[0], np.cumsum(field_dims)[:-1]])
x_new = x.new_tensor(encoding_dims).unsqueeze(0)
x_new


tensor([[ 0, 10, 20]])

In [87]:
x_temp = x + x_new
x_multihot = torch.zeros(1, input_dim).scatter_(1,x_temp,1.)
x_multihot

tensor([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]])

In [8]:
dims = np.concatenate([[0], np.cumsum(field_dims)[:-1]])