In [2]:
import os
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from scipy.sparse import csr_matrix

In [3]:
# model setting
max_len = 50
hidden_units = 50
num_heads = 1
num_layers = 2
dropout_rate = 0.5
num_workers = 1
# Use the GPU when one is visible, otherwise fall back to CPU so the notebook
# still runs end to end on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# training setting
lr = 0.001
batch_size = 128
num_epochs = 200
mask_prob = 0.15 # for cloze task

# NOTE(review): absolute, machine-specific path; adjust to the local environment.
model_save_dir = '/opt/ml/input/experiment/'
model_save_file = 'bert4rec_model.pt'

In [10]:
############# IMPORTANT #############
# data_path must be set to match your own directory layout.
data_path = './input/data/train/train_ratings.csv'
genre_path = './input/data/train/genres.tsv'

df = pd.read_csv(data_path)
genre_df = pd.read_csv(genre_path, sep='\t')

# Distinct raw ids, in order of first appearance in each frame.
item_ids = df['item'].unique()
user_ids = df['user'].unique()
genre_ids = genre_df['genre'].unique()
num_item = len(item_ids)
num_user = len(user_ids)
num_batch = num_user // batch_size

# Forward (raw id -> index) and reverse (index -> raw id) lookup tables.
# Items are numbered from 1; users and genres are numbered from 0.
item_to_index = dict(zip(item_ids, range(1, num_item + 1)))
index_to_item = dict(enumerate(item_ids, start=1))

user_to_index = dict(zip(user_ids, range(num_user)))
index_to_user = dict(enumerate(user_ids))

genre_to_index = dict(zip(genre_ids, range(len(genre_ids))))
index_to_genre = dict(enumerate(genre_ids))

# user, item indexing
# The same mappings as pandas Series, keyed by raw id, for vectorized lookups.
item2idx = pd.Series(np.arange(1, num_item + 1), index=item_ids) # item re-indexing (1~num_item), num_item+1: mask idx
user2idx = pd.Series(np.arange(num_user), index=user_ids) # user re-indexing (0~num_user-1)
genre2idx = pd.Series(np.arange(len(genre_ids)), index=genre_ids)

         item        genre
0         318        Crime
1         318        Drama
2        2571       Action
3        2571       Sci-Fi
4        2571     Thriller
...       ...          ...
15928  109850        Drama
15929    8605       Action
15930    8605       Comedy
15931    3689       Comedy
15932    8130  Documentary

[15933 rows x 2 columns]


In [11]:
# dataframe indexing
# Attach the contiguous item/user indices by direct lookup instead of merging
# against throwaway frames. Every id in df is a key of item2idx / user2idx
# (both were built from df's own unique ids), so no row is left unmapped.
# Unlike the merge, re-running this cell simply overwrites the two columns
# rather than producing suffixed duplicates.
df = df.assign(
    item_idx=df['item'].map(item2idx),
    user_idx=df['user'].map(user2idx),
)

In [None]:
# Replace each genre name with its integer index from genre_to_index.
# A value that is not a key of genre_to_index raises KeyError, so this cell
# is meant to run once per fresh load of genre_df.
genre_df['genre'] = genre_df['genre'].map(genre_to_index.__getitem__)

In [14]:
# Show the genre -> index table, then the encoded genre frame, one after the other.
print(pd.Series(genre_to_index), genre_df, sep='\n')

Crime           0
Drama           1
Action          2
Sci-Fi          3
Thriller        4
Comedy          5
Romance         6
War             7
Adventure       8
Fantasy         9
Horror         10
Mystery        11
Animation      12
Children       13
Film-Noir      14
Musical        15
Western        16
Documentary    17
dtype: int64
         item  genre
0         318      0
1         318      1
2        2571      2
3        2571      3
4        2571      4
...       ...    ...
15928  109850      1
15929    8605      2
15930    8605      5
15931    3689      5
15932    8130     17

[15933 rows x 2 columns]


In [19]:
# Collapse genre_df to one row per item, with that item's genre values
# gathered into a list in their original row order.
temp_df = (
    genre_df
    .groupby('item')['genre']
    .agg(list)
    .reset_index(name='genre')
)
print(temp_df)

        item              genre
0          1  [8, 12, 13, 5, 9]
1          2         [8, 13, 9]
2          3             [5, 6]
3          4          [5, 1, 6]
4          5                [5]
...      ...                ...
6802  118700                [1]
6803  118900                [1]
6804  118997     [13, 5, 9, 15]
6805  119141             [2, 5]
6806  119145       [2, 8, 5, 0]

[6807 rows x 2 columns]


In [20]:
def genre_one_hot_encoding(x, num_genres=18):
    """Encode one or several genre indices as a 0/1 list.

    Args:
        x: A single integer genre index, or an iterable of integer genre
            indices (for example one of the per-item genre lists).
        num_genres: Length of the returned list. Defaults to 18, the value
            that was previously hard-coded.

    Returns:
        A list of ``num_genres`` ints with 1 at every position named by ``x``
        and 0 everywhere else. An empty iterable yields an all-zero list.

    Raises:
        IndexError: If an index falls outside the list bounds.
    """
    genre_list = [0] * num_genres
    try:
        # Iterables (list, tuple, array) are expanded into their elements.
        indices = list(x)
    except TypeError:
        # Non-iterable input: treat it as a single index, as before.
        indices = [x]
    for genre_index in indices:
        genre_list[genre_index] = 1
    return genre_list