In [44]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import time

In [4]:
# Directory that holds the training files. The trailing slash matters:
# file names are appended to this string with plain `+` concatenation.
data_dir = "/opt/ml/movie-recommendation/data/train/"

In [23]:
# Build the rating DataFrame.
# data_dir already ends with "/", so the file name is appended without an
# extra leading slash (the same way the genre file path is built).
rating_data = data_dir + "train_ratings.csv"

# Drop the `time` column as part of the load, returning a new frame rather
# than mutating one in place.
raw_rating_df = pd.read_csv(rating_data).drop(columns=['time'])

In [24]:
# Build the genre DataFrame from the tab-separated genre file.
genre_data = data_dir + "genres.tsv"
raw_genre_df = pd.read_csv(genre_data, sep='\t')

# Pivot to one row per item and one column per genre. Each cell is the number
# of (item, genre) rows in the file, with 0 where the pair never occurs.
# Passing aggfunc as a list adds an outer column level, which droplevel removes.
multi_hot_encoded = (
    raw_genre_df
    .pivot_table(index=['item'], columns=['genre'], aggfunc=[len], fill_value=0)
    .droplevel(level=0, axis=1)
)

multi_hot_encoded


genre,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118700,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
118900,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
118997,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0
119141,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [28]:
# Attach the genre indicator columns to every rating row by matching on
# `item` (inner join: ratings whose item has no genre row are dropped),
# then order the rows by user.
data = (
    pd.merge(raw_rating_df, multi_hot_encoded, on='item', how='inner')
    .sort_values(by=['user'])
)
data

Unnamed: 0,user,item,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,11,4643,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
324906,11,37830,1,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
325641,11,60040,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
327148,11,34319,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
329565,11,8644,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2913055,138493,2085,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
477340,138493,8636,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1490967,138493,44022,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
369728,138493,1748,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0


In [41]:
# Genre indicator columns are every column except the two id columns.
# Index.difference sorts its result by default, so the column order here is
# the sorted label order rather than the frame's original order.
multi_hot_cols = data.columns.difference(['user', 'item'])

# 2-D tensor: one row per rating row, one column per genre.
multi_hot_x = torch.tensor(np.array(data[multi_hot_cols]))
multi_hot_x

tensor([[1, 1, 0,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 1, 1,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 1, 0]])

In [49]:
# One embedding row per genre column, each a 10-dimensional vector.
n_genre = len(multi_hot_cols)
embedding_layer = nn.Embedding(num_embeddings=n_genre, embedding_dim=10)

In [50]:
# Baseline: look up and sum the genre embeddings one row at a time.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted while the loop is running.
start = time.perf_counter()
sum_embed = []

for mhx in multi_hot_x :
    # Positions of the non-zero entries in this row (returned as a tuple).
    indices = torch.where(mhx)
    embed = embedding_layer(indices[0])
    # Sum the looked-up vectors; unsqueeze keeps a leading axis so the
    # per-row results can be concatenated into a 2-D tensor afterwards.
    sum_embed.append(torch.sum(embed, axis=0).unsqueeze(0))

sum_embed = torch.cat(sum_embed, axis=0)
print(time.perf_counter() - start)

196.9965476989746


In [35]:
# Re-encode a copy of the multi-hot matrix: every non-zero cell is replaced
# by its column position + 1, so 0 is left to mean "no genre in this slot".
x = multi_hot_x.clone()
indices = torch.nonzero(x)
rows, cols = indices[:, 0], indices[:, 1]
x[rows, cols] = cols + 1

In [36]:
# Sanity check on the first five rows: the (row, column) positions of the
# non-zero cells, followed by the re-encoded matrix for the same rows.
preview_nonzero = multi_hot_x[:5].nonzero()
preview_transformed = x[:5]
print('non-zero indices\n', preview_nonzero)
print('transformed matrix\n', preview_transformed)

non-zero indices
 tensor([[ 0,  0],
        [ 0,  1],
        [ 0,  7],
        [ 0, 14],
        [ 1,  0],
        [ 1,  1],
        [ 1,  2],
        [ 1,  8],
        [ 1, 14],
        [ 2,  0],
        [ 2, 14],
        [ 3,  0],
        [ 3, 14],
        [ 3, 15],
        [ 4,  0],
        [ 4,  1],
        [ 4, 14],
        [ 4, 15]])
transformed matrix
 tensor([[ 1,  2,  0,  0,  0,  0,  0,  8,  0,  0,  0,  0,  0,  0, 15,  0,  0,  0],
        [ 1,  2,  3,  0,  0,  0,  0,  0,  9,  0,  0,  0,  0,  0, 15,  0,  0,  0],
        [ 1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 15,  0,  0,  0],
        [ 1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 15, 16,  0,  0],
        [ 1,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 15, 16,  0,  0]])


In [51]:
# One extra row so that index 0 can act as padding: with padding_idx=0 that
# row is initialised to zeros and receives no gradient, so padded slots add
# nothing when the looked-up vectors are summed.
embedding_layer = nn.Embedding(
    num_embeddings=n_genre + 1,
    embedding_dim=10,
    padding_idx=0,
)

In [52]:
# Vectorized version: one embedding lookup for the whole matrix.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-run.
start = time.perf_counter()

# Replace every non-zero cell with (column position + 1); zeros stay 0.
x = multi_hot_x.clone()
indices = x.nonzero()
x[indices[:,0], indices[:,1]] = indices[:,1]+1
# Looking up a 2-D index tensor yields a 3-D result (row, column, embedding).
# NOTE(review): assumes embedding_layer is the padding_idx=0 layer with
# n_genre+1 rows, so the 0 entries map to the padding row; confirm the cell
# that defines it has been run before this one.
embed = embedding_layer(x)
# Collapse the column axis to get one summed vector per row.
sum_embed = torch.sum(embed, axis=1)

print(time.perf_counter() - start)

29.018701791763306
