In [1]:
import os
import numpy as np
import pandas as pd
from copy import deepcopy

In [5]:
data_dir = '/opt/ml/movie-recommendation/data/train/'

# 1. Rating df 생성
rating_data = data_dir + "/Negative Sampled Ratings.csv"
raw_rating_df = pd.read_csv(rating_data)

users = set(raw_rating_df.loc[:, 'user'])
items = set(raw_rating_df.loc[:, 'item'])

In [6]:
raw_rating_df

Unnamed: 0,user,item,rating
0,11,4643,1.0
1,11,170,1.0
2,11,531,1.0
3,11,616,1.0
4,11,2140,1.0
...,...,...,...
6722466,138493,71899,0.0
6722467,138493,65037,0.0
6722468,138493,58,0.0
6722469,138493,252,0.0


In [7]:
#2-1. Genre df 생성
genre_data = data_dir + "genres.tsv"
raw_genre_df = pd.read_csv(genre_data, sep='\t')

multi_hot_encoded = raw_genre_df.pivot_table(index=['item'], columns=['genre'], aggfunc=[len], fill_value=0)
multi_hot_encoded = multi_hot_encoded.droplevel(level=0, axis=1)

multi_hot_encoded

genre,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118700,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
118900,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
118997,0,0,0,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0
119141,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
#2-2. Title df 생성
title_data = data_dir + "titles.tsv"
raw_title_df = pd.read_csv(title_data, sep='\t')

In [9]:
#2-3. Year df 생성
year_data = data_dir + "years.tsv"
raw_year_df = pd.read_csv(year_data, sep='\t')

# title로부터 year 추출
print('# of items Before preprocessing:', raw_year_df['item'].nunique())

item_ids = set(raw_rating_df.loc[:,'item'])
no_year_items_ids = item_ids - set(raw_year_df.loc[:,'item'])

cond = raw_title_df['item'].isin(no_year_items_ids)
no_year_items = deepcopy(raw_title_df[cond])
no_year_items['year'] = no_year_items['title'].apply(lambda x:int(x[-5:-1])).values
raw_year_df = pd.concat([raw_year_df, no_year_items[['item', 'year']]], axis=0)

print('# of items After preprocessing:', raw_year_df['item'].nunique())

# of items Before preprocessing: 6799
# of items After preprocessing: 6807


In [10]:
year_dict = {year:i for i, year in enumerate(set(raw_year_df['year']))}
raw_year_df['year']  = raw_year_df['year'].map(lambda x : year_dict[x]) #year id로 변경
raw_year_df

Unnamed: 0,item,year
0,1348,7
1,44587,7
2,4768,7
3,8235,8
4,8609,8
...,...,...
2620,7243,2
2669,8511,3
2676,6988,4
3177,32898,0


In [11]:
#2-4. Director df 생성
director_data = data_dir + "directors.tsv"
raw_director_df = pd.read_csv(director_data, sep='\t')

In [12]:
#2-5. Writer df 생성
writer_data = data_dir + "writers.tsv"
raw_writer_df = pd.read_csv(writer_data, sep='\t')

In [13]:
# 4. Join dfs
joined_rating_df = pd.merge(raw_rating_df, raw_year_df, left_on='item', right_on='item', how='inner')
joined_rating_df = pd.merge(joined_rating_df, multi_hot_encoded, left_on='item', right_on='item', how='inner')
joined_rating_df = joined_rating_df.sort_values(by=['user'])
joined_rating_df.reset_index(drop=True, inplace=True)

data = joined_rating_df
data

Unnamed: 0,user,item,rating,year,Action,Adventure,Animation,Children,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,11,4643,1.0,86,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,11,60069,1.0,93,0,1,1,1,0,0,...,0,0,0,0,0,1,1,0,0,0
2,11,4492,1.0,71,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,11,2600,1.0,84,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
4,11,4571,1.0,74,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6722466,138493,260,1.0,62,1,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6722467,138493,1597,1.0,82,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0
6722468,138493,1203,1.0,42,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6722469,138493,7149,1.0,88,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [14]:
data_dir = data_dir + 'context-aware/'
if not os.path.exists(data_dir) :
    os.mkdir(data_dir)
data.to_csv(os.path.join(data_dir, 'Ratings with Side-Information.csv'), mode='w', index=False)

In [15]:
# 5. user, item을 zero-based index로 mapping
users = list(set(data.loc[:,'user']))
users.sort()
items =  list(set((data.loc[:, 'item'])))
items.sort()

if len(users)-1 != max(users):
    users_dict = {users[i]: i for i in range(len(users))}
    data['user']  = data['user'].map(lambda x : users_dict[x])
    users = list(set(data.loc[:,'user']))
    
if len(items)-1 != max(items):
    items_dict = {items[i]: i for i in range(len(items))}
    data['item']  = data['item'].map(lambda x : items_dict[x])
    items =  list(set((data.loc[:, 'item'])))


In [81]:
import torch 

n_user = data.loc[:,'user'].nunique()
n_item = data.loc[:,'item'].nunique()
n_year = data.loc[:,'year'].nunique()
n_genre = 18

user_col = torch.tensor(np.array(data.loc[:,'user']))
item_col = torch.tensor(np.array(data.loc[:,'item']))
year_col = torch.tensor(np.array(data.loc[:,'year']))

multi_hot_cols = data.columns.difference(['user', 'item', 'year','rating'])
genre_col = torch.tensor(np.array(data.loc[:,multi_hot_cols]))

offsets = [0, n_user, n_user+n_item]
for col, offset in zip([user_col, item_col, year_col], offsets):
    col += offset

X = torch.cat([user_col.unsqueeze(1), item_col.unsqueeze(1), year_col.unsqueeze(1), genre_col], dim=1).long()
y = torch.tensor(np.array(data.loc[:,'rating'])).long()


In [84]:
import torch.nn as nn

embedding = nn.Embedding(n_user+n_item+n_year+n_genre, 10)
one_hot_x = X[:,:3]
multi_hot_x = X[:,3:]

In [97]:
import time

embed_x = embedding(one_hot_x)

sum_embed = []

start = time.time()
for mhx in multi_hot_x :
    genres = torch.where(mhx)
    embed_genres = embedding(genres[0] + n_user+n_item+n_year)
    sum_embed.append(torch.sum(embed_genres, axis=0).unsqueeze(0))
sum_embed = torch.cat(sum_embed, axis=0)

embed_x= torch.cat([embed_x, sum_embed.unsqueeze(1)], axis=1)
print(time.time() - start)