In [1]:
import pandas as pd
import numpy as np
import os

from tqdm import tqdm

In [2]:
data_dir = '/opt/ml/movie-recommendation/data/'

# Full training interaction log; later cells read its 'user' and 'item' columns.
train_ratings_path = os.path.join(data_dir, 'train', 'train_ratings.csv')
train_df = pd.read_csv(train_ratings_path)

## Top 10% most popular items

In [3]:
# Candidate pool: the top 10% of distinct items, ranked by how often they occur in train_df.
num_items = train_df['item'].nunique()
num_users = train_df['user'].nunique()

item_counts = train_df['item'].value_counts()  # value_counts sorts by count, descending
top_k = int(num_items * 0.1)
most_popular_items = set(item_counts.iloc[:top_k].index.values)

# One (user, Series-of-items) pair per user, materialised as a list for the loop that follows.
observed_items_per_user = list(train_df.groupby('user')['item'])


In [4]:
# For every user, collect the popular items that user has not interacted with yet.
# The pairs are accumulated in two flat lists and turned into a single DataFrame at
# the end: this avoids constructing one tiny DataFrame per user, keeps empty
# per-user results from influencing the column dtypes during concatenation, and
# yields an empty frame (instead of pd.concat raising) when there are no users.
candidate_users = []
candidate_items = []

for user, observed_items in tqdm(observed_items_per_user):
    unseen_items = most_popular_items - set(observed_items)

    candidate_users.extend([user] * len(unseen_items))
    candidate_items.extend(unseen_items)

# Columns: 'user' and 'item', one row per (user, unseen popular item) pair.
test_df = pd.DataFrame({'user': candidate_users, 'item': candidate_items})

100%|██████████| 31360/31360 [00:30<00:00, 1042.50it/s]


In [5]:
genre_data = os.path.join(data_dir, 'train', 'genres.tsv')

genre_df = pd.read_csv(genre_data, sep='\t')
# Keep a single genre row per item (drop_duplicates keeps the first occurrence).
genre_df = genre_df.drop_duplicates(subset=['item'])

# Encode each genre label as an integer index. The labels are sorted before being
# enumerated: iterating a bare set of strings follows hash order, which changes
# between interpreter runs, so the encoding would not be reproducible.
# NOTE(review): whatever produced the model's training features must use this same
# ordering for the indices to line up -- confirm against the training pipeline.
genre_dict = {genre: i for i, genre in enumerate(sorted(set(genre_df['genre'])))}
genre_df['genre'] = genre_df['genre'].map(lambda x: genre_dict[x])

In [6]:
# Attach the genre columns; the inner join drops candidate items that have no row in genre_df.
test_df = pd.merge(test_df, genre_df, on='item', how='inner')

# Map user and item ids to zero-based indices, ordered by sorted raw id.
users = sorted(set(train_df.loc[:, 'user']))
items = sorted(set(train_df.loc[:, 'item']))

# The lookup tables are always built, even when the raw ids are already 0..n-1
# (the mapping is then the identity). Later cells invert users_dict / items_dict
# unconditionally, so defining them only in some cases would raise a NameError there.
users_dict = {raw_id: idx for idx, raw_id in enumerate(users)}
items_dict = {raw_id: idx for idx, raw_id in enumerate(items)}

test_df['user'] = test_df['user'].map(lambda x: users_dict[x])
test_df['item'] = test_df['item'].map(lambda x: items_dict[x])

# Order rows by encoded user and renumber the index from 0.
test_df = test_df.sort_values(by=['user']).reset_index(drop=True)

In [7]:
# Write the candidate rows to the eval directory, without the DataFrame index column.
unseen_items_csv = data_dir + 'eval/top10_unseen_items.csv'
test_df.to_csv(unseen_items_csv, index=False)

In [8]:
# Load the model output, whose 'user' / 'item' columns hold zero-based indices.
# NOTE(review): this same file is overwritten by the save cell below, so re-running
# this cell after saving would apply the inverse mapping to already-restored ids.
submission = pd.read_csv('/opt/ml/movie-recommendation/DeepFM/output/output.csv')

# Restore the raw ids. Position i of the sorted id lists is the encoded index i, so
# enumerating those lists gives the inverse mapping directly. This does not rely on
# users_dict / items_dict, which are only defined when a remapping was needed.
inv_user_map = dict(enumerate(users))
submission['user'] = submission['user'].map(lambda x: inv_user_map[x])

inv_item_map = dict(enumerate(items))
submission['item'] = submission['item'].map(lambda x: inv_item_map[x])

In [10]:
# Save the submission with raw ids restored, replacing the model's output file in place.
submission_csv = '/opt/ml/movie-recommendation/DeepFM/output/output.csv'
submission.to_csv(submission_csv, index=False)