In [1]:
import pandas as pd
import numpy as np
import os

from tqdm import tqdm

In [2]:
data_dir = '/opt/ml/movie-recommendation/data/'
train_df = pd.read_csv(os.path.join(data_dir+'train', 'train_ratings.csv')) # 전체 학습 데이터

## Top 10%

In [3]:
# most popular 10 items
num_items = train_df['item'].nunique()
num_users = train_df['user'].nunique()
most_popular_items = train_df['item'].value_counts()[:int(num_items*0.1)].index.values

most_popular_items = set(most_popular_items)
observed_items_per_user = list(train_df.groupby('user')['item'])


In [4]:
unseen_items_dfs = list()

for user, observed_items in tqdm(observed_items_per_user):
    observed_items = set(observed_items)
    unseen_item = list(most_popular_items - observed_items)

    user_id = [user]*len(unseen_item)
    unseen_items_dfs.append(pd.DataFrame(zip(user_id,unseen_item), columns=['user','item']))

test_df = pd.concat(unseen_items_dfs, axis = 0, sort=False)
test_df
    

100%|██████████| 31360/31360 [00:29<00:00, 1047.52it/s]


Unnamed: 0,user,item
0,11,2
1,11,6
2,11,2054
3,11,4105
4,11,10
...,...,...
515,138493,30707
516,138493,4085
517,138493,26614
518,138493,55290


In [5]:
genre_data = data_dir+'train/' + "genres.tsv"

genre_df = pd.read_csv(genre_data, sep='\t')
genre_df = genre_df.drop_duplicates(subset=['item']) #item별 하나의 장르만 남도록 drop

genre_dict = {genre:i for i, genre in enumerate(set(genre_df['genre']))}
genre_df['genre']  = genre_df['genre'].map(lambda x : genre_dict[x]) #gen

In [6]:
test_df = pd.merge(test_df, genre_df, left_on='item', right_on='item', how='inner')
test_df

Unnamed: 0,user,item,genre
0,11,2,13
1,14,2,13
2,18,2,13
3,25,2,13
4,31,2,13
...,...,...,...
18182955,138470,356,17
18182956,138471,356,17
18182957,138473,356,17
18182958,138475,356,17


In [7]:
test_df.to_csv(data_dir+'eval/top10_unseen_items.csv')