In [1]:
import pandas as pd
import numpy as np
import os
import json

In [2]:
# Directory holding the training data files.
data_path = './data/train'

# Full training interaction data.
ratings_csv = os.path.join(data_path, 'train_ratings.csv')
train_df = pd.read_csv(ratings_csv)

# Unique item ids that occur in the training interactions.
item_list = train_df['item'].unique()

In [3]:
def _read_tsv(filename):
    """Load one tab-separated side-information file from `data_path`."""
    return pd.read_csv(os.path.join(data_path, filename), sep='\t')


# Side-information tables, one file per attribute.
year_data = _read_tsv('years.tsv')
writer_data = _read_tsv('writers.tsv')
title_data = _read_tsv('titles.tsv')
genre_data = _read_tsv('genres.tsv')
director_data = _read_tsv('directors.tsv')

In [4]:
# Bucket the movie release years into decades (e.g. 1927 -> 1920).
year_data['year'] = year_data['year'].floordiv(10).mul(10)
print(year_data)

        item  year
0       1348  1920
1      44587  1920
2       4768  1920
3       8235  1920
4       8609  1920
...      ...   ...
6794  114795  2010
6795  110771  2010
6796  112804  2010
6797  113378  2010
6798  109850  2010

[6799 rows x 2 columns]


In [5]:
# Zero-based integer encoding: every distinct attribute value gets an id
# (0..n-1) in order of first appearance within its own table.
year2n_dict = {year: n for n, year in enumerate(year_data['year'].unique())}
writer2n_dict = {writer: n for n, writer in enumerate(writer_data['writer'].unique())}
genre2n_dict = {genre: n for n, genre in enumerate(genre_data['genre'].unique())}
director2n_dict = {director: n for n, director in enumerate(director_data['director'].unique())}

# Assign the encoded column back to the frame rather than calling
# `df[col].replace(mapping, inplace=True)`. That form mutates a temporary
# column selection: pandas 2.2 warns about it and, with Copy-on-Write enabled,
# the DataFrame is left unchanged, so the offsets below would be added to the
# raw (unencoded) values. `Series.map` is also much faster than `replace` for
# large dictionaries. Every value is a key of its dictionary by construction.
genre_data['genre'] = genre_data['genre'].map(genre2n_dict)
year_data['year'] = year_data['year'].map(year2n_dict)
director_data['director'] = director_data['director'].map(director2n_dict)
writer_data['writer'] = writer_data['writer'].map(writer2n_dict)

# Shift each attribute family into its own id range so ids from different
# families never collide: genres first, then years, directors, writers.
year_offset = len(genre2n_dict)
director_offset = year_offset + len(year2n_dict)
writer_offset = director_offset + len(director2n_dict)

year_data['year'] += year_offset
director_data['director'] += director_offset
writer_data['writer'] += writer_offset

In [6]:
def _ids_per_item(table, column):
    """Collapse `table` to one row per item holding the list of its `column` ids."""
    return table.groupby('item')[column].apply(list).reset_index()


lines_genre_data = _ids_per_item(genre_data, 'genre')
lines_year_data = _ids_per_item(year_data, 'year')
lines_director_data = _ids_per_item(director_data, 'director')
lines_writer_data = _ids_per_item(writer_data, 'writer')

# One row per item seen in the training data.
item_list = pd.DataFrame(item_list, columns=['item'])

# Left joins keep every training item; an item missing from a side table
# ends up with NaN in that column, which is then replaced by the sentinel 0.
for per_item_lists in (lines_genre_data, lines_year_data,
                       lines_director_data, lines_writer_data):
    item_list = item_list.merge(per_item_lists, how='left', on='item')
item_list = item_list.fillna(0)

In [7]:
# Turn the 0 sentinel (item had no entry in that side table) into an empty
# list so that every cell of the attribute columns is a list.
for attr_col in ('genre', 'year', 'director', 'writer'):
    item_list[attr_col] = [cell if cell != 0 else [] for cell in item_list[attr_col]]

In [8]:
# Attribute ids per item: genre ids followed by the decade id. Director and
# writer ids are currently left out; append item_list['director'] and
# item_list['writer'] here to include them.
attr_lists = item_list['genre'] + item_list['year']

# Order rows by numeric item id before the ids are converted to strings.
item2attributes_df = pd.DataFrame(
    {'item': item_list['item'], 'attr': attr_lists}
).sort_values('item')
item2attributes_df['item'] = item2attributes_df['item'].apply(str)

# Mapping from item id (as a string) to its list of attribute ids.
item2attributes = item2attributes_df.set_index('item')['attr'].to_dict()

In [9]:
# Display the mapping built above: string item id -> list of attribute ids.
item2attributes

{'1': [8, 12, 13, 5, 9, 25],
 '2': [8, 13, 9, 25],
 '3': [5, 6, 25],
 '4': [5, 1, 6, 25],
 '5': [5, 25],
 '6': [2, 0, 4, 25],
 '7': [5, 6, 25],
 '8': [8, 13, 25],
 '9': [2, 25],
 '10': [2, 8, 4, 25],
 '11': [5, 1, 6, 25],
 '12': [5, 10, 25],
 '13': [8, 12, 13, 25],
 '14': [1, 25],
 '15': [2, 8, 6, 25],
 '16': [0, 1, 25],
 '17': [1, 6, 25],
 '18': [5, 25],
 '19': [5, 25],
 '20': [2, 5, 0, 1, 4, 25],
 '21': [5, 0, 4, 25],
 '22': [0, 1, 10, 11, 4, 25],
 '23': [2, 0, 4, 25],
 '24': [1, 3, 25],
 '25': [1, 6, 25],
 '26': [1, 25],
 '27': [13, 1, 25],
 '28': [1, 6, 25],
 '29': [8, 1, 9, 11, 3, 25],
 '30': [0, 1, 25],
 '31': [1, 25],
 '32': [11, 3, 4, 25],
 '34': [13, 1, 25],
 '36': [0, 1, 25],
 '39': [5, 6, 25],
 '41': [1, 7, 25],
 '42': [2, 0, 1, 25],
 '43': [1, 25],
 '44': [2, 8, 9, 25],
 '45': [5, 1, 4, 25],
 '46': [1, 6, 25],
 '47': [11, 4, 25],
 '48': [12, 13, 1, 15, 6, 25],
 '50': [0, 11, 4, 25],
 '52': [5, 1, 6, 25],
 '57': [1, 25],
 '58': [5, 1, 6, 25],
 '60': [8, 13, 9, 25],
 '61': [1

In [10]:
# Persist the item -> attribute-id mapping as JSON next to the training data.
attributes_json = os.path.join(data_path, 'Ml_item2attributes-0.json')
with open(attributes_json, 'w') as out_file:
    json.dump(item2attributes, out_file)