In [34]:
import numpy as np
import scipy as sp
import scipy.sparse
import scipy.special
import bisect
import random
import pandas as pd
import json
from copy import deepcopy

In [2]:
def zero_based_mapping(data) :
    with open('/opt/ml/movie-recommendation/data/train/zero_mapping.json', 'r') as f:
        dict_data= json.load(f)

    n_user = len(dict_data['user'])
    n_item = len(dict_data['item'])

    data['user']  = data['user'].map(lambda x : dict_data['user'][str(x)])
    data['item']  = data['item'].map(lambda x : dict_data['item'][str(x)])
    
    return data, n_user, n_item

In [3]:
def inverse_mapping(data):   
    with open('/opt/ml/movie-recommendation/data/train/zero_mapping.json', 'r') as f:
        dict_data= json.load(f)

    inv_user_map = {v:int(k) for k,v in dict_data['user'].items()}
    data['user'] = data['user'].map(lambda x : inv_user_map[x])

    inv_item_map = {v:int(k) for k,v in dict_data['item'].items()}
    data['item'] = data['item'].map(lambda x : inv_item_map[x])

    return data

In [4]:
data_dir = '/opt/ml/movie-recommendation/data/train/'

train_df = pd.read_csv(data_dir+'train_ratings.csv')

users = list(set(train_df.loc[:,'user']))
items =  list(set(train_df.loc[:, 'item']))

users_dict = {users[i]: i for i in range(len(users))}
items_dict = {items[i]: i for i in range(len(items))}

with open(data_dir+'zero_mapping.json', 'w') as f:
    json.dump({'user':users_dict, 'item':items_dict}, f)

In [5]:
train_df

Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563
...,...,...,...
5154466,138493,44022,1260209449
5154467,138493,4958,1260209482
5154468,138493,68319,1260209720
5154469,138493,40819,1260209726


In [6]:
data, n_user, n_item = zero_based_mapping(train_df)
data

Unnamed: 0,user,item,time
0,2,2820,1230782529
1,2,125,1230782534
2,2,362,1230782539
3,2,416,1230782542
4,2,1357,1230782563
...,...,...,...
5154466,3325,4908,1260209449
5154467,3325,2994,1260209482
5154468,3325,1740,1260209720
5154469,3325,4432,1260209726


In [None]:
data = inverse_mapping(data)
data

Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563
...,...,...,...
5154466,138493,44022,1260209449
5154467,138493,4958,1260209482
5154468,138493,68319,1260209720
5154469,138493,40819,1260209726


In [75]:
with open(data_dir+'zero_mapping.json', 'r') as f:
    dict_data= json.load(f)

genres_df = pd.read_csv(data_dir+"genres.tsv", sep="\t")
genres_df['item']  = genres_df['item'].map(lambda x : dict_data['item'][str(x)])

array, index = pd.factorize(genres_df["genre"])
genres_df["genre"] = array

In [76]:
genres_list = genres_df.groupby("item")["genre"].apply(list)
genres_list

item
0       [8, 12, 13, 5, 9]
1              [8, 13, 9]
2                  [5, 6]
3                     [1]
4                     [5]
              ...        
6802           [8, 12, 9]
6803            [2, 1, 7]
6804           [1, 11, 4]
6805           [1, 10, 4]
6806            [2, 1, 7]
Name: genre, Length: 6807, dtype: object

In [77]:
data_dir = '/opt/ml/movie-recommendation/data/train/'
rating_df = pd.read_csv(data_dir+'train_ratings.csv')

# Title df 생성
title_data = data_dir + "titles.tsv"
title_df = pd.read_csv(title_data, sep='\t')

# Year df 생성
year_data = data_dir + "years.tsv"
year_df = pd.read_csv(year_data, sep='\t')

# title로부터 year 추출
print('# of items Before preprocessing:', year_df['item'].nunique())

item_ids = set(rating_df.loc[:,'item'])
no_year_items_ids = item_ids - set(year_df.loc[:,'item'])

cond = title_df['item'].isin(no_year_items_ids)
no_year_items = deepcopy(title_df[cond])
no_year_items['year'] = no_year_items['title'].apply(lambda x:int(x[-5:-1])).values
year_df = pd.concat([year_df, no_year_items[['item', 'year']]], axis=0)

print('# of items After preprocessing:', year_df['item'].nunique())

# of items Before preprocessing: 6799
# of items After preprocessing: 6807


In [78]:
year_df['year'] = pd.cut(year_df['year'], list(range(1900, 2021, 10)))
year_dict = {year:i for i, year in enumerate(set(year_df['year']))}
year_df['year']  = year_df['year'].map(lambda x : year_dict[x]).astype('int64') #year id로 변경
year_df['item']  = year_df['item'].map(lambda x : dict_data['item'][str(x)])

In [79]:
year_list = year_df.groupby("item")["year"].apply(list)
year_list

item
0        [3]
1        [3]
2        [3]
3       [11]
4        [3]
        ... 
6802     [0]
6803     [4]
6804     [7]
6805     [3]
6806    [11]
Name: year, Length: 6807, dtype: object

In [80]:
(year_list+genres_list).apply(list).to_json(
    data_dir+'item2attributes.json'
)