In [76]:
import pickle
import numpy as np
import pandas as pd
import json
from copy import deepcopy

In [2]:
def zero_based_mapping(data, mapping_path='/opt/ml/movie-recommendation/data/train/zero_mapping.json'):
    """Replace raw user/item ids in ``data`` with their zero-based indices.

    Args:
        data: DataFrame with ``user`` and ``item`` columns holding raw ids.
            The two columns are overwritten in place on the passed object.
        mapping_path: JSON file with top-level ``user`` and ``item`` objects
            mapping raw id (as a string key) to zero-based index. Defaults to
            the location this notebook writes the mapping to.

    Returns:
        Tuple ``(data, n_user, n_item)``: the same DataFrame object that was
        passed in, plus the number of entries in the user and item mappings.

    Raises:
        KeyError: if an id in ``data`` has no entry in the mapping file.
    """
    with open(mapping_path, 'r') as f:
        dict_data = json.load(f)

    user_map = dict_data['user']
    item_map = dict_data['item']

    # JSON object keys are always strings, so ids are stringified before lookup.
    data['user'] = data['user'].map(lambda x: user_map[str(x)])
    data['item'] = data['item'].map(lambda x: item_map[str(x)])

    return data, len(user_map), len(item_map)

In [3]:
def inverse_mapping(data, mapping_path='/opt/ml/movie-recommendation/data/train/zero_mapping.json'):
    """Convert zero-based user/item indices in ``data`` back to raw ids.

    Args:
        data: DataFrame with ``user`` and ``item`` columns holding zero-based
            indices. The two columns are overwritten in place on the passed
            object.
        mapping_path: JSON file with top-level ``user`` and ``item`` objects
            mapping raw id (as a string key) to zero-based index. Defaults to
            the location this notebook writes the mapping to.

    Returns:
        The same DataFrame object that was passed in, now holding raw ids
        (as ints) in ``user`` and ``item``.

    Raises:
        KeyError: if an index in ``data`` has no entry in the mapping file.
    """
    with open(mapping_path, 'r') as f:
        dict_data = json.load(f)

    # Invert raw-id -> index into index -> raw-id; JSON keys are strings, so
    # they are converted back to int here.
    inv_user_map = {v: int(k) for k, v in dict_data['user'].items()}
    inv_item_map = {v: int(k) for k, v in dict_data['item'].items()}

    data['user'] = data['user'].map(lambda x: inv_user_map[x])
    data['item'] = data['item'].map(lambda x: inv_item_map[x])

    return data

In [4]:
# Build the raw-id -> zero-based-index lookup tables and persist them as JSON.
data_dir = '/opt/ml/movie-recommendation/data/train/'

train_df = pd.read_csv(data_dir + 'train_ratings.csv')

# Unique raw ids. Indices below follow set iteration order, not sorted order.
users = list(set(train_df['user']))
items = list(set(train_df['item']))

users_dict = {raw_id: idx for idx, raw_id in enumerate(users)}
items_dict = {raw_id: idx for idx, raw_id in enumerate(items)}

# json.dump writes the integer keys as strings, so readers must look up str(id).
zero_mapping = {'user': users_dict, 'item': items_dict}
with open(data_dir + 'zero_mapping.json', 'w') as f:
    json.dump(zero_mapping, f)

In [5]:
# Preview the ratings frame as loaded from train_ratings.csv (raw user/item ids).
train_df

Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563
...,...,...,...
5154466,138493,44022,1260209449
5154467,138493,4958,1260209482
5154468,138493,68319,1260209720
5154469,138493,40819,1260209726


In [6]:
# Replace raw user/item ids with their zero-based indices.
# NOTE: zero_based_mapping assigns into the frame it receives and returns that
# same object, so `data` and `train_df` refer to the same re-indexed DataFrame.
data, n_user, n_item = zero_based_mapping(train_df)
data

Unnamed: 0,user,item,time
0,2,2820,1230782529
1,2,125,1230782534
2,2,362,1230782539
3,2,416,1230782542
4,2,1357,1230782563
...,...,...,...
5154466,3325,4908,1260209449
5154467,3325,2994,1260209482
5154468,3325,1740,1260209720
5154469,3325,4432,1260209726


In [7]:
# Map the zero-based indices back to raw ids; the displayed frame can be
# compared against the raw preview of train_ratings.csv.
data = inverse_mapping(data)
data

Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563
...,...,...,...
5154466,138493,44022,1260209449
5154467,138493,4958,1260209482
5154468,138493,68319,1260209720
5154469,138493,40819,1260209726


In [8]:
# Reload the raw-id -> zero-based-index tables from zero_mapping.json.
with open(data_dir + 'zero_mapping.json', 'r') as f:
    dict_data = json.load(f)

# Re-key genre rows by zero-based item index (JSON keys are strings, hence str()).
genres_df = pd.read_csv(data_dir + "genres.tsv", sep="\t")
genres_df['item'] = genres_df['item'].map(
    lambda raw_id: dict_data['item'][str(raw_id)]
)

# Encode each genre value as an integer code; `index` holds the unique values,
# positioned by code.
genre_codes = pd.factorize(genres_df["genre"])
array, index = genre_codes
genres_df["genre"] = array

In [9]:
# Collect, per zero-based item index, the list of genre codes.
genres_by_item = genres_df.groupby("item")["genre"]
genres_list = genres_by_item.apply(list)
genres_list

item
0       [8, 12, 13, 5, 9]
1              [8, 13, 9]
2                  [5, 6]
3                     [1]
4                     [5]
              ...        
6802           [8, 12, 9]
6803            [2, 1, 7]
6804           [1, 11, 4]
6805           [1, 10, 4]
6806            [2, 1, 7]
Name: genre, Length: 6807, dtype: object

In [10]:
data_dir = '/opt/ml/movie-recommendation/data/train/'
rating_df = pd.read_csv(data_dir + 'train_ratings.csv')

# Create the title df.
title_data = data_dir + "titles.tsv"
title_df = pd.read_csv(title_data, sep='\t')

# Create the year df.
year_data = data_dir + "years.tsv"
year_df = pd.read_csv(year_data, sep='\t')

# Extract the year from the title for rated items that have no row in years.tsv.
print('# of items Before preprocessing:', year_df['item'].nunique())

item_ids = set(rating_df['item'])
no_year_items_ids = item_ids - set(year_df['item'])

cond = title_df['item'].isin(no_year_items_ids)
no_year_items = title_df[cond].copy()
# Takes the four characters before the final character of the title.
# NOTE(review): presumably titles end in "(YYYY)" - confirm; any other ending
# makes int() raise or yields a wrong year.
no_year_items['year'] = (
    no_year_items['title'].apply(lambda title: int(title[-5:-1])).to_numpy()
)
year_df = pd.concat([year_df, no_year_items[['item', 'year']]], axis=0)

print('# of items After preprocessing:', year_df['item'].nunique())

# of items Before preprocessing: 6799
# of items After preprocessing: 6807


In [11]:
# Bucket years into 10-year intervals with edges 1900, 1910, ..., 2020.
# NOTE: this cell overwrites year_df['year'] and year_df['item'], so it is
# meant to run once per freshly built year_df.
decade_edges = list(range(1900, 2021, 10))
year_df['year'] = pd.cut(year_df['year'], decade_edges)

# Assign each distinct bucket an integer id (ids follow set iteration order).
year_dict = {}
for bucket_id, bucket in enumerate(set(year_df['year'])):
    year_dict[bucket] = bucket_id

# Convert buckets to year ids, then re-key rows by zero-based item index.
year_df['year'] = year_df['year'].map(lambda bucket: year_dict[bucket]).astype('int64')
year_df['item'] = year_df['item'].map(lambda raw_id: dict_data['item'][str(raw_id)])

In [12]:
# Collect, per zero-based item index, the list of year-bucket ids.
years_by_item = year_df.groupby("item")["year"]
year_list = years_by_item.apply(list)
year_list

item
0        [4]
1        [4]
2        [4]
3        [1]
4        [4]
        ... 
6802     [3]
6803     [7]
6804    [10]
6805     [4]
6806     [1]
Name: year, Length: 6807, dtype: object

In [13]:
# Concatenate each item's year-id list and genre-code list (aligned on the
# item index) and write the result as JSON.
# NOTE(review): year ids and genre codes are both numbered from 0, so the same
# integer can stand for a year bucket and a genre in the combined list -
# confirm the consumer of item2attributes.json expects that.
item_attributes = year_list + genres_list
item_attributes = item_attributes.apply(list)
item_attributes.to_json(data_dir + 'item2attributes.json')

In [66]:
m2v_dir = '/opt/ml/movie-recommendation/data/train/m2v/'

# Load the key -> embedding-row-index table.
# pickle.load can execute arbitrary code from the file; only load trusted files.
m2v_index_path = m2v_dir + 'm2v_item_index.pkl'
with open(m2v_index_path, 'rb') as f:
    m2v_item_index = pickle.load(f)

m2v_item_index

{'i8973': 0,
 'i1097': 1,
 'i3363': 2,
 'i260': 3,
 'i3504': 4,
 'i82463': 5,
 'i5377': 6,
 'i97921': 7,
 'i1080': 8,
 'i1240': 9,
 'i4235': 10,
 'i2628': 11,
 'i924': 12,
 'i926': 13,
 'i3468': 14,
 'i1099': 15,
 'i73681': 16,
 'i3114': 17,
 'i8949': 18,
 'i7361': 19,
 'i5459': 20,
 'i3681': 21,
 'i2137': 22,
 'i3300': 23,
 'i44694': 24,
 'i5015': 25,
 'i1206': 26,
 'i2321': 27,
 'i2003': 28,
 'i527': 29,
 'i7439': 30,
 'i4771': 31,
 'i80969': 32,
 'i1230': 33,
 'i1223': 34,
 'i2788': 35,
 'i86644': 36,
 'i91658': 37,
 'i1227': 38,
 'i2729': 39,
 'i1748': 40,
 'i485': 41,
 'i474': 42,
 'i6536': 43,
 'i1196': 44,
 'i50872': 45,
 'i1197': 46,
 'i5902': 47,
 'i32587': 48,
 'i62': 49,
 'i3269': 50,
 'i7247': 51,
 'i2580': 52,
 'i48780': 53,
 'i44191': 54,
 'i33437': 55,
 'i912': 56,
 'i2762': 57,
 'i1204': 58,
 'i73017': 59,
 'i3054': 60,
 'i1682': 61,
 'i6754': 62,
 'i39292': 63,
 'i232': 64,
 'i4857': 65,
 'i2671': 66,
 'i61697': 67,
 'i40819': 68,
 'i2764': 69,
 'i8810': 70,
 'i2232': 

In [67]:
zm_dir = '/opt/ml/movie-recommendation/data/train/'

# Reload the zero-based mapping and keep only the item table
# (raw item id as string -> zero-based index).
zero_mapping_path = zm_dir + 'zero_mapping.json'
with open(zero_mapping_path, 'r') as f:
    dict_data = json.load(f)

zm_dict = dict_data['item']
zm_dict

{'1': 0,
 '2': 1,
 '3': 2,
 '32770': 3,
 '5': 4,
 '6': 5,
 '7': 6,
 '4': 7,
 '8': 8,
 '10': 9,
 '11': 10,
 '12': 11,
 '13': 12,
 '14': 13,
 '15': 14,
 '16': 15,
 '17': 16,
 '18': 17,
 '19': 18,
 '20': 19,
 '21': 20,
 '22': 21,
 '23': 22,
 '24': 23,
 '25': 24,
 '26': 25,
 '27': 26,
 '28': 27,
 '29': 28,
 '32792': 29,
 '31': 30,
 '32': 31,
 '30': 32,
 '34': 33,
 '36': 34,
 '39': 35,
 '41': 36,
 '42': 37,
 '65577': 38,
 '44': 39,
 '45': 40,
 '46': 41,
 '47': 42,
 '48': 43,
 '43': 44,
 '50': 45,
 '9': 46,
 '52': 47,
 '65588': 48,
 '65585': 49,
 '98361': 50,
 '58': 51,
 '57': 52,
 '60': 53,
 '65596': 54,
 '62': 55,
 '63': 56,
 '61': 57,
 '65': 58,
 '65601': 59,
 '69': 60,
 '70': 61,
 '72': 62,
 '73': 63,
 '74': 64,
 '32840': 65,
 '65612': 66,
 '76': 67,
 '78': 68,
 '79': 69,
 '80': 70,
 '81': 71,
 '82': 72,
 '32853': 73,
 '85': 74,
 '86': 75,
 '88': 76,
 '89': 77,
 '92': 78,
 '94': 79,
 '95': 80,
 '97': 81,
 '100': 82,
 '101': 83,
 '104': 84,
 '105': 85,
 '65642': 86,
 '107': 87,
 '110': 88

In [70]:
# Map each np array index to the item's zero-based mapping index.
# Keys of m2v_item_index look like 'i8973'; the first character is dropped
# before looking the raw id up in zm_dict.
items_dict = {
    np_row: zm_dict[m2v_key[1:]]
    for m2v_key, np_row in m2v_item_index.items()
}
items_dict

{0: 4763,
 1: 704,
 2: 2095,
 3: 180,
 4: 2169,
 5: 5359,
 6: 3202,
 7: 6770,
 8: 688,
 9: 798,
 10: 2597,
 11: 1656,
 12: 580,
 13: 582,
 14: 2148,
 15: 705,
 16: 4456,
 17: 1960,
 18: 4743,
 19: 4194,
 20: 3245,
 21: 2266,
 22: 1354,
 23: 2067,
 24: 4952,
 25: 3041,
 26: 765,
 27: 1463,
 28: 1251,
 29: 360,
 30: 4222,
 31: 2893,
 32: 5223,
 33: 789,
 34: 782,
 35: 1744,
 36: 5619,
 37: 6051,
 38: 786,
 39: 1714,
 40: 1097,
 41: 332,
 42: 324,
 43: 3725,
 44: 756,
 45: 5430,
 46: 757,
 47: 3444,
 48: 6792,
 49: 55,
 50: 2053,
 51: 4144,
 52: 1631,
 53: 5273,
 54: 4914,
 55: 440,
 56: 569,
 57: 1730,
 58: 764,
 59: 4242,
 60: 1915,
 61: 1065,
 62: 3843,
 63: 3720,
 64: 165,
 65: 2943,
 66: 1674,
 67: 6482,
 68: 4432,
 69: 1732,
 70: 4685,
 71: 1412,
 72: 907,
 73: 234,
 74: 197,
 75: 2600,
 76: 3950,
 77: 4737,
 78: 901,
 79: 1866,
 80: 4611,
 81: 4000,
 82: 2015,
 83: 3501,
 84: 5209,
 85: 3497,
 86: 3911,
 87: 6225,
 88: 4578,
 89: 531,
 90: 3902,
 91: 1077,
 92: 1877,
 93: 4744,
 94

In [71]:
# Persist the np-array-index -> zero-based-item-index table.
item2index_path = m2v_dir + 'm2v_item2index.pkl'
with open(item2index_path, 'wb') as f:
    pickle.dump(items_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

In [79]:
# Load the np-index -> zero-based-item-index table and the embedding matrix.
# pickle.load can execute arbitrary code from the file; only load trusted files.
item2index_path = m2v_dir + 'm2v_item2index.pkl'
with open(item2index_path, 'rb') as f:
    m2v_item2index = pickle.load(f)

item_emb_path = m2v_dir + 'm2v_item_emb.pkl'
with open(item_emb_path, 'rb') as f:
    m2v_item_emb = pickle.load(f)

# Reorder embedding rows so that row i belongs to zero-based item index i.
# Rows whose index never appears as a value in m2v_item2index stay all-zero.
# NOTE(review): assumes every zero-based item index is smaller than the number
# of embedding rows - otherwise the assignment raises IndexError; confirm.
attributes = np.zeros(m2v_item_emb.shape)
for np_index, item_id in m2v_item2index.items():
    attributes[item_id] = m2v_item_emb[np_index]
attributes

array([[-0.08177002, -0.04936264, -0.03782342, ..., -0.1003268 ,
         0.00399167,  0.0186739 ],
       [ 0.04755069, -0.05834424, -0.05187849, ...,  0.03162548,
         0.00377283,  0.03101734],
       [-0.00716188, -0.1516542 ,  0.01572307, ...,  0.06037275,
         0.02482797, -0.21307305],
       ...,
       [ 0.02011959, -0.07221279,  0.07674994, ..., -0.02073245,
        -0.01061739,  0.08828346],
       [ 0.02418659, -0.02740957, -0.00613704, ...,  0.10385066,
         0.20321593, -0.01071892],
       [ 0.13241501,  0.15095542,  0.03133308, ...,  0.01400474,
         0.09364528, -0.04264626]])