In [2]:
import os
import re
import numpy as np

# Parsing the raw tab-separated text with genfromtxt is slow, so the parsed
# arrays are cached as .npy files at the end of this cell (use np.load() on
# those instead of re-parsing).
kg_path = 'datasets/www_data/www_data/Movielens/kg/train.dat'
rec_path = 'datasets/www_data/www_data/Movielens/rs/ratings.txt'
kg = np.genfromtxt(kg_path, delimiter='\t', dtype=np.uint32)
rec = np.genfromtxt(rec_path, delimiter='\t', dtype=np.uint32)

# Drop the time column and keep only the positive ratings (rating >= 4).
rec = rec[:, :3][rec[:, 2] >= 4]
# Every remaining rating is positive, so the rating column carries no more
# information: reuse it to hold relationship id 47.
rec[:, 2] = 47
# Reorder the columns into <user, likes, item> triples.
rec = rec[:, [0, 2, 1]]

# Checkpoint: the user and item columns still hold movielens ids.
# Step 1 (next cell): convert the item ids first.

# Cache both arrays as .npy files for fast loading.
os.makedirs('data', exist_ok=True)
np.save('data/kg.npy', kg, allow_pickle=True)
np.save('data/rec.npy', rec, allow_pickle=True)


In [3]:
import re
import numpy as np

TOTAL_FB_IDS = 52536 # total number of default kg pairs

# NOTE: a later cell overwrites data/rec.npy with the converted ids. Re-run
# the first cell (which regenerates the file with movielens ids) before
# re-running this one, otherwise already-converted ids are converted again.
rec = np.load('data/rec.npy', allow_pickle=True)

# paths for converting data
item2kg_path = 'datasets/www_data/www_data/Movielens/rs/i2kg_map.tsv'
emap_path = 'datasets/www_data/www_data/Movielens/kg/e_map.dat'


def _split_on_first_tab(line):
    """Return (text before the first tab, text after it) for one map-file line.

    The trailing newline is optional, so the last line of a file that does
    not end with a newline is parsed like every other line.

    Raises:
        ValueError: if the line contains no tab.
    """
    key, tab, value = line.rstrip('\n').partition('\t')
    if not tab:
        raise ValueError('expected a tab-separated line, got {!r}'.format(line))
    return key, value


# maps movielens item ids to freebase links
ml2fb_map = {}
with open(item2kg_path) as f:
    for line in f:
        ml_id, fb_http = _split_on_first_tab(line)
        ml2fb_map[int(ml_id)] = fb_http

# maps freebase links to freebase ids (final format); the id of a link is
# the zero-based index of its line in the entity map file
fb2id_map = {}
with open(emap_path) as f:
    for kg_id, line in enumerate(f):
        fb_http = _split_on_first_tab(line)[1]
        fb2id_map[fb_http] = kg_id

# convert movielens item ids to freebase ids (in place, column 2 of rec)
new_ids = 0
for i in range(rec.shape[0]):
    # plain int so that every key in ml2fb_map has the same type
    ml_id = int(rec[i, 2])
    if ml_id in ml2fb_map:
        # get correct freebase id from data
        fb_http = ml2fb_map[ml_id]
        fb_id = fb2id_map[fb_http]
    else:
        # create new freebase id, numbered after the existing ones
        new_ids += 1
        fb_id = TOTAL_FB_IDS + new_ids

        # register the new item in both maps so repeat occurrences of the
        # same movielens id take the branch above and reuse this id
        ml2fb_map[ml_id] = '<http:dummy/{}'.format(new_ids)
        fb2id_map['<http:dummy/{}'.format(new_ids)] = fb_id
    rec[i, 2] = fb_id

# checkpoint: now 'likes' and movies are in freebase ids
# step #2: convert user ids into freebase ids
NEW_MOVIE_IDS = new_ids
umap_path = 'datasets/www_data/www_data/Movielens/rs/u_map.dat'

# maps movielens user ids to freebase ids; users are numbered by line order,
# starting right after the ids created above for unmapped movies
userid2fbid_map = {}
new_ids = 0
with open(umap_path) as f:
    for line in f:
        new_ids += 1
        ml_id = int(_split_on_first_tab(line)[1])
        userid2fbid_map[ml_id] = TOTAL_FB_IDS + NEW_MOVIE_IDS + new_ids

# convert movielens user ids into freebase ids (in place, column 0 of rec)
for i in range(rec.shape[0]):
    rec[i, 0] = userid2fbid_map[int(rec[i, 0])]

NEW_USER_IDS = new_ids
print(rec[:5])


[[52811    47 12262]
 [52811    47  2918]
 [52811    47 10554]
 [52811    47 12030]
 [52811    47  8640]]


In [4]:
import csv

# maps freebase ids (users and movies) to a human-readable label
fbid2word_map = {}

# add users: user ids start right after the ids created for unmapped movies
for i in range(NEW_USER_IDS):
    fbid2word_map[TOTAL_FB_IDS + NEW_MOVIE_IDS + i + 1] = 'User {}'.format(i)


item_path = 'datasets/www_data/www_data/Movielens/rs/i_map.dat'
movie_path = 'datasets/www_data/www_data/Movielens/rs/movies.csv'

# TODO: check this code... could be problem in dataset...
# NOTE(review): titles are joined to items purely by position (line i of
# i_map.dat <-> data row i of movies.csv) -- confirm the two files really
# share the same ordering.
# maps the line index in i_map.dat to the id stored after the first tab
movie_count = 0
shortml2ml_map = {}
with open(item_path) as f:
    for i, line in enumerate(f):
        # split instead of a '\t(.+?)\n' regex so a final line without a
        # trailing newline is still parsed
        shortml2ml_map[i] = int(line.rstrip('\n').split('\t', 1)[1])
        movie_count += 1

# maps the data-row index in movies.csv to the movie title (second column)
shortml2movie_map = {}
with open(movie_path, newline='') as f:
    # csv.reader keeps quoted titles that contain commas intact, e.g.
    # "American President, The (1995)"
    for i, row in enumerate(csv.reader(f)):
        if i == 0: continue # skip header row
        shortml2movie_map[i-1] = row[1]

# add movies to dictionary
miss_list = []
for i in range(movie_count):
    ml_id = shortml2ml_map[i]
    if ml_id not in ml2fb_map:
        # items not in dict because no ratings at 4 or above
        miss_list.append(ml_id)
        # skip the item: without this, the freebase link left over from the
        # previous iteration would be relabelled with this movie's title
        continue
    fb_http = ml2fb_map[ml_id]
    fb_id = fb2id_map[fb_http]
    movie = shortml2movie_map[i]

    fbid2word_map[fb_id] = movie
print('missed {} items'.format(len(miss_list)))


missed 25 items


In [7]:
import pickle

# Persist the results for fast loading.
# NOTE: this replaces the data/rec.npy written by the first cell (movielens
# ids) with the converted array.
np.save(file='data/rec.npy', arr=rec, allow_pickle=True)
with open('data/dict.pkl', mode='wb') as f:
    pickle.dump(obj=fbid2word_map, file=f)
