## LastFM - Freebase Processing
#### Create Triplets

In [6]:
import os, re
import numpy as np

# NOTE: parsing the raw text files with genfromtxt is slow; loading a cached
# array with np.load() is the faster alternative.
kg_path = 'datasets/www_data/www_data/LastFM/kg/train.dat'
rec_path = 'datasets/www_data/www_data/LastFM/rs/ratings.txt'
kg, rec = (
    np.genfromtxt(path, delimiter='\t', dtype=np.int32)
    for path in (kg_path, rec_path)
)

# Keep only "positive" interactions: rows whose third column (listen count)
# is at least 6. Per the original note, the mean listen count is 11 and a
# threshold of 6 keeps roughly 3/4 of the ratings.
rec = rec[rec[:, 2] >= 6]

# Rebuild the rows as <user, likes, item> triples: the user column, a constant
# relation id of 47 standing for "likes", then the item column.
rec = np.column_stack((
    rec[:, 0],
    np.full(rec.shape[0], 47, dtype=np.int32),
    rec[:, 1],
))

In [5]:
def _split_at_first_tab(line, source):
    """Split one mapping-file line at its first tab.

    Returns ``(head, tail)`` with the trailing newline removed, or ``None``
    for a blank line. ``tail`` is everything after the first tab, so it may
    itself contain further tabs.

    Raises ValueError (naming ``source``) when a non-blank line has no tab or
    has an empty field on either side of it.
    """
    stripped = line.rstrip('\n')
    if not stripped.strip():
        return None
    head, sep, tail = stripped.partition('\t')
    if not sep or not head or not tail:
        raise ValueError('malformed line in {}: {!r}'.format(source, line))
    return head, tail


# Largest id present anywhere in the KG triples; every id minted below is
# offset from it so it cannot collide with an existing one
# (# relations << # entities, so the max is an entity id).
TOTAL_FB_IDS = np.max(kg)

# paths for converting data
# NOTE(review): these point at the Movielens directory although the KG and
# ratings loaded for this notebook come from LastFM -- confirm these are the
# intended mapping files.
item2kg_path = 'datasets/www_data/www_data/Movielens/rs/i2kg_map.tsv'
emap_path = 'datasets/www_data/www_data/Movielens/kg/e_map.dat'

# maps dataset item ids to freebase links (text after the first tab)
ml2fb_map = {}
with open(item2kg_path) as f:
    for line in f:
        fields = _split_at_first_tab(line, item2kg_path)
        if fields is None:
            continue
        ml2fb_map[int(fields[0])] = fields[1]

# maps freebase links to freebase ids (final format); the id is the 0-based
# line number in the entity map, not the value of the file's first column.
fb2id_map = {}
with open(emap_path) as f:
    for kg_id, line in enumerate(f):
        fields = _split_at_first_tab(line, emap_path)
        if fields is None:
            continue
        fb2id_map[fields[1]] = kg_id

# convert item ids (column 2 of rec) to freebase ids, in place
new_ids = 0
for i in range(rec.shape[0]):
    item_id = int(rec[i, 2])
    fb_http = ml2fb_map.get(item_id)
    if fb_http is None:
        # Item has no freebase link: mint a fresh id past the existing KG ids
        # and record it in both maps so repeat occurrences reuse the same id.
        new_ids += 1
        fb_http = '<http:dummy/{}'.format(new_ids)
        ml2fb_map[item_id] = fb_http
        fb2id_map[fb_http] = TOTAL_FB_IDS + new_ids
    # Raises KeyError if a mapped link is missing from the entity map.
    rec[i, 2] = fb2id_map[fb_http]

# checkpoint: 'likes' and the item column are now in freebase ids
# step #2: convert user ids into freebase ids
NEW_MOVIE_IDS = new_ids
umap_path = 'datasets/www_data/www_data/Movielens/rs/u_map.dat'

# Distinct user ids present in the ratings, collected once so that each
# membership test below is O(1) rather than a scan of the whole column.
rec_user_ids = set(np.unique(rec[:, 0]).tolist())

# maps dataset user ids to new ids placed after the KG ids and the item ids
# minted above; only users that actually occur in rec receive an id.
userid2fbid_map = {}
new_ids = 0
with open(umap_path) as f:
    for line in f:
        fields = _split_at_first_tab(line, umap_path)
        if fields is None:
            continue
        user_id = int(fields[1])
        if user_id in rec_user_ids:
            new_ids += 1
            userid2fbid_map[user_id] = TOTAL_FB_IDS + NEW_MOVIE_IDS + new_ids

# convert user ids (column 0 of rec) to their new ids, in place;
# raises KeyError for a user that is absent from the user map.
for i in range(rec.shape[0]):
    rec[i, 0] = userid2fbid_map[int(rec[i, 0])]


(4464456, 4)
(3475807, 3)
