## Movielens - Freebase Processing
#### Create Triplets

In [1]:
import os, re
import numpy as np

# Parsing the raw text files with genfromtxt is slow; prefer np.load() on a
# saved .npy copy of these arrays when one is available.
kg_path = 'datasets/www_data/www_data/Movielens/kg/train.dat'
rec_path = 'datasets/www_data/www_data/Movielens/rs/ratings.txt'
kg = np.genfromtxt(kg_path, delimiter='\t', dtype=np.int32)
ratings = np.genfromtxt(rec_path, delimiter='\t', dtype=np.int32)

# Binary ratings: a rating of 4 or more counts as positive; only positive
# ratings are kept.
liked = ratings[ratings[:, 2] >= 4]

# Pick <user, rating, item>; this also drops the time column.
rec = liked[:, [0, 2, 1]]
# The rating column is redundant now, so reuse it for relationship 47 ("likes").
rec[:, 1] = 47

# checkpoint: rec is in <user, likes, item> format, but user and item are
# still MovieLens ids
# step 1: convert item ids first

In [2]:
# Largest value found in the kg triples (# rel << # entities, so this is
# driven by the entity ids). Every id created below is allocated above it.
TOTAL_FB_IDS = np.max(kg)

# paths for converting data
item2kg_path = 'datasets/www_data/www_data/Movielens/rs/i2kg_map.tsv'
emap_path = 'datasets/www_data/www_data/Movielens/kg/e_map.dat'


def _split_first_tab(line):
    """Split a line into (text before the first tab, text after it).

    The trailing newline is optional, so the last line of a file that does not
    end with a newline is parsed like every other line.

    Raises:
        ValueError: if the line contains no tab.
    """
    head, sep, tail = line.rstrip('\n').partition('\t')
    if not sep:
        raise ValueError('expected a tab-separated line, got {!r}'.format(line))
    return head, tail


# maps MovieLens item ids to Freebase http links
ml2fb_map = {}
with open(item2kg_path) as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines
        ml_id, fb_http = _split_first_tab(line)
        ml2fb_map[int(ml_id)] = fb_http

# maps Freebase http links to Freebase ids (final format)
# NOTE(review): the id is the 0-based line number, not the first column of the
# file -- presumably the two agree; confirm against e_map.dat.
fb2id_map = {}
with open(emap_path) as f:
    for kg_id, line in enumerate(f):
        if not line.strip():
            continue  # blank lines still consume a line number
        _, fb_http = _split_first_tab(line)
        fb2id_map[fb_http] = kg_id

# convert MovieLens item ids (column 2 of rec) to Freebase ids
new_ids = 0
for i in range(rec.shape[0]):
    ml_id = int(rec[i, 2])
    if ml_id not in ml2fb_map:
        # Item has no Freebase link: create a new id above the existing ones
        # and register it in both maps so repeated items reuse the same id.
        new_ids += 1
        dummy_http = '<http:dummy/{}'.format(new_ids)
        ml2fb_map[ml_id] = dummy_http
        fb2id_map[dummy_http] = TOTAL_FB_IDS + new_ids
    # Raises KeyError if a link from i2kg_map.tsv is missing from e_map.dat.
    rec[i, 2] = fb2id_map[ml2fb_map[ml_id]]

# checkpoint: now 'likes' and movies are in Freebase ids
# step #2: convert user ids into Freebase ids
NEW_MOVIE_IDS = new_ids

umap_path = 'datasets/www_data/www_data/Movielens/rs/u_map.dat'

# Users that have at least one row in rec. A set makes the membership test
# below O(1) instead of scanning the whole user column for every line.
rated_users = set(rec[:, 0].tolist())

# maps MovieLens user ids to new ids allocated after the new movie ids
userid2fbid_map = {}
new_ids = 0
with open(umap_path) as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines
        _, ml_user = _split_first_tab(line)
        ml_user = int(ml_user)
        if ml_user in rated_users:
            new_ids += 1
            userid2fbid_map[ml_user] = TOTAL_FB_IDS + NEW_MOVIE_IDS + new_ids

# convert MovieLens user ids (column 0 of rec) into the new ids
for i in range(rec.shape[0]):
    rec[i, 0] = userid2fbid_map[int(rec[i, 0])]

NEW_USER_IDS = new_ids

#### Make Dictionary for Triplet Lookup

In [3]:
import csv

# create dictionary which maps ids to readable text
fbid2word_map = {}

# add users: user ids start right after the new movie ids, labels are 0-based
for i in range(NEW_USER_IDS):
    fbid2word_map[TOTAL_FB_IDS + NEW_MOVIE_IDS + i + 1] = 'User {}'.format(i)

item_path = 'datasets/www_data/www_data/Movielens/rs/i_map.dat'
movie_path = 'datasets/www_data/www_data/Movielens/rs/movies.csv'

# converts short MovieLens ids (0-based line number) to MovieLens ids
shortml2ml_map = {}
with open(item_path) as f:
    for i, line in enumerate(f):
        if not line.strip():
            continue  # tolerate blank lines
        # Everything after the first tab is the MovieLens id; the trailing
        # newline is optional so a final line without one is still parsed.
        _, _, ml_id = line.rstrip('\n').partition('\t')
        shortml2ml_map[i] = int(ml_id)
movie_count = len(shortml2ml_map)

# converts short MovieLens ids to movie titles
# NOTE(review): assumes row k of movies.csv (after the header) describes the
# item on line k of i_map.dat -- confirm against the data.
shortml2movie_map = {}
with open(movie_path, newline='') as f:
    rows = csv.reader(f)
    next(rows, None)  # skip the header line
    for i, row in enumerate(rows):
        if len(row) < 2:
            continue  # blank or malformed row
        # csv handles quoted titles that contain commas, which a plain
        # ',(.+?),' regex would cut off at the first comma.
        shortml2movie_map[i] = row[1]

# add movies to dictionary
miss_list = []
for i, ml_id in shortml2ml_map.items():
    if ml_id not in ml2fb_map:
        # items not in dict because no ratings at 4 or above
        miss_list.append(ml_id)
        # Skip the item: without this the previous item's link would be reused
        # and its title overwritten with this item's title.
        continue
    fb_id = fb2id_map[ml2fb_map[ml_id]]
    fbid2word_map[fb_id] = shortml2movie_map[i]
print('missed {} items'.format(len(miss_list)))


# relationship map
link_map = {}
link_path = 'datasets/www_data/www_data/Movielens/kg/r_map.dat'

# make dict of relationship vals: line number -> readable relation name
with open(link_path) as f:
    for i, line in enumerate(f):
        link = re.search('/film.film.(.+?)>', line)
        link_map[i] = link.group(1).replace('_', ' ').capitalize()
link_map[47] = 'Likes' # add likes relationship

missed 25 items


#### Create Dictionary of User Likes Items

In [4]:
# Map each user id (column 0 of rec) to the list of distinct item ids
# (column 2) that user likes, in order of first appearance.
user_likes_map = {}
for user, item in zip(rec[:, 0], rec[:, 2]):
    liked = user_likes_map.setdefault(user, [])
    if item not in liked:
        liked.append(item)


In [5]:
import os
import pickle

# Make sure the output directory exists; np.save/open do not create it.
os.makedirs('datasets/ML_FB', exist_ok=True)

np.save('datasets/ML_FB/kg.npy', kg, allow_pickle=True)
np.save('datasets/ML_FB/rec.npy', rec, allow_pickle=True)

# 70/30 train/test split over a random permutation of the rows.
# A dedicated, seeded generator makes the split reproducible, and indexing
# with the permutation leaves `rec` itself untouched so the cell can be
# re-run safely.
SPLIT_SEED = 0
rng = np.random.RandomState(SPLIT_SEED)
order = rng.permutation(rec.shape[0])
split = int(0.7*rec.shape[0])
rec_train = rec[order[:split]]
rec_test = rec[order[split:]]

np.save('datasets/ML_FB/rec_train.npy', rec_train, allow_pickle=True)
np.save('datasets/ML_FB/rec_test.npy', rec_test, allow_pickle=True)
with open('datasets/ML_FB/item_map.pkl', 'wb') as f:
    pickle.dump(fbid2word_map, f)
with open('datasets/ML_FB/rel_map.pkl', 'wb') as f:
    pickle.dump(link_map, f)
with open('datasets/ML_FB/user_likes_map.pkl', 'wb') as f:
    pickle.dump(user_likes_map, f)


In [3]:
import time
import numpy as np

# Reload the saved triplets and stack the recommendation rows on top of the
# knowledge-graph rows.
rec = np.load('datasets/ML_FB/rec.npy')
kg = np.load('datasets/ML_FB/kg.npy')
fin = np.concatenate([rec, kg])

# Time how long an in-place row shuffle of the combined array takes (seconds).
start = time.time()
np.random.shuffle(fin)
elapsed = time.time() - start
print(elapsed)


4.647126197814941


### Old Scripts

In [6]:
# move numpy arrays to .txt files that can be read into Simple process
# we need train, test, valid
# ratio: [70, 20, 10]

#import numpy as np

#kg = np.load('data/kg.npy')
#rec = np.load('data/rec.npy')

#data = np.concatenate((kg,rec), axis=0)
##np.random.shuffle(data)

#train_end = int(0.7 * data.shape[0])
#test_end = int(0.2 * data.shape[0]) + train_end

#f = open('datasets/ML_KG/train.txt', 'w+')
#for i in range(train_end):
    #f.write('{}\t{}\t{}\n'.format(data[i,0], data[i,1], data[i,2]))
#f.close()

#f = open('datasets/ML_KG/test.txt', 'w+')
#for i in range(train_end, test_end):
    #f.write('{}\t{}\t{}\n'.format(data[i,0], data[i,1], data[i,2]))
#f.close()

#trig = 0
#f = open('datasets/ML_KG/valid.txt', 'w+')
#for i in range(test_end, data.shape[0]):
    #f.write('{}\t{}\t{}\n'.format(data[i,0], data[i,1], data[i,2]))
#f.close()

FileNotFoundError: [Errno 2] No such file or directory: 'datasets/ML_KG/train.txt'