In [2]:
import numpy as np
import tensorflow as tf
import time
import os
import math
import pandas as pd
from tqdm.notebook import tqdm
import scipy.sparse as sp
from tffm import TFFMClassifier
from tffm import TFFMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error 
from sklearn.metrics import roc_auc_score, accuracy_score

# Number of users per group; selects the `group_info_<group_size>` column and the
# `<group_size>_...` feature files loaded below.
group_size = 2
# Root data directory, relative to the notebook's working directory.
dir_ = '../data/'
# Sub-folder of dir_ that holds the matrices and pickles used in this notebook.
folder = 'sp_matrix_tag_pop'
# Sub-path (under folder) of the group-assignment files.
group_info = 'group_info/random'
# Base file name shared by the dataset artefacts; prefixes/suffixes are added at each load site.
file_name = 'normalized_to_rating_filter_track_5_user_50.pkl'

In [3]:
# NOTE(review): pd.read_pickle executes arbitrary code from the file; only load trusted pickles.
group_dir = os.path.join(dir_, folder, group_info)

# Interaction table that also carries the group_info_* columns.
df = pd.read_pickle(os.path.join(group_dir, file_name))

# Training split, plus a narrowed view with only the columns used for candidate generation.
train = pd.read_pickle(os.path.join(dir_, folder, 'train_' + file_name))
train_ = train.loc[:, ['uid', 'tid', 'rating', 'count', 'pop']]

In [4]:
# Feature matrices live under the group-specific directory and are prefixed with the
# group size; the label vectors live directly under `folder` as CSV files.
base_dir = os.path.join(dir_, folder)
group_dir = os.path.join(base_dir, group_info)
size_prefix = str(group_size)
labels_suffix = file_name[:-3] + 'csv'

train_x = sp.load_npz(os.path.join(group_dir, size_prefix + '_train_x_' + file_name + '.npz'))
train_y = np.loadtxt(os.path.join(base_dir, 'train_y_' + labels_suffix), delimiter=',')
test_x_sp = sp.load_npz(os.path.join(group_dir, size_prefix + '_test_x_' + file_name + '.npz'))
test_y_sp = np.loadtxt(os.path.join(base_dir, 'test_y_' + labels_suffix), delimiter=',')

In [5]:
# Display the feature-matrix shapes next to the label-vector lengths for the train and test splits.
train_x.shape, len(train_y), test_x_sp.shape, len(test_y_sp)

((362677, 39484), 362677, (90670, 39484), 90670)

In [6]:
# Column layout of a feature vector, as implied by the offsets below:
#   [ users | tracks | tags | pop | groups ]
unique_users = df['uid'].unique().size
unique_tracks = df['tid'].unique().size
l = train_x.shape[1]

# Width of the tag segment; hard-coded here, presumably to match how the stored
# matrices were built — TODO confirm against the preprocessing code.
n_tag_columns = 19

tid_start = unique_users
tag_start = tid_start + unique_tracks
pop_start = tag_start + n_tag_columns
group_start = pop_start + 1

for label, value in (
    ('Tid start position : ', tid_start),
    ('Tag start position : ', tag_start),
    ('Length of vector : ', l),
):
    print(label + str(value))
print('----------------------------------------')
for label, value in (
    ('Number of unique users : ', unique_users),
    ('Number of unique tracks : ', unique_tracks),
):
    print(label + str(value))

Tid start position : 218
Tag start position : 39355
Length of vector : 39484
----------------------------------------
Number of unique users : 218
Number of unique tracks : 39137


In [55]:
# One row per track id in [0, unique_tracks), each with a zero play count.
track_ids = list(range(unique_tracks))
all_tracks = pd.DataFrame({'tid': track_ids, 'count': 0})

In [56]:
# Build the candidate list: for every user, all [uid, tid] pairs whose summed
# training count is zero (i.e. tracks with no counted plays by that user in train_).
x_test = []
zero_counts = all_tracks.set_index('tid')
for uid in tqdm(range(unique_users)):
    listened = train_[train_['uid'] == uid].set_index('tid')
    # Align on tid; tracks missing from the user's history keep count 0.
    merged = zero_counts.add(listened, fill_value=0).reset_index()
    unseen = merged.loc[merged['count'] == 0, ['tid']].assign(uid=uid)
    x_test.extend(unseen[['uid', 'tid']].values.tolist())
    

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=218.0), HTML(value='')))




In [57]:
# Preview the first five rows of the interaction table.
df.head(5)

Unnamed: 0,uid,tid,count,tags,rating,pop,group_info_2,group_info_3,group_info_4,group_info_5
0,0,0,2,1,1.5,5,67,8.0,34.0,23.0
2,0,1,2,2,1.5,10,67,8.0,34.0,23.0
7,0,2,4,0,3.0,24,67,8.0,34.0,23.0
8,0,3,1,0,0.25,9,67,8.0,34.0,23.0
10,0,4,5,1,3.5,14,67,8.0,34.0,23.0


In [58]:
# Encode every candidate [uid, tid] pair in x_test as one sparse feature row:
#   column uid                         -> 1     (user one-hot)
#   column tid_start + tid             -> 1     (track one-hot)
#   column tag_start + tag             -> 1     (tag one-hot)
#   column pop_start                   -> pop   (track popularity value)
#   column group_start + group index   -> 1     (group one-hot)
key = 'group_info_' + str(group_size)

# Resolve per-user and per-track attributes once. The first row of df for each
# uid / tid is used, matching a per-pair `df[df[...] == x].iloc[0]` lookup, but
# without re-scanning df for each of the millions of candidate pairs.
first_row_per_user = df.drop_duplicates(subset='uid').set_index('uid')
first_row_per_track = df.drop_duplicates(subset='tid').set_index('tid')
group_of_user = first_row_per_user[key].to_dict()
tag_of_track = first_row_per_track['tags'].to_dict()
pop_of_track = first_row_per_track['pop'].to_dict()

v = []
sp_rows = []
sp_cols = []
for i, (uid, tid) in enumerate(tqdm(x_test)):
    group_index = group_of_user[uid]
    if np.isnan(group_index):
        # User has no group for this group size: leave row i empty. The row index
        # still advances, so row i of test_x always corresponds to x_test[i] and the
        # predictions stay aligned with the candidate list.
        continue

    sp_rows.extend((i, i, i, i, i))
    sp_cols.extend((
        uid,
        tid_start + tid,
        tag_start + tag_of_track[tid],
        pop_start,
        # Group ids can be stored as floats (the column may contain NaN), so cast
        # to keep every column index an integer.
        group_start + int(group_index),
    ))
    v.extend((1, 1, 1, pop_of_track[tid], 1))

test_x = sp.csr_matrix((v, (sp_rows, sp_cols)), shape=(len(x_test), l), dtype=float)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8169189.0), HTML(value='')))




In [59]:
# Display the largest column index written next to the feature-vector length l.
max(sp_cols), l

(39483, 39484)

In [60]:
# Persist the candidate matrix. The name passed here has no '.npz' suffix; the load
# cell reads the same path with '.npz' appended.
topn_matrix_path = os.path.join(
    dir_, folder, group_info, str(group_size) + '_topN_test_x_' + file_name
)
sp.save_npz(topn_matrix_path, test_x)

In [8]:
# Reload the candidate matrix from disk.
topn_matrix_file = str(group_size) + '_topN_test_x_' + file_name + '.npz'
test_x = sp.load_npz(os.path.join(dir_, folder, group_info, topn_matrix_file))

In [9]:
# Print the stored (non-zero) entries of the row at index 1 of the candidate matrix.
print(test_x[1,:])

  (0, 0)	1.0
  (0, 232)	1.0
  (0, 39356)	1.0
  (0, 39374)	6.0
  (0, 39442)	1.0


In [10]:
# Fit an order-4 factorization machine on the sparse training matrix, then score
# every candidate row in test_x.
order = 4
fm_params = {
    'order': order,
    'rank': 10,
    'optimizer': tf.train.AdamOptimizer(learning_rate=0.00001),
    'n_epochs': 10,
    'batch_size': 16384,
    'init_std': 0.000001,
    'reg': 0.0001,
    'input_type': 'sparse',
}
model = TFFMRegressor(**fm_params)
model.fit(train_x, train_y, show_progress=True)
predictions = model.predict(test_x)






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


  return array(a, dtype, copy=False, order=order, subok=True)







100%|██████████| 10/10 [00:09<00:00,  1.02epoch/s]


In [11]:
# Display how many predictions were produced, followed by the predictions themselves.
len(predictions), predictions

(8169189,
 array([0.07580909, 0.02624193, 0.15652674, ..., 0.02398945, 0.02644402,
        0.02382247], dtype=float32))

In [85]:
# Attach the predicted rating to each candidate [uid, tid] pair (positionally:
# prediction k belongs to x_test[k]) and preview the first ten rows.
df2 = pd.DataFrame(x_test, columns=['uid', 'tid']).assign(rating=predictions)
df2.head(10)

Unnamed: 0,uid,tid,rating
0,0,2,0.076074
1,0,14,0.026284
2,0,15,0.156699
3,0,20,0.042561
4,0,21,0.026392
5,0,24,0.073776
6,0,30,0.036931
7,0,44,0.037404
8,0,46,0.034948
9,0,48,0.245539


In [65]:
# Store the scored candidates next to the other group-specific artefacts.
topn_pred_file = str(group_size) + '_topN_pred_' + file_name[:-3] + 'pkl'
df2.to_pickle(os.path.join(dir_, folder, group_info, topn_pred_file))

In [88]:
# Rebinds `predictions` to the (uid, tid, rating) frame df2; the cells below use it as a DataFrame.
predictions = df2

In [89]:
# Load the held-out split and preview its first ten rows in (uid, tid) order.
# NOTE(review): pd.read_pickle executes arbitrary code from the file; only load trusted pickles.
test_path = os.path.join(dir_, folder, 'test_' + file_name[:-3] + 'pkl')
test = pd.read_pickle(test_path)
test.sort_values(by=['uid', 'tid']).head(10)

Unnamed: 0,uid,tid,count,tags,rating,pop
7,0,2,4,0,3.0,24
24,0,14,2,1,1.5,6
25,0,15,12,0,4.75,53
33,0,20,3,12,2.5,12
34,0,21,1,5,0.25,6
37,0,24,5,9,3.5,23
46,0,30,5,1,3.5,10
61,0,44,13,2,4.75,10
66,0,46,2,6,1.5,9
68,0,48,4,2,3.0,85


In [90]:
# Number of distinct users that have at least one scored candidate.
num_user = predictions['uid'].unique().size
num_user

218

In [91]:
# Personal Recommendation
# For every user id in range(num_user): take the n candidates with the highest
# predicted rating and count how many of them appear in that user's rows of `test`.
# Prints the mean of (hits / n) over all users.
n=20

# Rank each user's candidates once via a single groupby pass instead of re-filtering
# the whole predictions frame per user. Track ids are read from the 'tid' column
# directly, so they keep their integer dtype (iterating rows would upcast them to
# float and rely on positional Series indexing).
top_n_by_user = {
    uid: user_preds.sort_values(by=['rating'], ascending=False)['tid'].head(n).tolist()
    for uid, user_preds in tqdm(predictions.groupby('uid'), total=num_user)
}
# Users without any candidate get an empty recommendation list.
top_n_lists = [top_n_by_user.get(uid, []) for uid in range(num_user)]

# Held-out track ids per user, built once so each hit test is a set lookup.
held_out_by_user = test.groupby('uid')['tid'].apply(set).to_dict()

evaluation = []      # per-user hit counts
satisfication = 0    # running sum of per-user precision@n
for uid in tqdm(range(num_user)):
    relevant = held_out_by_user.get(uid, set())
    precision = sum(1 for tid in top_n_lists[uid] if tid in relevant)
    # Divide by n even if fewer than n candidates exist, as the metric is precision@n.
    satisfication += precision / n
    evaluation.append(precision)
print(satisfication/num_user)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=218.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=218.0), HTML(value='')))


0.16261467889908263


In [92]:
# Preview the first ten training rows in (uid, tid) order.
train.sort_values(by=['uid', 'tid']).head(10)

Unnamed: 0,uid,tid,count,tags,rating,pop
0,0,0,2,1,1.5,5
2,0,1,2,2,1.5,10
8,0,3,1,0,0.25,9
10,0,4,5,1,3.5,14
12,0,5,2,4,1.5,28
13,0,6,4,8,3.0,8
14,0,7,5,1,3.5,6
15,0,8,18,4,5.0,21
17,0,9,1,2,0.25,32
20,0,10,1,1,0.25,10
