In [1]:
import os
from fastFM.datasets import make_user_item_regression
from fastFM import als
from fastFM import sgd
from fastFM import mcmc
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

# Data locations: every file is read from below `dir_`, the sparse matrices
# from its `folder` sub-directory, and all artefact names embed `file_name`.
file_name = 'normalized_to_rating_filter_track_5_user_50.pkl'
folder = 'sp_matrix'
dir_ = '../data/'

In [2]:
# Full interaction table; used further down to count distinct users / tracks.
df = pd.read_pickle(os.path.join(dir_, file_name))

# Training split, narrowed to the four columns kept for later cells.
train = pd.read_pickle(os.path.join(dir_, 'train_' + file_name))
train_ = train.loc[:, ['uid', 'tid', 'rating', 'count']]

In [3]:
# Saved design matrices (sparse .npz) and their target vectors (.csv).
# The .csv names replace the last three characters of `file_name` with 'csv'.
sp_dir = os.path.join(dir_, folder)
labels_suffix = file_name[:-3] + 'csv'

train_x = sp.load_npz(os.path.join(sp_dir, 'train_x_' + file_name + '.npz'))
train_y = np.loadtxt(os.path.join(sp_dir, 'train_y_' + labels_suffix), delimiter=',')
test_x_sp = sp.load_npz(os.path.join(sp_dir, 'test_x_' + file_name + '.npz'))
test_y_sp = np.loadtxt(os.path.join(sp_dir, 'test_y_' + labels_suffix), delimiter=',')

In [4]:
# Report the (rows, columns) shape of the loaded train / test matrices.
for label, matrix in (('training set : ', train_x), ('test set : ', test_x_sp)):
    print(label, matrix.shape)

training set :  (362677, 39355)
test set :  (90670, 39355)


In [5]:
# Column layout of a feature row: user columns first, track columns after them.
unique_users = df['uid'].nunique(dropna=False)
unique_tracks = df['tid'].nunique(dropna=False)

tid_start = unique_users                   # offset of the first track column
tag_start = unique_users + unique_tracks   # offset just past the track columns
# Total row width. The trailing "+ 19" is switched off together with the tag
# columns — presumably 19 is the number of tag features; confirm before re-enabling.
l = unique_users + unique_tracks  # + 19

print(f'Tid start position : {tid_start}')
print(f'Tag start position : {tag_start}')
print(f'Length of vector : {l}')
print('----------------------------------------')
print(f'Number of unique users : {unique_users}')
print(f'Number of unique tracks : {unique_tracks}')

Tid start position : 218
Tag start position : 39355
Length of vector : 39355
----------------------------------------
Number of unique users : 218
Number of unique tracks : 39137


In [6]:
# Baseline table with one row per track id (0 .. unique_tracks - 1) and a
# play count of zero.
track_ids = list(range(unique_tracks))
all_tracks = pd.DataFrame({'tid': track_ids, 'count': 0})

In [7]:
# Build the candidate list: for every user, all tracks whose summed play count
# in the training split is zero, stored as [uid, tid] pairs.
x_test = []
# Zero-count baseline indexed by tid; it is the same for every user, so it is
# built once instead of once per loop iteration.
zero_counts = all_tracks.set_index('tid')['count']
for i in tqdm(range(unique_users)):
    user = train_[train_['uid'] == i]
    # Add the user's training counts onto the baseline; tracks missing from the
    # user's rows are filled with 0 and therefore keep a total of 0.
    totals = zero_counts.add(user.set_index('tid')['count'], fill_value=0)
    unseen = totals.index[(totals == 0).to_numpy()]
    # The pairs are built directly rather than by assigning a `uid` column on a
    # boolean-filtered frame, which raised pandas' SettingWithCopyWarning.
    x_test.extend([i, tid] for tid in unseen.tolist())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=218.0), HTML(value='')))




In [8]:
# Coordinate lists for the sparse candidate matrix: each [uid, tid] pair
# becomes one row with two entries — the user's column and the track's
# column (shifted by `tid_start`).
sp_rows, sp_cols = [], []
i = 0
for row in tqdm(x_test):
    uid, tid = row[0], row[1]
    sp_rows.extend((i, i))
    sp_cols.extend((uid, tid_start + tid))

    # Tag columns are currently disabled:
    # for tag in row['tags']:
    #     sp_rows.append(i)
    #     sp_cols.append(tag_start + tag)

    i += 1

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8169189.0), HTML(value='')))




In [9]:
# Assemble the candidate matrix: every collected (row, col) coordinate gets
# the value 1.0, with one row per entry of `x_test` and `l` columns.
n_candidates = len(x_test)
w = np.ones(len(sp_rows))
test_x = sp.csr_matrix(
    (w, (sp_rows, sp_cols)),
    shape=(n_candidates, l),
    dtype=float,
)

In [8]:
# Load the saved "topN" test matrix from disk. This rebinds `test_x`, so any
# value an earlier cell gave that name is replaced.
# NOTE(review): presumably this file holds the same rows, in the same order,
# as `x_test` — confirm, since predictions are later paired with `x_test`.
topn_path = os.path.join(dir_, folder, 'topN_test_x_' + file_name + '.npz')
test_x = sp.load_npz(topn_path)

In [34]:
# Fit a factorization-machine regressor with the `als` solver on the training
# matrix, then score every row of `test_x`.
fm = als.FMRegression(
    n_iter=100,
    init_stdev=0.1,
    rank=7,
    l2_reg_w=0.1,
    l2_reg_V=0.1,
    l2_reg=0.1,
)
fm.fit(train_x, train_y)
y_pred = fm.predict(test_x)

# Alternative solvers, kept for reference:
# fm = mcmc.FMRegression(n_iter=10, rank=5, init_stdev=0.1)
# y_pred = fm.fit_predict(train_x, train_y, test_x)
# fm = sgd.FMRegression(n_iter=100, init_stdev=0.1, rank=2, l2_reg_w=0.1, l2_reg_V=0.1, l2_reg=0.1, step_size=0.1)
# fm.fit(train_x, train_y)
# y_pred = fm.predict(test_x)

In [35]:
# Quick look at the predictions: how many there are, plus the values themselves.
(len(y_pred), y_pred)

(8169189,
 array([0.6182311 , 3.0941086 , 3.21867895, ..., 1.57663275, 1.42464158,
        0.95707294]))

In [36]:
# Attach each predicted rating to its [uid, tid] candidate pair; entries of
# `x_test` and `y_pred` are matched purely by position.
predictions = pd.DataFrame(x_test, columns=['uid', 'tid']).assign(rating=y_pred)
predictions.head(10)

Unnamed: 0,uid,tid,rating
0,0,10,0.618231
1,0,15,3.094109
2,0,18,3.218679
3,0,38,2.663397
4,0,41,3.362017
5,0,42,6.179886
6,0,62,2.209849
7,0,67,1.851938
8,0,71,1.358052
9,0,76,3.044021


In [37]:
# Held-out split used to score the recommendations; preview its first ten
# rows ordered by user and track.
test_path = os.path.join(dir_, 'test_' + file_name[:-3] + 'pkl')
test = pd.read_pickle(test_path)
test.sort_values(by=['uid', 'tid']).head(10)

Unnamed: 0,uid,tid,count,tags,rating
20,0,10,1,"[1, 12]",0.25
25,0,15,12,[0],4.75
29,0,18,7,"[9, 1]",4.25
54,0,38,4,[1],3.0
58,0,41,4,[1],3.0
59,0,42,2,[1],1.5
85,0,62,10,"[0, 12]",4.5
91,0,67,1,"[8, 0]",0.25
96,0,71,2,"[4, 1]",1.5
104,0,76,1,"[7, 12, 2, 5]",0.25


In [38]:
# Number of distinct user ids that appear in the predictions table.
num_user = predictions['uid'].nunique(dropna=False)
num_user

218

In [39]:
# Personal recommendation: for each user take the n highest-rated candidate
# tracks, then measure which fraction of them appears in the held-out `test`
# split (precision@n), averaged over all users.
n = 20

# Top-n track ids per user id 0 .. num_user - 1, best predicted rating first.
top_n_lists = []
for i in tqdm(range(num_user)):
    prediction = predictions[predictions['uid'] == i]
    prediction = prediction.sort_values(by=['rating'], ascending=False)
    # Read the ids by column label. The previous `row[1]` relied on integer
    # keys being treated as positions on a label-indexed row (deprecated in
    # pandas), and iterrows() had upcast the ids to float.
    top_n_lists.append(prediction['tid'].head(n).tolist())

# Every (uid, tid) pair present in the held-out split, collected once so each
# lookup is a set membership test instead of re-filtering `test` for every
# recommended track.
test_pairs = set(zip(test['uid'].tolist(), test['tid'].tolist()))

evaluation = []      # number of hits per user
satisfication = 0    # running sum of the per-user precision@n
for i in tqdm(range(num_user)):
    precision = sum((i, tid) in test_pairs for tid in top_n_lists[i])
    satisfication += precision / n
    evaluation.append(precision)
print(satisfication/num_user)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=218.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=218.0), HTML(value='')))


0.010550458715596327
