In [1]:
import os
import tqdm
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
from tfcf.metrics import mae
from tfcf.metrics import rmse
from tfcf.datasets import ml1m
from tfcf.config import Config
from tfcf.models.svd import SVD
# from tfcf.models.svd import SVDPP
# from sklearn.model_selection import train_test_split

# Directory that every input and output file of this notebook is read from or
# written to (relative path, resolved against the kernel's working directory).
dir_ = '../../data/'
# Alternative dataset variant, currently disabled:
# file_name = 'normalized_minmax_filter_track_5_user_100.csv'
# Base CSV name; other cells derive the pickle name (extension swapped to
# 'pkl') and the 'train_x_' / 'train_y_' / 'train_' prefixed names from it.
file_name = 'normalized_to_rating_filter_track_5_user_100.csv'

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# x_train is a 2-D numpy array: row i holds one user-item pair and
# y_train[i] is the rating that belongs to that pair.

# Full ratings table, pickled under the same stem as the CSV.
df = pd.read_pickle(os.path.join(dir_, file_name[:-3] + 'pkl'))

# Features and targets are stored in sibling CSV files that differ only by
# their prefix, so both are loaded through the same expression.
x_train, y_train = (
    np.loadtxt(os.path.join(dir_, prefix + file_name), delimiter=',')
    for prefix in ('train_x_', 'train_y_')
)

In [3]:
# Model configuration derived from the full ratings table.
config = Config()
# Distinct ids size the model; dropna=False keeps the count identical to
# len(series.unique()), which would also count a NaN entry.
config.num_users = df['uid'].nunique(dropna=False)
config.num_items = df['tid'].nunique(dropna=False)
# Observed rating range.
config.min_value, config.max_value = df['rating'].min(), df['rating'].max()

In [4]:
# For top N: load the training split and take the catalogue dimensions from
# the full ratings table.
pd_train = pd.read_pickle(os.path.join(dir_, f'train_{file_name[:-3]}pkl'))
num_users, num_tracks = (
    df[column].nunique(dropna=False) for column in ('uid', 'tid')
)
print(num_users, num_tracks)

220 54969


In [5]:
# Track ids are taken to be the integers 0 .. num_tracks - 1.
track_ids = list(range(num_tracks))

# One row per track with a zero 'count' column; used as the baseline that
# each user's training rows are aligned against.
all_tracks = pd.DataFrame({'tid': track_ids}).assign(count=0)

In [6]:
# Candidate [uid, tid] pairs to score for top-N: for every user, each track
# whose aligned 'count' is 0 after adding that user's training rows.
# NOTE(review): this relies on pd_train carrying a 'count' column that is
# non-zero for tracks the user already has — confirm against the pickle.
x_test = []

# Loop-invariant zero-count baseline keyed by track id; built once here
# instead of being rebuilt for every user.
zero_counts = all_tracks.set_index('tid')

for i in tqdm(range(num_users)):

    user = pd_train[pd_train['uid']==i]
    # Align on tid; fill_value=0 treats a value missing on one side as 0, so
    # tracks absent from the user's rows keep count == 0.
    top_n = zero_counts.add(user.set_index('tid'), fill_value=0).reset_index()
    # .loc + assign produce a fresh frame, so no column is assigned on a
    # filtered slice (the pattern behind SettingWithCopyWarning).
    top_n = top_n.loc[top_n['count']==0, ['tid']].assign(uid=i)
    top_n = top_n[['uid', 'tid']].values.tolist()
    x_test.extend(top_n)

HBox(children=(FloatProgress(value=0.0, max=220.0), HTML(value='')))




In [7]:
# Preview only the first pairs: x_test holds one entry per (user, unseen track)
# combination, so echoing the whole list floods the notebook output.
x_test[:10]

[[0, 3],
 [0, 7],
 [0, 18],
 [0, 20],
 [0, 31],
 [0, 35],
 [0, 37],
 [0, 43],
 [0, 54],
 [0, 71],
 [0, 72],
 [0, 76],
 [0, 88],
 [0, 90],
 [0, 106],
 [0, 109],
 [0, 116],
 [0, 118],
 [0, 122],
 [0, 123],
 [0, 125],
 [0, 126],
 [0, 134],
 [0, 141],
 [0, 143],
 [0, 147],
 [0, 150],
 [0, 151],
 [0, 154],
 [0, 162],
 [0, 165],
 [0, 175],
 [0, 177],
 [0, 181],
 [0, 182],
 [0, 187],
 [0, 199],
 [0, 216],
 [0, 218],
 [0, 243],
 [0, 253],
 [0, 254],
 [0, 255],
 [0, 256],
 [0, 257],
 [0, 260],
 [0, 267],
 [0, 270],
 [0, 275],
 [0, 278],
 [0, 282],
 [0, 283],
 [0, 294],
 [0, 296],
 [0, 301],
 [0, 307],
 [0, 308],
 [0, 318],
 [0, 330],
 [0, 333],
 [0, 343],
 [0, 350],
 [0, 355],
 [0, 365],
 [0, 373],
 [0, 374],
 [0, 403],
 [0, 404],
 [0, 408],
 [0, 430],
 [0, 432],
 [0, 435],
 [0, 445],
 [0, 449],
 [0, 451],
 [0, 454],
 [0, 455],
 [0, 460],
 [0, 463],
 [0, 467],
 [0, 470],
 [0, 484],
 [0, 487],
 [0, 489],
 [0, 490],
 [0, 497],
 [0, 511],
 [0, 512],
 [0, 514],
 [0, 522],
 [0, 533],
 [0, 538],
 [0,

In [7]:
# Turn the accumulated list of [uid, tid] pairs into a NumPy array; this is
# the object handed to model.predict further down.
x_test = np.array(x_test)
# Sanity check: a single row should hold one pair, i.e. shape (2,).
print(x_test[0].shape)

(2,)


In [8]:
# Build, train and apply the SVD model inside a single TF1-style session.
# y_pred is assigned inside the block and is read by later cells.
# NOTE(review): no random seed is set in the visible cells, so predictions
# may differ between runs — confirm whether reproducibility matters here.
with tf.compat.v1.Session() as sess:
    # For SVD++ algorithm, if `dual` is True, then the dual term of items' 
    # implicit feedback will be added into the original SVD++ algorithm.
    # model = SVDPP(config, sess, dual=False)
    # model = SVDPP(config, sess, dual=True)
    
    model = SVD(config, sess)
    
    # Validation variant kept for reference; it needs a y_test, which is not
    # defined in the visible cells.
#     model.train(x_train, y_train, validation_data=(x_test, y_test), epochs=5, batch_size=1024)
    model.train(x_train, y_train, epochs=2, batch_size=1024)
    
    # Score every candidate pair in x_test while the session is still open.
    y_pred = model.predict(x_test)
    # Metric report disabled for the same reason: it also needs y_test.
#     print('rmse: {}, mae: {}'.format(rmse(y_test, y_pred), mae(y_test, y_pred)))


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/2
Epoch 2/2


In [9]:
# Quick look at the values returned by model.predict for the candidate pairs.
print(y_pred)

[2.1403582 2.2101755 2.3138006 ... 2.083058  2.4369228 2.153752 ]


In [10]:
# Predictions table: one row per candidate pair, with the predicted rating
# appended as the third column.
df2 = pd.DataFrame(x_test, columns=['uid', 'tid']).assign(rating=y_pred)
df2.head(10)

Unnamed: 0,uid,tid,rating
0,0,3,2.140358
1,0,7,2.210176
2,0,18,2.313801
3,0,20,2.138048
4,0,31,2.377213
5,0,35,2.189802
6,0,37,2.094848
7,0,43,2.17073
8,0,54,2.191274
9,0,71,2.241641


In [11]:
# # Count POP to DF (according to users)
# tid_list = []
# pop_list = []
# for i in df['tid'].unique():
#     tid_list.append(i)
#     pop_list.append(len(df[df['tid']==i])/config.num_users)

In [12]:
# post = 0.05
# d = {'tid': tid_list, 'pop': pop_list}
# df_pop = pd.DataFrame(data=d)
# df_pop = df_pop.sort_values(by=['pop'], ascending=False)
# df_pop = df_pop[:int(len(df_pop)*post)]
# post_list = df_pop['tid'].tolist()
# len(post_list)

In [13]:
# Popularity table based on listen count, ordered from the largest 'count'
# to the smallest.
file_name_pop = 'normalized_popularity_filter_track_5_user_100.pkl'
pop_count = (
    pd.read_pickle(os.path.join(dir_, file_name_pop))
    .sort_values(by=['count'], ascending=False)
)

In [98]:
# Share of the popularity ranking (largest counts first) that is kept.
post = 0.9
# pop_count was sorted by 'count' in descending order when it was loaded, so
# its first row carries the largest count.
max_rating = pop_count.iloc[0]['count']
# Rescale counts so that the largest one maps to 5.
pop_normalized = pop_count[['tid', 'count']].assign(
    rating=lambda frame: frame['count'] / max_rating * 5
)
# Keep only the leading `post` share of rows.
pop_normalized = pop_normalized.iloc[:int(len(pop_normalized) * post)]
post_list = pop_normalized['tid'].tolist()
len(post_list)

49469

In [99]:
# Keep only predictions whose track is in the retained popularity list.
post_df2 = df2.loc[df2['tid'].isin(post_list)]
# Row counts and distinct-track counts, before and after the filter.
(
    df2.shape[0],
    post_df2.shape[0],
    df2['tid'].nunique(dropna=False),
    post_df2['tid'].nunique(dropna=False),
)

(11531291, 10342408, 54969, 49469)

In [100]:
# Persist the filtered predictions; the file name is prefixed with the
# fraction used for the popularity cut.
post_df2.to_pickle(
    os.path.join(
        dir_,
        'post_filtering_old',
        f'{post}_prediction_svd_top_N_{file_name[:-3]}pkl',
    )
)