In [1]:
import os
import tqdm
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
from tfcf.metrics import mae
from tfcf.metrics import rmse
from tfcf.datasets import ml1m
from tfcf.config import Config
from tfcf.models.svd import SVD
# from tfcf.models.svd import SVDPP
# from sklearn.model_selection import train_test_split

# Directory, relative to the notebook, that the input files are read from and
# the prediction pickles are written to.
dir_ = '../../data/'
# Dataset variant to process; the commented-out name is the alternative file.
# file_name = 'normalized_minmax_filter_track_5_user_100.csv'
file_name = 'normalized_log_filter_track_5_user_100.csv'

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Load the full ratings frame and the training split.
# Per the original note: x is a 2D numpy array whose row x[i, :] holds the
# user-item pair, and y[i] is the corresponding rating.

pickle_name = file_name[:-3] + 'pkl'  # replace the trailing 'csv' with 'pkl'
df = pd.read_pickle(os.path.join(dir_, pickle_name))

# Same loader call for both arrays; only the file-name prefix differs.
x_train, y_train = (
    np.loadtxt(os.path.join(dir_, prefix + file_name), delimiter=',')
    for prefix in ('train_x_', 'train_y_')
)

In [3]:
# Size the model from the full ratings frame: distinct user/track ids and the
# observed rating range.
user_ids = df['uid'].unique()
item_ids = df['tid'].unique()
ratings = df['rating']

config = Config()
config.num_users = len(user_ids)
config.num_items = len(item_ids)
config.min_value = ratings.min()
config.max_value = ratings.max()

In [4]:
# Inputs for top-N candidate generation: the training split as a DataFrame and
# the number of distinct users / tracks in the full ratings frame.
train_pickle_name = ''.join(('train_', file_name[:-3], 'pkl'))
pd_train = pd.read_pickle(os.path.join(dir_, train_pickle_name))

num_users, num_tracks = (len(df[column].unique()) for column in ('uid', 'tid'))
print(num_users, num_tracks)

953 157567


In [5]:
# Baseline frame with one row per track id (0 .. num_tracks - 1) and a zero
# 'count' column.
track_ids = list(range(num_tracks))
all_tracks = pd.DataFrame({'tid': track_ids}).assign(count=0)

In [6]:
# Build the candidate (uid, tid) pairs to score for top-N: for every user, keep
# each track whose 'count' is still zero after that user's training rows are
# added onto the zero-count baseline.
# NOTE(review): the captured progress bar shows 477 iterations while num_users
# printed 953 above, so this cell was presumably last run over a sub-range of
# users - confirm before relying on a fresh Run All.
x_test = []

# Loop-invariant: index the baseline by track id once instead of once per user.
baseline_by_tid = all_tracks.set_index('tid')

for i in tqdm(range(num_users)):

    user = pd_train[pd_train['uid'] == i]
    top_n = baseline_by_tid.add(user.set_index('tid'), fill_value=0).reset_index()
    # Select with .loc and attach the user id with .assign so the column is set
    # on a fresh frame; assigning into the boolean-filtered slice is the
    # chained-assignment pattern that raises SettingWithCopyWarning.
    top_n = top_n.loc[top_n['count'] == 0, ['tid']].assign(uid=i)
    top_n = top_n[['uid', 'tid']].values.tolist()
    x_test.extend(top_n)

  0%|          | 0/477 [00:00<?, ?it/s]

In [7]:
# Convert the collected [uid, tid] pairs to an array and show the shape of one row.
x_test = np.array(x_test)
print(np.shape(x_test[0]))

(2,)


In [8]:
# Train plain SVD on the training split, then score every candidate pair.
# SVD++ alternatives kept for reference (original note: if `dual` is True, the
# dual term of items' implicit feedback is added to the original SVD++
# algorithm):
#     model = SVDPP(config, sess, dual=False)
#     model = SVDPP(config, sess, dual=True)
# No validation data is passed and no rmse/mae is computed in this cell.
half = len(x_test) // 2  # predictions are requested in two halves

with tf.compat.v1.Session() as sess:
    model = SVD(config, sess)
    model.train(x_train, y_train, epochs=2, batch_size=32768)

    first_half, second_half = x_test[:half], x_test[half:]
    y_pred_1 = model.predict(first_half)
    y_pred_2 = model.predict(second_half)


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/2
Epoch 2/2


In [9]:
# Join the two prediction halves into one flat array, then print the lengths
# and end values of the halves and of the joined result for comparison.
y_pred = np.concatenate((np.ravel(y_pred_1), np.ravel(y_pred_2)))
print(len(y_pred_1), len(y_pred_2), y_pred_1[0], y_pred_2[-1])
print(len(y_pred), y_pred[0], y_pred[-1])

36941986 36941987 0.43467742 0.59688693
73883973 0.43467742 0.59688693


In [10]:
# NOTE(review): this rebinds `df`, which the earlier cells use for the full
# ratings frame, and the new frame is not used again in the visible cells -
# re-running the config / top-N cells after this one would read the wrong `df`.
df = pd.DataFrame(y_pred, columns=['rating'])

# Candidate pairs with their predicted rating as a third column.
df2 = pd.DataFrame(x_test, columns=['uid', 'tid'])
df2.insert(loc=2, column='rating', value=y_pred, allow_duplicates=False)
df2.head(10)

Unnamed: 0,uid,tid,rating
0,476,0,0.434677
1,476,1,0.429382
2,476,2,0.418726
3,476,3,0.411908
4,476,4,0.430531
5,476,5,0.446539
6,476,6,0.418778
7,476,7,0.451309
8,476,8,0.405626
9,476,9,0.46392


In [None]:
# Persist the scored candidate pairs under a name derived from the input file.
prediction_path = os.path.join(dir_, 'prediction_svd_top_N_' + file_name[:-3] + 'pkl')
df2.to_pickle(prediction_path)

In [11]:
# Write this run's predictions to the 'temp2.pkl' scratch file.
temp2_path = os.path.join(dir_, 'temp2.pkl')
df2.to_pickle(temp2_path)

In [12]:
# Load the predictions stored in 'temp1.pkl'.
# NOTE(review): unpickling can execute arbitrary code - only load files this
# project produced itself.
temp1_path = os.path.join(dir_, 'temp1.pkl')
df1 = pd.read_pickle(temp1_path)
df1

Unnamed: 0,uid,tid,rating
0,0,4,0.601343
1,0,14,0.636560
2,0,19,0.608919
3,0,21,0.581061
4,0,27,0.590191
...,...,...,...
73813663,475,157562,0.555950
73813664,475,157563,0.569963
73813665,475,157564,0.537223
73813666,475,157565,0.581616


In [13]:
# Stack the two prediction frames row-wise; each frame keeps its own row
# labels (pd.concat's default), so the combined index is not renumbered.
frames = [df1, df2]
df3 = pd.concat(objs=frames)
df3

Unnamed: 0,uid,tid,rating
0,0,4,0.601343
1,0,14,0.636560
2,0,19,0.608919
3,0,21,0.581061
4,0,27,0.590191
...,...,...,...
73883968,952,157562,0.590917
73883969,952,157563,0.593571
73883970,952,157564,0.557220
73883971,952,157565,0.600230
