In [2]:
import os
import tqdm
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
from tfcf.metrics import mae
from tfcf.metrics import rmse
from tfcf.datasets import ml1m
from tfcf.config import Config
from tfcf.models.svd import SVD
# from tfcf.models.svd import SVDPP
# from sklearn.model_selection import train_test_split

# Directory (relative to the notebook's working directory) that every data file is read from and written under.
dir_ = '../../data/FM_data'
# Base data file name. Later cells replace its last three characters ('csv') with 'pkl'
# before reading or writing, so the extension here must stay three characters long.
file_name = 'normalized_to_rating_filter_track_5_user_50.csv'

In [3]:
# Load the two pickled interaction tables that sit next to the CSV named in `file_name`.
# The pickle name is the CSV name with its 3-character extension swapped for 'pkl'.
# Both frames are read through their 'uid', 'tid' and 'rating' columns in later cells;
# presumably `df` is the complete table and `train` its training split -- TODO confirm.
pkl_name = file_name[:-3] + 'pkl'
df = pd.read_pickle(os.path.join(dir_, pkl_name))
train = pd.read_pickle(os.path.join(dir_, 'train_' + pkl_name))

# Distinct user / track counts, taken from `df`.
num_users = len(df['uid'].unique())
num_tracks = len(df['tid'].unique())
print(num_users, num_tracks)

218 39137


In [4]:
# Model configuration: problem size plus the rating range observed in `df`.
config = Config()
# `num_users` / `num_tracks` were computed from the same `df` columns in the loading cell.
config.num_users, config.num_items = num_users, num_tracks
ratings = df['rating']
config.min_value, config.max_value = ratings.min(), ratings.max()

In [5]:
# Build the training inputs directly from the frame's columns instead of looping with
# `iterrows()`: one Python-level iteration per row is very slow on ~360k rows, and
# `iterrows()` yields each row as a single-dtype Series, so integer ids get upcast to
# float whenever another column (e.g. 'rating') is float.
# x_train[k] is the [uid, tid] pair of row k and y_train[k] is that row's rating.
x_train = train[['uid', 'tid']].values.tolist()
y_train = train['rating'].tolist()

HBox(children=(FloatProgress(value=0.0, max=362677.0), HTML(value='')))




In [6]:
# One row per distinct track id seen in `train`, each with a 'count' of 0.
all_tracks = pd.DataFrame({'tid': train['tid'].unique(), 'count': 0})

In [7]:
# For every user id in range(num_users), collect the [uid, tid] pairs whose summed
# 'count' is still 0 after adding that user's training rows to the all-zero track table.
# NOTE(review): this relies on `train` carrying its own non-zero 'count' column; without
# one, every track would pass the `count == 0` filter -- confirm against the data.
x_test = []

# Indexing the all-zero track table by 'tid' does not depend on the user, so do it
# once here instead of on every loop iteration.
tracks_by_tid = all_tracks.set_index('tid')

for i in tqdm(range(num_users)):
    user = train[train['uid'] == i]
    # Align on 'tid'; values missing on one side are treated as 0 by fill_value.
    top_n = tracks_by_tid.add(user.set_index('tid'), fill_value=0).reset_index()
    # Keep only the 'tid' of zero-count rows and attach the user id as a fresh column,
    # rather than assigning into a boolean-filtered slice of the summed frame.
    top_n = top_n.loc[top_n['count'] == 0, ['tid']].assign(uid=i)
    x_test.extend(top_n[['uid', 'tid']].values.tolist())

HBox(children=(FloatProgress(value=0.0, max=218.0), HTML(value='')))




In [8]:
# Turn the three Python lists into numpy arrays and show the shape of one
# [uid, tid] pair from each input set plus the overall label shape.
x_train, y_train, x_test = (np.array(seq) for seq in (x_train, y_train, x_test))
print(x_train[0].shape, y_train.shape, x_test[0].shape)

(2,) (362677,) (2,)


In [9]:
# Train the SVD model on (x_train, y_train) inside a TF1-style session and predict a
# rating for every pair in x_test. `y_pred` is assigned inside the `with` block and is
# read by later cells after the session has been closed.
with tf.compat.v1.Session() as sess:
    # For SVD++ algorithm, if `dual` is True, then the dual term of items' 
    # implicit feedback will be added into the original SVD++ algorithm.
    # (SVD++ is disabled here; its import at the top of the notebook is commented out too.)
    # model = SVDPP(config, sess, dual=False)
    # model = SVDPP(config, sess, dual=True)
    
    model = SVD(config, sess)
    
    # The variant with validation data is left disabled: no `y_test` is built in the
    # visible cells, so there are no labels to validate against.
#     model.train(x_train, y_train, validation_data=(x_test, y_test), epochs=5, batch_size=1024)
    model.train(x_train, y_train, epochs=2, batch_size=1024)
    
    y_pred = model.predict(x_test)
    # Metric reporting is disabled for the same reason (it needs `y_test`).
#     print('rmse: {}, mae: {}'.format(rmse(y_test, y_pred), mae(y_test, y_pred)))


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/2
Epoch 2/2


In [10]:
# Quick look at the predicted ratings (numpy abbreviates long arrays when printed).
print(y_pred)

[2.067874  2.1665392 2.2195141 ... 2.2124841 2.2187612 2.259934 ]


In [11]:
# Pair every candidate (uid, tid) with its predicted rating and preview the first rows.
df2 = pd.DataFrame(x_test, columns=['uid', 'tid']).assign(rating=y_pred)
df2.head(10)

Unnamed: 0,uid,tid,rating
0,0,2,2.067874
1,0,14,2.166539
2,0,15,2.219514
3,0,20,2.19584
4,0,21,2.035834
5,0,24,2.140088
6,0,30,2.082822
7,0,44,2.182342
8,0,46,2.20247
9,0,48,2.318919


In [12]:
# Count POP to DF (according to users)
# Popularity of a track = (number of rows in `df` with that tid) / config.num_users.
# (This equals the share of users only if `df` has one row per (uid, tid) -- TODO confirm.)
# A single value_counts pass replaces one full-frame boolean scan per unique track,
# which was quadratic in practice (~39k tracks x every row of `df`).
tid_counts = df['tid'].value_counts()
# Keep the tracks in order of first appearance, exactly as iterating `unique()` did.
tid_list = list(df['tid'].unique())
# fill_value=0 covers a key that value_counts drops (NaN), matching the old `== i` count of 0.
pop_list = (tid_counts.reindex(tid_list, fill_value=0) / config.num_users).tolist()

In [78]:
# Fraction of the popularity-ranked tracks to keep for post-filtering.
post = 0.005
# Rank tracks from most to least popular, then keep the leading `post` share of them.
df_pop = (
    pd.DataFrame({'tid': tid_list, 'pop': pop_list})
    .sort_values(by='pop', ascending=False)
)
keep_n = int(len(df_pop) * post)
df_pop = df_pop.iloc[:keep_n]
post_list = df_pop['tid'].tolist()
len(post_list)

195

In [79]:
# # # Count POP to DF (according to listen count)
# file_name_pop = 'normalized_popularity_filter_track_5_user_100.pkl'
# pop_count = pd.read_pickle(os.path.join(dir_, file_name_pop))
# pop_count = pop_count.sort_values(by=['count'],  ascending=False)

In [80]:
# post = 0.9
# max_rating = pop_count.iloc[0]['count']
# pop_normalized = pop_count.copy()
# pop_normalized = pop_normalized[['tid', 'count']]
# pop_normalized['rating'] = pop_normalized['count']
# pop_normalized['rating'] /= max_rating
# pop_normalized['rating'] *= 5
# pop_normalized = pop_normalized[:int(len(pop_normalized)*post)]
# post_list = pop_normalized['tid'].tolist()
# len(post_list)

In [81]:
# Keep only the predictions whose track is in the popular-track list, then report
# row counts and distinct-track counts before and after the filter.
in_post = df2['tid'].isin(post_list)
post_df2 = df2[in_post]
tuple(map(len, (df2, post_df2, df2['tid'].unique(), post_df2['tid'].unique())))

(8168317, 32254, 39133, 195)

In [82]:
# Persist the popularity-filtered predictions under <dir_>/post_filtering(new)/, with
# the kept fraction `post` as a file-name prefix and the 'csv' suffix swapped for 'pkl'.
out_dir = os.path.join(dir_, 'post_filtering(new)')
# to_pickle does not create missing directories; make sure the target exists so the
# notebook does not fail at its very last step after the expensive training/prediction.
os.makedirs(out_dir, exist_ok=True)
post_df2.to_pickle(os.path.join(out_dir, str(post) + '_prediction_svd_top_N_' + file_name[:-3] + 'pkl'))