In [2]:
import numpy as np
import tensorflow as tf
import time
import os
import math
import pandas as pd
from tqdm.notebook import tqdm
import scipy.sparse as sp
from tffm import TFFMClassifier
from tffm import TFFMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error 
from sklearn.metrics import roc_auc_score, accuracy_score

# Directory (relative to the notebook's working directory) that holds the
# pickled / CSV inputs read by the loading cell.
dir_ = '../../HPCF/us/test_0.8/data/'
# Base data file name.  Other cells replace its last three characters ('csv')
# with 'pkl' or 'csv' and add prefixes such as 'train_' / 'train_y_' /
# 'topN_pred_' to derive the names of related files.
file_name = 'normalized_to_rating_filter_track_5_user_100.csv'

In [3]:
# Load the full interaction table, the training split together with its
# target vector, and the per-track popularity table, then report how many
# distinct users and tracks the full table contains.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading --
# only point this cell at files from a trusted source.

def _data_file(prefix, extension):
    """Return the path in ``dir_`` of the file named after ``file_name``.

    The last three characters of ``file_name`` are replaced by ``extension``
    and ``prefix`` is put in front of the result.
    """
    return os.path.join(dir_, prefix + file_name[:-3] + extension)


df = pd.read_pickle(_data_file('', 'pkl'))
df_train = pd.read_pickle(_data_file('train_', 'pkl'))
train_y = np.loadtxt(_data_file('train_y_', 'csv'), delimiter=',')
pop_count = pd.read_pickle(
    os.path.join(dir_, 'normalized_popularity_filter_track_5_user_100.pkl')
)

num_users = len(pd.unique(df['uid']))
num_tracks = len(pd.unique(df['tid']))
print(num_users, num_tracks)

220 54969


In [4]:
# Column layout of the sparse feature vector built by the matrix cells:
#   [0, tid_start)          user columns   (indexed directly by uid)
#   [tid_start, tag_start)  track columns  (indexed by tid_start + tid)
#   tag_start               one trailing column holding the popularity value
# NOTE(review): this presumes uid / tid are 0-based contiguous ids -- confirm.
unique_users = len(pd.unique(df['uid']))
unique_tracks = len(pd.unique(df['tid']))
tid_start = unique_users
tag_start = tid_start + unique_tracks
l = tag_start + 1  # total number of columns in the feature vector

print('Tid start position : ', tid_start, sep='')
print('Tag start position : ', tag_start, sep='')
print('Length of vector : ', l, sep='')
print('----------------------------------------')
print('Number of unique users : ', unique_users, sep='')
print('Number of unique tracks : ', unique_tracks, sep='')

Tid start position : 220
Tag start position : 55189
Length of vector : 55190
----------------------------------------
Number of unique users : 220
Number of unique tracks : 54969


In [5]:
# Order the popularity table from the largest 'count' to the smallest and
# preview the first ten rows.
pop_count = pop_count.sort_values('count', ascending=False)
pop_count.head(10)

Unnamed: 0,tid,count,rating
166,166,3666,573.25
457,457,2042,162.0
6338,6338,1896,153.75
80,80,1835,412.25
1364,1364,1792,327.5
350,350,1667,359.75
7209,7209,1454,400.5
6282,6282,1397,17.75
13104,13104,1368,57.5
3761,3761,1340,354.5


In [6]:
# min_max
max_rating = pop_count.iloc[0]['count']
pop_normalized = pop_count.copy()
pop_normalized = pop_normalized[['tid', 'count']]
pop_normalized['rating'] = pop_normalized['count']
pop_normalized['rating'] /= max_rating
pop_normalized['rating'] *= 5
pop_normalized[:10]

Unnamed: 0,tid,count,rating
166,166,3666,5.0
457,457,2042,2.785052
6338,6338,1896,2.585925
80,80,1835,2.502728
1364,1364,1792,2.444081
350,350,1667,2.273595
7209,7209,1454,1.983088
6282,6282,1397,1.905346
13104,13104,1368,1.865794
3761,3761,1340,1.827605


In [7]:
# Number of distinct track ids that have a popularity entry.
# NOTE(review): the recorded output of this cell (54966) is smaller than the
# number of unique tracks reported earlier (54969), while the matrix cells
# index the popularity lookup directly by track id -- verify that every track
# really has an entry.
len(pd.unique(pop_normalized['tid']))

54966

In [6]:
# Lookup table: track id -> popularity rating (the scaled 'rating' column).
# The columns are addressed by name instead of by integer position: integer
# keys on a label-indexed row Series rely on a deprecated positional fallback
# and silently pick the wrong field if the column order ever changes.
# When a tid occurs more than once, the last occurrence wins.
pop = dict(zip(pop_normalized['tid'].tolist(), pop_normalized['rating'].tolist()))

In [7]:
# pop
i = 0
v = []
sp_rows = []
sp_cols = []
for index, row in tqdm(df_train.iterrows(), total=len(df_train)):
    sp_rows.append(i)
    sp_cols.append(row['uid'])
    v.append(1)
    
    sp_rows.append(i)
    sp_cols.append(tid_start + row['tid'])
    v.append(1)

    sp_rows.append(i)
    sp_cols.append(tag_start)
    v.append(pop[row['tid']])
        
    i += 1

train_x = sp.csr_matrix((v, (sp_rows, sp_cols)), shape=(len(df_train), l), dtype=float)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=561889.0), HTML(value='')))




In [8]:
# Table with one row per track id (0 .. unique_tracks - 1) and a 'count'
# column initialised to zero; the next cell adds each user's training rows
# onto it to find the tracks that user has not interacted with.
track_ids = list(range(unique_tracks))
all_tracks = pd.DataFrame({'tid': track_ids, 'count': 0})

In [9]:
# Candidate [uid, tid] pairs to score: for every user, all tracks whose
# 'count' is still 0 after that user's training rows have been added onto the
# all-zero track table.
# NOTE(review): this presumes df_train carries a 'count' column that is
# non-zero for every training interaction -- confirm.
tracks_by_tid = all_tracks.set_index('tid')  # loop-invariant, so built once
x_test = []
for i in tqdm(range(unique_users)):
    user = df_train[df_train['uid'] == i]
    top_n = tracks_by_tid.add(user.set_index('tid'), fill_value=0).reset_index()
    # .assign() returns a new frame, so the user id is never written into a
    # boolean-filtered slice (which would raise SettingWithCopyWarning).
    top_n = top_n[top_n['count'] == 0].assign(uid=i)
    x_test.extend(top_n[['uid', 'tid']].values.tolist())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=220.0), HTML(value='')))




In [27]:
# COO-style triplets for the test design matrix, one matrix row per
# candidate [uid, tid] pair in x_test, laid out exactly like the training
# rows: user column, track column (offset by tid_start) and the trailing
# popularity column (tag_start).  The lists v / sp_rows / sp_cols are
# consumed by the cell that builds test_x.
i = 0          # current matrix row
v = []         # entry values
sp_rows = []   # row index of each entry
sp_cols = []   # column index of each entry
for row in tqdm(x_test):
    user_col, track = row[0], row[1]
    sp_rows.extend((i, i, i))
    sp_cols.extend((user_col, tid_start + track, tag_start))
    v.extend((1, 1, pop[track]))
    i += 1

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=11531291.0), HTML(value='')))




In [28]:
# Turn the collected triplets into a CSR matrix with one row per candidate
# pair and l feature columns.
test_shape = (len(x_test), l)
test_x = sp.csr_matrix((v, (sp_rows, sp_cols)), shape=test_shape, dtype=float)

In [29]:
# Show the matrix summary (shape, dtype, number of stored entries).
test_x

<11531291x55190 sparse matrix of type '<class 'numpy.float64'>'
	with 34593873 stored elements in Compressed Sparse Row format>

In [30]:
# Sanity check: print the stored entries of the first test row -- expected
# are the user column, the track column and the popularity column.
print(test_x[0,:])

  (0, 0)	1.0
  (0, 223)	1.0
  (0, 55189)	0.06819421713038734


In [31]:
# Fit a factorization-machine regressor on the sparse training matrix and
# score every candidate pair in test_x.
# NOTE(review): no random seed is set, so repeated runs may give different
# predictions -- confirm whether reproducibility matters here.
order = 3
fm_settings = dict(
    order=order,
    rank=10,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.0001),
    n_epochs=10,
    batch_size=16384,
    init_std=0.00001,
    reg=0.0001,
    input_type='sparse',  # train_x / test_x are scipy sparse matrices
)
model = TFFMRegressor(**fm_settings)
model.fit(train_x, train_y, show_progress=True)
predictions = model.predict(test_x)








  return array(a, dtype, copy=False, order=order, subok=True)


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where





100%|██████████| 10/10 [00:08<00:00,  1.25epoch/s]


In [32]:
# Number of predictions (one per row of test_x) and a preview of the values.
len(predictions), predictions

(11531291,
 array([0.09763829, 0.09613875, 0.10030535, ..., 0.088534  , 0.08856077,
        0.0825662 ], dtype=float32))

In [33]:
# Pair every candidate [uid, tid] with its predicted rating (third column)
# and preview the first ten rows.
df2 = pd.DataFrame(x_test, columns=['uid', 'tid']).assign(rating=predictions)
df2.head(10)

Unnamed: 0,uid,tid,rating
0,0,3,0.097638
1,0,7,0.096139
2,0,18,0.100305
3,0,20,0.088779
4,0,31,0.114547
5,0,35,0.100687
6,0,37,0.119742
7,0,43,0.101087
8,0,54,0.09746
9,0,71,0.093974


In [35]:
# Persist the scored (uid, tid, rating) frame as a pickle.  The output
# directory is created first, so a missing folder cannot make the save fail
# after the expensive prediction step has already run.
out_dir = '../data/sp_matrix_pop(old)'
os.makedirs(out_dir, exist_ok=True)
df2.to_pickle(os.path.join(out_dir, 'topN_pred_' + file_name[:-3] + 'pkl'))