In [1]:
import pandas as pd
import numpy as np
import copy
import pickle

In [2]:
# Load the user-artist interactions and keep only the two id columns,
# renamed to the short names (`uid`, `iid`) used by the rest of the notebook.
df = (
    pd.read_csv('./user_artists.dat', sep="\t", header=0, dtype=np.int32)
    .drop(columns=['weight'])
    .rename(columns={'userID': 'uid', 'artistID': 'iid'})
)
# Sorted arrays of the distinct original ids found in the file.
user_list = np.unique(df['uid'])
item_list = np.unique(df['iid'])

In [3]:
# Re-index users and items with new contiguous ids 0..n-1 assigned in a
# random order: the element at position k of the shuffled list gets new id k.
#
# *_old2new_list: lookup table indexed by ORIGINAL id -> new id
#                 (slots of original ids that never occur stay 0).
# *_new2old_list: lookup table indexed by NEW id -> original id.
#
# The old->new tables are created with the same integer dtype as the ids.
# np.zeros() defaults to float64, which would silently turn every remapped
# id into a float (e.g. 12.0) once the table is used for fancy indexing.
np.random.shuffle(user_list)  # in-place shuffle; order defines the new ids
userId_old2new_list = np.zeros(np.max(user_list) + 1, dtype=user_list.dtype)
userId_old2new_list[user_list] = np.arange(len(user_list), dtype=user_list.dtype)
userId_new2old_list = user_list.copy()

np.random.shuffle(item_list)
itemId_old2new_list = np.zeros(np.max(item_list) + 1, dtype=item_list.dtype)
itemId_old2new_list[item_list] = np.arange(len(item_list), dtype=item_list.dtype)
itemId_new2old_list = item_list.copy()

In [4]:
# Translate the original ids stored in `df` to the new ids by using each
# column as an index into its old->new lookup table.
for id_col, old2new in (('uid', userId_old2new_list), ('iid', itemId_old2new_list)):
    df[id_col] = old2new[df[id_col].to_numpy()]

In [5]:
# From here on `user_list` holds the distinct (remapped) user ids in `df`.
user_list = np.unique(df['uid'].to_numpy())
# Sample 40% of the users without replacement as "cold" users; the
# remaining users are the "warm" users.
cold_user = np.random.choice(
    user_list,
    size=int(len(user_list) * 0.4),
    replace=False,
)
warm_user = np.array(list(set(user_list) - set(cold_user)))

In [6]:
# All interactions that belong to cold users. Boolean-mask indexing already
# returns a new DataFrame, so copying the whole of `df` first is unnecessary.
test_df = df[df['uid'].isin(cold_user)]

In [8]:
# Training interactions: every row of `df` whose user is a warm user.
is_warm_row = df['uid'].isin(warm_user)
train = df[is_warm_row]

In [9]:
# Split the cold users again: a random 25% (sampled without replacement)
# become validation users, the rest become test users.
vali_user = np.random.choice(
    cold_user,
    size=int(len(cold_user) * 0.25),
    replace=False,
)
test_user = np.array(list(set(cold_user) - set(vali_user)))

In [10]:
# Divide the cold-user interactions into validation and test frames.
# `vali_df` must be derived first, while `test_df` still contains the rows of
# every cold user; `test_df` is then narrowed to the test users only.
# Boolean indexing returns new frames, so no explicit copy is needed, and the
# indices are renumbered 0..n-1 by rebinding instead of mutating in place.
vali_df = test_df[test_df['uid'].isin(vali_user)].reset_index(drop=True)
test_df = test_df[test_df['uid'].isin(test_user)].reset_index(drop=True)
train = train.reset_index(drop=True)

In [13]:
# Report the overall user/item counts, then the number of distinct users,
# distinct items and interaction rows in each of the three splits.
print(f'total number of user = {len(user_list)}')
print(f'total number of item = {len(item_list)}')
for split_name, split_df in (('train', train), ('vali', vali_df), ('test', test_df)):
    print('#' * 20)
    print(split_name)
    print(f"number of user = {len(split_df['uid'].unique())}")
    print(f"number of item = {len(split_df['iid'].unique())}")
    print(f'number of interaction = {len(split_df)}')

total number of user = 1892
total number of item = 17632
####################
train
number of user = 1136
number of item = 12850
number of interaction = 55810
####################
vali
number of user = 189
number of item = 3943
number of interaction = 9209
####################
test
number of user = 567
number of item = 8462
number of interaction = 27815


In [14]:
# Persist the total user and item counts as a small pickled dict.
dataset_info = {'num_user': len(user_list), 'num_item': len(item_list)}
with open('./info.pkl', 'wb') as info_file:
    pickle.dump(dataset_info, info_file)

In [15]:
# Write each split to its own CSV file, without the DataFrame index column.
for split_df, csv_path in (
    (train, './train.csv'),
    (vali_df, './vali.csv'),
    (test_df, './test.csv'),
):
    split_df.to_csv(csv_path, index=False)

In [16]:
# Load the user-friend pairs and translate both id columns to the new user
# ids through the same old->new lookup table used for the interactions.
friend_df = pd.read_csv('./user_friends.dat', sep="\t", header=0, dtype=np.int32)
for id_col in ('userID', 'friendID'):
    friend_df[id_col] = userId_old2new_list[friend_df[id_col].to_numpy()]

In [19]:
from scipy.sparse import coo_matrix

# Square user-by-user sparse matrix with a 1 at (userID, friendID) for each
# row of `friend_df`.
row = friend_df['userID'].to_numpy()
col = friend_df['friendID'].to_numpy()
friend_flags = np.ones_like(row)
num_users = len(user_list)
coo = coo_matrix((friend_flags, (row, col)), shape=(num_users, num_users))

In [20]:
import scipy.sparse

# Store the user-by-user friendship matrix on disk in SciPy's .npz format.
scipy.sparse.save_npz(file='./user_content.npz', matrix=coo)

In [None]:
import implicit
from scipy.sparse import coo_matrix

# `item_user_data` was referenced here without being defined anywhere in the
# notebook, so this cell failed with NameError on a fresh kernel. It is now
# built explicitly as a binary item-by-user interaction matrix.
# NOTE(review): assumes the model is meant to be fit on the warm-user `train`
# split only -- confirm.
# NOTE(review): item-by-user orientation follows the variable name and the
# implicit < 0.5 `fit(item_users)` convention; implicit >= 0.5 expects a
# user-by-item matrix instead -- confirm against the installed version.
item_idx = train['iid'].to_numpy().astype(np.int64)  # ids may arrive as floats
user_idx = train['uid'].to_numpy().astype(np.int64)
item_user_data = coo_matrix(
    (np.ones(len(train), dtype=np.float32), (item_idx, user_idx)),
    shape=(len(item_list), len(user_list)),
).tocsr()

# Factorise the interaction matrix with ALS using 200 latent factors.
model = implicit.als.AlternatingLeastSquares(factors=200)

model.fit(item_user_data)