In [38]:
import numpy as np
import pandas as pd
import copy
import pickle

In [39]:
# Load the train/test rating matrices and flatten every strictly positive cell
# into one (userId, itemId, rating) row: the row index of a cell becomes its
# userId and the column index becomes its itemId.
raw_matrix = np.loadtxt('./train.ascii')
userId, itemId = np.nonzero(raw_matrix > 0)
rating = raw_matrix[userId, itemId]
train_df = pd.DataFrame({'userId': userId, 'itemId': itemId, 'rating': rating})

# Same extraction for the test matrix; the temporaries above are reused.
raw_matrix = np.loadtxt('./test.ascii')
userId, itemId = np.nonzero(raw_matrix > 0)
rating = raw_matrix[userId, itemId]
test_df = pd.DataFrame({'userId': userId, 'itemId': itemId, 'rating': rating})

In [40]:
# Distinct ids present in the freshly loaded frames.
train_user_set = set(train_df['userId'].unique())
test_user_set = set(test_df['userId'].unique())
item_set = set(train_df['itemId'].unique())

print(f'item num = {len(item_set)}')
print(f'train user num = {len(train_user_set)}')
print(f'test user num = {len(test_user_set)}')

item num = 300
train user num = 290
test user num = 290


In [41]:
'''training set: keep only positive feedback (rating above 3), then drop the
rating column since every remaining row is a positive interaction'''

low_rating_rows = train_df.index[train_df['rating'] <= 3]
train_df = (
    train_df
    .drop(index=low_rating_rows)
    .reset_index(drop=True)
    .drop(columns=['rating'])
)

In [42]:
# Number of test interactions before items without positive training feedback are removed.
test_df.shape[0]

4640

In [43]:
# Keep only test interactions whose item still appears in the (positive-only)
# training set. The boolean mask is applied directly: re-selecting by index
# label would return duplicated rows if the index labels were not unique.
# .copy() makes the filtered frame independent of the original test_df.
train_item_list = train_df['itemId'].unique()
test_df = test_df.loc[test_df['itemId'].isin(train_item_list)].copy()

In [44]:
# Number of test interactions left after restricting to items present in training.
test_df.shape[0]

4407

In [45]:
'''test set: binarise ratings -- above 3 becomes 1 (positive feedback),
3 or less becomes 0 (negative feedback)'''

test_ratings = test_df['rating']
# Both conditions are evaluated on the original ratings, so the order of the
# two replacements does not matter.
test_df['rating'] = test_ratings.mask(test_ratings <= 3, 0).mask(test_ratings > 3, 1)

In [46]:
# Distinct id counts after the rating threshold and the test-item restriction.
print(f"item num = {train_df['itemId'].nunique(dropna=False)}")
print(f"train user num = {train_df['userId'].nunique(dropna=False)}")
print(f"test user num = {test_df['userId'].nunique(dropna=False)}")

item num = 284
train user num = 290
test user num = 290


In [47]:
'''a test user is kept only with mixed test feedback (at least one positive
and at least one negative rating) and at least two positive interactions in
the training set'''

user_list = []
for u in test_user_set:
    user_test_ratings = test_df.loc[test_df['userId'] == u, 'rating']
    # Ratings are already 0/1 here, so their sum is the number of positives.
    pos_num = np.sum(user_test_ratings)
    test_num = len(user_test_ratings)
    train_num = (train_df['userId'] == u).sum()
    has_mixed_feedback = 1 <= pos_num < test_num
    if has_mixed_feedback and train_num >= 2:
        user_list.append(u)

In [48]:
# Remove every test row whose user is not in user_list, then renumber the
# remaining rows 0..n-1.
rows_to_remove = test_df.index[~test_df['userId'].isin(user_list)]
test_df = test_df.drop(index=rows_to_remove).reset_index(drop=True)

In [49]:
'''build the old id -> new id lookup tables for users and for items; new ids
are consecutive integers assigned in order of first appearance in train_df'''

all_user_list = train_df['userId'].unique()
all_item_list = train_df['itemId'].unique()

# unique() never repeats a value, so enumerating it yields one fresh id per old id.
user_old2new_id_dict = {old_id: new_id for new_id, old_id in enumerate(all_user_list)}
item_old2new_id_dict = {old_id: new_id for new_id, old_id in enumerate(all_item_list)}

In [50]:
'''reindex users and items in training and test set'''


def _remap_ids(frame, column, old2new):
    """Translate `frame[column]` through the `old2new` dict.

    Vectorised replacement for a row-by-row `.at` loop; unlike that loop it
    does not require `frame` to have a 0..n-1 index.

    Returns the remapped Series with the dtype of the original column.
    Raises KeyError when a value of the column has no entry in `old2new`.
    """
    remapped = frame[column].map(old2new)
    unmapped = remapped.isna()
    if unmapped.any():
        missing = frame.loc[unmapped, column].unique().tolist()
        raise KeyError(f'{column} values missing from the id mapping: {missing}')
    return remapped.astype(frame[column].dtype)


train_df['userId'] = _remap_ids(train_df, 'userId', user_old2new_id_dict)
train_df['itemId'] = _remap_ids(train_df, 'itemId', item_old2new_id_dict)
train_user_list = train_df['userId'].unique()
train_item_list = train_df['itemId'].unique()

test_df['userId'] = _remap_ids(test_df, 'userId', user_old2new_id_dict)
test_df['itemId'] = _remap_ids(test_df, 'itemId', item_old2new_id_dict)
test_user_list = test_df['userId'].unique()
test_item_list = test_df['itemId'].unique()

# From here on the "all" lists hold the new ids.
all_user_list = train_df['userId'].unique()
all_item_list = train_df['itemId'].unique()

In [51]:
# Distinct id counts after test-user filtering and id reindexing.
print(f"item num = {train_df['itemId'].nunique(dropna=False)}")
print(f"train user num = {train_df['userId'].nunique(dropna=False)}")
print(f"test user num = {test_df['userId'].nunique(dropna=False)}")

item num = 284
train user num = 290
test user num = 225


In [52]:
'''in the training set, for each user, randomly select VALI_SAMPLE_SIZE (80)
items as validation set'''

from scipy.sparse import coo_matrix

VALI_SAMPLE_SIZE = 80  # number of items sampled per user
VALI_SEED = 0          # fixed seed so the split is identical on every run

num_user = len(train_df['userId'].unique())
num_item = len(train_df['itemId'].unique())
# Dense user x item matrix: 1 where the user has a positive training interaction.
train_mat = coo_matrix((np.ones(len(train_df)),
                        (train_df['userId'].values, train_df['itemId'].values)),
                       shape=(num_user, num_item)).toarray()

# Dedicated generator: reproducible and independent of the global NumPy RNG state.
vali_rng = np.random.RandomState(VALI_SEED)

user_vali_list = []
user_vali_interact = []
user_vali_like = []
for u in range(num_user):
    vali_interact = vali_rng.choice(num_item, VALI_SAMPLE_SIZE, replace=False).tolist()
    # Positions *within vali_interact* (not item ids) of the sampled items the user likes.
    vali_like = np.where(train_mat[u, vali_interact] == 1)[0]
    # Only users with both liked and non-liked sampled items get a validation set.
    if 0 < len(vali_like) < VALI_SAMPLE_SIZE:
        user_vali_list.append(u)
        user_vali_interact.append(vali_interact)
        user_vali_like.append(vali_like)
        # Held-out items must not remain in the training data of this user.
        train_mat[u, vali_interact] = 0

# The per-user "like" arrays have different lengths. np.array() on such a
# ragged list raises ValueError on NumPy >= 1.24, so pack them explicitly
# into a 1-D object array (load with allow_pickle=True).
user_vali_like_packed = np.empty(len(user_vali_like), dtype=object)
for k, liked_positions in enumerate(user_vali_like):
    user_vali_like_packed[k] = liked_positions

np.save('./user_vali_list.npy', np.array(user_vali_list))
np.save('./user_vali_interact.npy', np.array(user_vali_interact))
np.save('./user_vali_like.npy', user_vali_like_packed)


In [53]:
'''rebuild the training frame from the interactions still set to 1 in
train_mat after the validation split'''

userId, itemId = np.nonzero(train_mat == 1)
train_df = pd.DataFrame({'userId': userId, 'itemId': itemId})

In [54]:
'''item popularity = number of training interactions per item; items with no
remaining training interaction keep a count of 0'''
item_counts = train_df['itemId'].value_counts()
item_pop = np.array(item_counts)
item_pop_id = np.array(item_counts.index)
item_pop_list = [0] * len(all_item_list)
for item, count in zip(item_pop_id, item_pop):
    item_pop_list[item] = count

In [55]:
'''generate, for each user, the list of items the user likes in the training set'''

train_array = train_df[['userId', 'itemId']].values

# One integer array of liked item ids per user, in the order of all_user_list.
user_train_like = [
    train_array[train_array[:, 0] == u, 1].astype(int)
    for u in all_user_list
]

# The per-user arrays have different lengths. np.array() on such a ragged list
# raises ValueError on NumPy >= 1.24, so pack them explicitly into a 1-D
# object array (load with allow_pickle=True).
user_train_like_packed = np.empty(len(user_train_like), dtype=object)
for k, liked_items in enumerate(user_train_like):
    user_train_like_packed[k] = liked_items

np.save('./user_train_like.npy', user_train_like_packed)
    

In [56]:
# Persist the processed splits, the per-item popularity counts and the
# dataset dimensions.
item_pop_array = np.array(item_pop_list)
dataset_info = {
    'num_user': len(all_user_list),
    'num_item': len(all_item_list),
}

train_df.to_csv('./train_df.csv', index=False)
test_df.to_csv('./test_df.csv', index=False)
np.save('./item_pop.npy', item_pop_array)
with open('./info.pkl', 'wb') as info_file:
    pickle.dump(dataset_info, info_file)

In [57]:
'''generate list of items users interact and like in test set for each user'''

user_list = test_df['userId'].unique()
test_array = test_df[['userId', 'itemId', 'rating']].values

user_test_interact = []
user_test_like = []
for u in user_list:
    user_rows = test_array[test_array[:, 0] == u]
    # Item ids of every test interaction of this user.
    user_test_interact.append(user_rows[:, 1].astype(int))
    # Positions *within that user's interaction list* (not item ids) whose rating is 1.
    user_test_like.append(np.where(user_rows[:, 2].astype(int) == 1)[0])

np.save('./user_test_list.npy', np.array(user_list))

# The per-user arrays have different lengths. np.array() on such a ragged list
# raises ValueError on NumPy >= 1.24, so pack each list explicitly into a 1-D
# object array (load with allow_pickle=True).
for out_path, per_user in (('./user_test_interact.npy', user_test_interact),
                           ('./user_test_like.npy', user_test_like)):
    packed = np.empty(len(per_user), dtype=object)
    for k, values in enumerate(per_user):
        packed[k] = values
    np.save(out_path, packed)
    