In [1]:
import json
import numpy as np
import pandas as pd
import pickle
import copy

In [2]:
# Load the raw interaction file (fields separated by '::', no header row) and
# keep only the (userId, itemId) pairs.
# A multi-character separator is not supported by pandas' default C parser;
# without engine='python' pandas falls back on its own and emits a
# ParserWarning, so the engine is requested explicitly here.
rating_df = pd.read_csv(
    './ratings.dat',
    sep='::',
    engine='python',
    names=["userId", "itemId", "rating", "timestamp"],
)
# The rating value and the timestamp are not used by later cells.
rating_df.drop(columns=['timestamp', 'rating'], inplace=True)
rating_df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,userId,itemId
0,1,1193
1,1,661
2,1,914
3,1,3408
4,1,2355


In [9]:
# Show the row count, remove repeated (itemId, userId) pairs while keeping the
# first occurrence of each, then show the row count again for comparison.
rows_before_dedup = len(rating_df)
print(rows_before_dedup)
pair_columns = ['itemId', 'userId']
rating_df.drop_duplicates(subset=pair_columns, keep='first', inplace=True)
rows_after_dedup = len(rating_df)
print(rows_after_dedup)

996900
996900


In [10]:
# Collect the distinct item and user ids present in rating_df and report how
# many of each there are.
item_set = set(rating_df['itemId'].unique())
user_set = set(rating_df['userId'].unique())
for label, id_set in (('item', item_set), ('user', user_set)):
    print('{} num = {}'.format(label, len(id_set)))

item num = 3472
user num = 6040


In [12]:
# Renumber the rows 0..n-1 and keep an independent snapshot of the frame, so
# the filtering cells below can be restarted from it.
rating_df.reset_index(drop=True, inplace=True)
rdf_backup = rating_df.copy()

In [13]:
# Start (or restart) the working frame from the snapshot taken above.
rdf = rdf_backup.copy()

In [14]:
# Iteratively remove users and items with fewer than MIN_INTERACTIONS
# interactions.  Removing users can push items below the threshold and vice
# versa, so the two passes are repeated until BOTH minimum counts satisfy the
# threshold (or no rows are left).
MIN_INTERACTIONS = 10

rdf['user_freq'] = rdf.groupby('userId')['userId'].transform('count')
rdf['item_freq'] = rdf.groupby('itemId')['itemId'].transform('count')

# The loop has to look at the item counts as well as the user counts:
# checking only the users would skip item filtering altogether whenever every
# user already meets the threshold.  The len() guard avoids taking the
# minimum of an empty column.
while len(rdf) > 0 and (rdf['user_freq'].min() < MIN_INTERACTIONS
                        or rdf['item_freq'].min() < MIN_INTERACTIONS):
    # Pass 1: drop every row that belongs to an infrequent user.
    rdf.drop(rdf.index[rdf['user_freq'] < MIN_INTERACTIONS], inplace=True)
    if len(rdf) == 0:
        break
    # Item counts may have shrunk after the user pass, so recount first.
    rdf['item_freq'] = rdf.groupby('itemId')['itemId'].transform('count')
    # Pass 2: drop every row that belongs to an infrequent item.
    rdf.drop(rdf.index[rdf['item_freq'] < MIN_INTERACTIONS], inplace=True)
    if len(rdf) == 0:
        break
    # User counts may have shrunk after the item pass; recount for the next check.
    rdf['user_freq'] = rdf.groupby('userId')['userId'].transform('count')

# Leave the frame with a clean 0..n-1 index for the positional code below.
rdf.reset_index(drop=True, inplace=True)

In [15]:
# Distinct item / user ids left in rdf, in order of first appearance.
item_list = rdf['itemId'].unique()
user_list = rdf['userId'].unique()
print('item num = {}'.format(len(item_list)))
print('user num = {}'.format(len(user_list)))
# NOTE: the printed value is the filled fraction of the user x item matrix
# (rows of rdf divided by users * items), reported under the label 'sparsity'.
matrix_cells = len(user_list) * len(item_list)
filled_fraction = len(rdf) * 1.0 / matrix_cells
print('sparsity: {}'.format(filled_fraction))

item num = 3472
user num = 6040
sparsity: 0.047537347025971254


In [18]:
# Build old-id -> new-id lookup tables for users and items.  New ids are
# consecutive integers starting at 0, handed out in the order the old ids
# appear in user_list / item_list; an id seen again keeps its first number.
user_old2new_id_dict = dict()
for old_user_id in user_list:
    # setdefault only inserts when the key is new, so the next free number is
    # always the current size of the table.
    user_old2new_id_dict.setdefault(old_user_id, len(user_old2new_id_dict))

item_old2new_id_dict = dict()
for old_item_id in item_list:
    item_old2new_id_dict.setdefault(old_item_id, len(item_old2new_id_dict))

In [19]:
# Same ratio as reported earlier: rows of rdf over (users * items).
interaction_ratio = len(rdf) * 1.0 / (len(user_list) * len(item_list))
print('sparsity: ' + str(interaction_ratio))

sparsity: 0.047537347025971254


In [22]:
# Rewrite the userId / itemId columns of rdf with the new consecutive ids.
# Series.map performs the dictionary lookup for the whole column at once,
# replacing a Python-level loop that touched every row with .at.
mapped_user_ids = rdf['userId'].map(user_old2new_id_dict)
mapped_item_ids = rdf['itemId'].map(item_old2new_id_dict)

# map() yields NaN for ids that are missing from the lookup table.  Fail
# loudly in that case (e.g. when this cell is run a second time on ids that
# were already converted) instead of writing NaNs into the frame.
if mapped_user_ids.isna().any() or mapped_item_ids.isna().any():
    raise KeyError('rdf contains user or item ids that are missing from the id-mapping dicts')

rdf['userId'] = mapped_user_ids.astype('int64')
rdf['itemId'] = mapped_item_ids.astype('int64')

# Refresh the id lists so they hold the new ids.
item_list = rdf['itemId'].unique()
user_list = rdf['userId'].unique()

In [23]:
# get the df of train, vali, and test df
#
# Randomly partition the rows of rdf into three disjoint frames: vali_ratio of
# the rows go to vali_df, test_ratio to test_df and the remainder to train_df.
# All columns of rdf are carried over unchanged.

SPLIT_SEED = 0  # fixed seed so Restart & Run All reproduces the same split

train_ratio = 0.7
vali_ratio = 0.1
test_ratio = 0.2
assert abs(train_ratio + vali_ratio + test_ratio - 1.0) < 1e-9, 'split ratios must sum to 1'

rdf.reset_index(inplace=True, drop=True)

vali_size = int(vali_ratio * len(rdf))
test_size = int(test_ratio * len(rdf))

# One shuffled ordering of the row positions, cut into three consecutive
# slices; this guarantees the three splits never share a row.
split_rng = np.random.RandomState(SPLIT_SEED)
shuffled_positions = split_rng.permutation(len(rdf))

# NOTE: vali_idx and test_idx are both row positions in rdf.
vali_idx = shuffled_positions[:vali_size].tolist()
test_idx = shuffled_positions[vali_size:vali_size + test_size].tolist()
# Sorting keeps the training rows in their original rdf order.
train_idx = np.sort(shuffled_positions[vali_size + test_size:]).tolist()

vali_df = rdf.iloc[vali_idx].reset_index(drop=True)
test_df = rdf.iloc[test_idx].reset_index(drop=True)
train_df = rdf.iloc[train_idx].reset_index(drop=True)

# Every row of rdf must land in exactly one split.
assert len(train_df) + len(vali_df) + len(test_df) == len(rdf)

In [25]:
# Remove the user_freq / item_freq count columns from each of the three splits.
freq_columns = ['user_freq', 'item_freq']
for split_frame in (train_df, test_df, vali_df):
    split_frame.drop(columns=freq_columns, inplace=True)

In [26]:
# Give every split a fresh 0..n-1 index, discarding the old one.
for split_frame in (train_df, test_df, vali_df):
    split_frame.reset_index(drop=True, inplace=True)

In [27]:
# generate list of items users like in train, vali, test sets for each user
#
# For every user in user_list (in that order) collect the item ids found for
# that user in the train / vali / test splits.  If a user has no validation
# (or test) item, one randomly chosen item is moved over from that user's
# training items, and the (user, item) row is moved between the split arrays
# as well.  The three per-user lists are saved as .npy files.

num_item = len(item_list)
num_user = len(user_list)

# Seeded generator so the moved items are identical on every run.
LIKE_SEED = 0
like_rng = np.random.RandomState(LIKE_SEED)

user_train_like = []
user_test_like = []
user_vali_like = []

train_array = train_df[['userId', 'itemId']].values
vali_array = vali_df[['userId', 'itemId']].values
test_array = test_df[['userId', 'itemId']].values


def _pop_random_item(likes, rng):
    """Pick one random entry of `likes`; return (item as int, likes without it)."""
    pos = rng.randint(len(likes))
    # int() makes the picked item a plain scalar; a 1-element array here would
    # produce a (1, 1)-shaped like-list and a ragged row for np.append below.
    return int(likes[pos]), np.delete(likes, pos)


def _without_pair(pairs, user, item):
    """Return `pairs` with every (user, item) row removed."""
    is_pair = (pairs[:, 0] == user) & (pairs[:, 1] == item)
    return pairs[~is_pair]


def _pack_ragged(arrays):
    """Return a 1-D object array whose k-th element is arrays[k]."""
    # The per-user arrays have different lengths.  Filling a pre-allocated
    # object array element by element avoids relying on np.array() to guess the
    # layout of ragged input, which newer NumPy versions reject.
    packed = np.empty(len(arrays), dtype=object)
    for pos, arr in enumerate(arrays):
        packed[pos] = arr
    return packed


for u in user_list:
    train_like = train_array[train_array[:, 0] == u, 1].astype(int)
    vali_like = vali_array[vali_array[:, 0] == u, 1].astype(int)
    test_like = test_array[test_array[:, 0] == u, 1].astype(int)

    # A user without any training item has nothing to give away; that user's
    # validation / test list is then left empty instead of raising.
    if len(vali_like) == 0 and len(train_like) > 0:
        new_vali, train_like = _pop_random_item(train_like, like_rng)
        vali_like = np.array([new_vali])
        train_array = _without_pair(train_array, u, new_vali)
        vali_array = np.append(vali_array, [[u, new_vali]], axis=0)
    if len(test_like) == 0 and len(train_like) > 0:
        new_test, train_like = _pop_random_item(train_like, like_rng)
        test_like = np.array([new_test])
        train_array = _without_pair(train_array, u, new_test)
        test_array = np.append(test_array, [[u, new_test]], axis=0)

    user_train_like.append(train_like)
    user_vali_like.append(vali_like)
    user_test_like.append(test_like)

# Object arrays are stored via pickle, so they must be read back with
# np.load(path, allow_pickle=True).
np.save('./user_train_like.npy', _pack_ragged(user_train_like))
np.save('./user_vali_like.npy', _pack_ragged(user_vali_like))
np.save('./user_test_like.npy', _pack_ragged(user_test_like))
    

In [28]:
# Rebuild the three split frames from the (user, item) arrays -- column 0 is
# the user id, column 1 the item id -- and write each one to CSV without the
# index column.
train_df, vali_df, test_df = (
    pd.DataFrame({'userId': pair_rows[:, 0], 'itemId': pair_rows[:, 1]})
    for pair_rows in (train_array, vali_array, test_array)
)

csv_targets = (
    (train_df, './train_df.csv'),
    (vali_df, './vali_df.csv'),
    (test_df, './test_df.csv'),
)
for split_frame, csv_path in csv_targets:
    split_frame.to_csv(csv_path, index=False)

In [29]:
# Total number of rows across the three splits (compare with len(rdf) below).
sum(len(split_frame) for split_frame in (train_df, vali_df, test_df))

996900

In [30]:
# Number of rows in the full interaction frame.
rdf.shape[0]

996900

In [32]:
# Store the user and item counts in a small pickle file.
dataset_info = {'num_user': num_user, 'num_item': num_item}
with open('./info.pkl', 'wb') as info_file:
    pickle.dump(dataset_info, info_file)