In [1]:
import json
import numpy as np
import pandas as pd
import pickle
import copy

In [2]:
# Load the raw '::'-delimited ratings file. Only the (userId, itemId) pairs
# are kept; the rating value and the timestamp columns are discarded.
rating_columns = ["userId", "itemId", "rating", "timestamp"]
rating_df = pd.read_csv('./ratings.dat', sep='::', names=rating_columns, engine='python')
rating_df.drop(columns=['timestamp', 'rating'], inplace=True)
rating_df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,userId,itemId
0,1,1193
1,1,661
2,1,914
3,1,3408
4,1,2355


In [3]:
# Load the '::'-delimited movie metadata: item id, title, pipe-separated genres.
# The encoding is passed explicitly because current pandas decodes as strict
# UTF-8 by default and raises UnicodeDecodeError on non-UTF-8 title bytes.
# NOTE(review): 'latin-1' matches the official MovieLens movies.dat; confirm
# if this copy of the file has been re-encoded.
item_df = pd.read_csv('./movies.dat', sep='::', names=['itemId', 'title', 'genres'],
                      engine='python', encoding='latin-1')
item_df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,itemId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Map each itemId to the list of genre names obtained by splitting its
# '|'-separated genres string.
item_genre_dict = {
    movie_id: genre_field.split('|')
    for movie_id, genre_field in zip(item_df['itemId'], item_df['genres'])
}

In [None]:
# Show the row count, collapse repeated (itemId, userId) pairs down to their
# first occurrence, then show the row count again.
pair_key_columns = ['itemId', 'userId']
print(len(rating_df))
rating_df.drop_duplicates(subset=pair_key_columns, keep='first', inplace=True)
print(len(rating_df))

In [None]:
# Distinct item and user ids present in the ratings, with their counts.
user_set = set(rating_df['userId'].unique())
item_set = set(rating_df['itemId'].unique())
print(f'item num = {len(item_set)}')
print(f'user num = {len(user_set)}')

In [None]:
# Tally how many items carry each genre, then list the (genre, count) pairs
# from most to least frequent.
import operator

genre_count = dict()
for genres_of_item in item_genre_dict.values():
    for genre_name in genres_of_item:
        genre_count[genre_name] = genre_count.get(genre_name, 0) + 1

genre_count_sorted = sorted(
    genre_count.items(),
    key=operator.itemgetter(1),
    reverse=True,
)
genre_count_sorted

In [None]:
# Renumber the rows 0..n-1 and keep an independent snapshot of the ratings so
# the filtering cells below can be re-run from this state.
rating_df.reset_index(inplace=True, drop=True)
rdf_backup = rating_df.copy()

In [None]:
# Start the working frame from a fresh copy of the snapshot.
rdf = rdf_backup.copy()

In [None]:
# Iteratively remove users and items that have fewer than MIN_INTERACTIONS
# rows, repeating until every remaining user AND every remaining item meets
# the threshold. Removing one side can push the other side below it.
#
# The loop condition checks both frequencies: checking only the user side
# skips the item filter entirely whenever all users already pass.
MIN_INTERACTIONS = 10

rdf['user_freq'] = rdf.groupby('userId')['userId'].transform('count')
rdf['item_freq'] = rdf.groupby('itemId')['itemId'].transform('count')
while len(rdf) > 0 and (rdf['user_freq'].min() < MIN_INTERACTIONS
                        or rdf['item_freq'].min() < MIN_INTERACTIONS):
    # Drop sparse users, then refresh item counts, which may have shrunk.
    rdf.drop(rdf.index[rdf['user_freq'] < MIN_INTERACTIONS], inplace=True)
    rdf['item_freq'] = rdf.groupby('itemId')['itemId'].transform('count')
    # Drop sparse items, then refresh user counts for the next check.
    # item_freq stays correct for the surviving items because whole items
    # are removed at once.
    rdf.drop(rdf.index[rdf['item_freq'] < MIN_INTERACTIONS], inplace=True)
    rdf['user_freq'] = rdf.groupby('userId')['userId'].transform('count')

# Later cells index rows by position, so leave a clean 0..n-1 index.
rdf.reset_index(drop=True, inplace=True)

In [None]:
# Distinct ids remaining in rdf, in order of first appearance, and how many.
user_list = rdf['userId'].unique()
item_list = rdf['itemId'].unique()
print(f'item num = {len(item_list)}')
print(f'user num = {len(user_list)}')

In [None]:
# Build old-id -> new-id lookup tables for users and items. New ids are
# consecutive integers starting at 0, handed out in the order the old ids
# first appear in user_list / item_list (repeated ids keep their first number).
user_old2new_id_dict = {
    old_id: new_id
    for new_id, old_id in enumerate(dict.fromkeys(user_list))
}
item_old2new_id_dict = {
    old_id: new_id
    for new_id, old_id in enumerate(dict.fromkeys(item_list))
}

In [None]:
# Ratio of rows in rdf to the number of possible user-item pairs
# (i.e. the filled fraction of the interaction matrix).
observed_pairs = len(rdf) * 1.0
possible_pairs = len(user_list) * len(item_list)
print('sparsity: ' + str(observed_pairs / possible_pairs))

In [None]:
# Restrict item_df to the items that have a new id and replace each original
# itemId with that new id.
#
# Done with vectorised operations rather than a row-by-row loop, so it does
# not depend on the index being 0..n-1 and avoids one drop call per row.
# Surviving rows keep their existing index labels.
known_item_mask = item_df['itemId'].isin(list(item_old2new_id_dict))
item_df.drop(item_df.index[~known_item_mask], inplace=True)
item_df['itemId'] = item_df['itemId'].map(item_old2new_id_dict).astype('int64')

In [None]:
# Replace the original user and item ids in rdf with the new contiguous ids.
#
# Series.map does the lookup for the whole column at once instead of a Python
# loop over every row, and it works regardless of the index labels.
new_user_ids = rdf['userId'].map(user_old2new_id_dict)
new_item_ids = rdf['itemId'].map(item_old2new_id_dict)

# map() yields NaN for ids missing from the dict; fail loudly, as a direct
# dict lookup would, instead of silently writing NaN into the id columns.
if new_user_ids.isna().any() or new_item_ids.isna().any():
    raise KeyError('rdf contains a userId or itemId that is missing from the old->new id dicts')

rdf['userId'] = new_user_ids.astype('int64')
rdf['itemId'] = new_item_ids.astype('int64')

# Refresh the id lists so they hold the new ids.
item_list = rdf['itemId'].unique()
user_list = rdf['userId'].unique()

In [None]:
# Randomly partition the rows of rdf into train / validation / test frames.
#
# The shuffle uses a private, seeded RandomState so the split (and every file
# derived from it) is identical on each run and the global NumPy random state
# is left untouched.
rdf.reset_index(inplace=True, drop=True)

train_ratio = 0.7
vali_ratio = 0.1
test_ratio = 0.2
# train_ratio is implied by the other two; make sure the three stay consistent.
assert abs(train_ratio + vali_ratio + test_ratio - 1.0) < 1e-9, 'split ratios must sum to 1'

SPLIT_SEED = 0
split_rng = np.random.RandomState(SPLIT_SEED)

vali_size = int(vali_ratio * len(rdf))
test_size = int(test_ratio * len(rdf))

# One permutation of the row positions, cut into three disjoint slices.
# All three index lists are positions into rdf.
shuffled_rows = split_rng.permutation(len(rdf))
vali_idx = shuffled_rows[:vali_size].tolist()
test_idx = shuffled_rows[vali_size:vali_size + test_size].tolist()
# Sorted so the training rows keep their original relative order.
train_idx = np.sort(shuffled_rows[vali_size + test_size:]).tolist()

vali_df = rdf.iloc[vali_idx].reset_index(drop=True)
test_df = rdf.iloc[test_idx].reset_index(drop=True)
train_df = rdf.iloc[train_idx].reset_index(drop=True)

In [None]:
# Compute each item's popularity (number of rows it has in train_df) and
# store it in item_df['pop']; items absent from train_df get 0.
#
# The counts are joined onto item_df with a single map() call. `.at` is a
# scalar accessor and must not be given a boolean mask, so it is not used.
item_pop_counts = train_df['itemId'].value_counts()
item_pop = np.array(item_pop_counts)
item_pop_id = np.array(item_pop_counts.index)
item_df['pop'] = item_df['itemId'].map(item_pop_counts).fillna(0).astype('int64')

In [None]:
# The frequency helper columns are no longer needed in any of the splits.
freq_columns = ['user_freq', 'item_freq']
for split_frame in (train_df, test_df, vali_df):
    split_frame.drop(columns=freq_columns, inplace=True)

In [None]:
# Give every split a fresh 0..n-1 index, discarding the old labels.
for split_frame in (train_df, test_df, vali_df):
    split_frame.reset_index(inplace=True, drop=True)

In [None]:
# For every user in user_list, collect the item ids that user has in the
# train, validation and test splits, and save each collection to a .npy file.

num_item = len(item_list)
num_user = len(user_list)

train_array = train_df[['userId', 'itemId']].values
vali_array = vali_df[['userId', 'itemId']].values
test_array = test_df[['userId', 'itemId']].values


def _likes_by_user(pair_array, users):
    """Group the item column of `pair_array` by its user column.

    `pair_array` is a 2-column array of (userId, itemId) rows. Returns a list
    with one integer array per entry of `users`, in the same order, holding
    that user's item ids in their original row order. Users with no rows get
    an empty array.

    One stable sort groups every user at once, replacing a full scan of the
    array per user.
    """
    order = np.argsort(pair_array[:, 0], kind='mergesort')  # mergesort is stable
    users_sorted = pair_array[order, 0]
    items_sorted = pair_array[order, 1].astype(int)
    present_users, first_pos = np.unique(users_sorted, return_index=True)
    chunks = np.split(items_sorted, first_pos[1:])
    lookup = dict(zip(present_users.tolist(), chunks))
    return [lookup[u] if u in lookup else np.empty(0, dtype=int) for u in users]


def _as_object_array(arrays):
    """Pack arrays of differing lengths into a 1-D object array.

    Calling np.array directly on such a list raises ValueError on recent
    NumPy, so the container is allocated first and filled element by element.
    """
    packed = np.empty(len(arrays), dtype=object)
    for pos, arr in enumerate(arrays):
        packed[pos] = arr
    return packed


user_train_like = _likes_by_user(train_array, user_list)
user_vali_like = _likes_by_user(vali_array, user_list)
user_test_like = _likes_by_user(test_array, user_list)

# Object arrays are pickled inside the .npy file, so they must be read back
# with np.load(..., allow_pickle=True).
np.save('./user_train_like.npy', _as_object_array(user_train_like))
np.save('./user_vali_like.npy', _as_object_array(user_vali_like))
np.save('./user_test_like.npy', _as_object_array(user_test_like))
    

In [None]:
# Write each split to its own CSV file, without the index column.
split_outputs = (
    (train_df, './train_df.csv'),
    (vali_df, './vali_df.csv'),
    (test_df, './test_df.csv'),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path, index=False)

In [None]:
# movie_df keeps the full item table (including title and genres); item_df
# is reduced to the remaining columns. Both are written out without the index.
item_df.reset_index(inplace=True, drop=True)
movie_df = item_df.copy()
movie_df.to_csv('./movie_df.csv', index=False)

text_columns = ['title', 'genres']
item_df.drop(columns=text_columns, inplace=True)
item_df.to_csv('./item_df.csv', index=False)

In [None]:
# Persist the user and item counts as a small pickled dict.
dataset_info = {'num_user': num_user, 'num_item': num_item}
with open('./info.pkl', 'wb') as info_file:
    pickle.dump(dataset_info, info_file)