In [1]:
import pandas as pd
import numpy as np
import pickle
import copy
import operator
from tqdm import tqdm
from scipy.sparse import coo_matrix
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
# Load the raw interaction file. '::' is a multi-character separator, which
# only the Python parsing engine supports; requesting it explicitly avoids
# the ParserWarning pandas emits when it has to fall back from the C engine.
df = pd.read_csv('./ratings.dat', header=None, sep='::', engine='python')
# Keep only the first two columns and give them the names used downstream.
df = df.drop(columns=[2, 3]).rename(columns={0: 'uid', 1: 'iid'})
df.head(2)

  """Entry point for launching an IPython kernel.


Unnamed: 0,uid,iid
0,1,1193
1,1,661


In [3]:
# Item metadata; the key column is renamed to `iid` so it matches `df`.
item_df = pd.read_csv('../ml20m/movies.csv').rename(columns={'movieId': 'iid'})
item_df.head(2)

Unnamed: 0,iid,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [4]:
# Per-(item, tag) relevance scores; the key column is renamed to `iid` so it matches `df`.
feature_df = pd.read_csv('../ml20m/genome-scores.csv').rename(columns={'movieId': 'iid'})
feature_df.head(2)

Unnamed: 0,iid,tagId,relevance
0,1,1,0.025
1,1,2,0.025


In [5]:
# delete items from feature_df that have all zero features
# Sum the relevance scores per item with a groupby instead of reshaping the
# flat score column: this does not depend on the rows being sorted by item
# or on every item having the same number of tag rows.
item_relevance_total = feature_df.groupby('iid')['relevance'].sum()
keep_item_f = item_relevance_total.index[item_relevance_total != 0].values
feature_df = feature_df[feature_df['iid'].isin(keep_item_f)]

In [6]:
# Keep only the interactions whose item is listed in `keep_item_f`,
# then renumber the rows from 0.
has_features = df['iid'].isin(keep_item_f)
df = df[has_features].reset_index(drop=True)

In [7]:
# Keep only the item metadata rows whose item is listed in `keep_item_f`,
# then renumber the rows from 0.
item_has_features = item_df['iid'].isin(keep_item_f)
item_df = item_df[item_has_features].reset_index(drop=True)

In [8]:
# Build item id -> list of genres (the 'genres' string is '|'-separated),
# then tally how many items carry each genre and rank genres by that tally.
item_genre_dict = {
    item_id: genre_field.split('|')
    for item_id, genre_field in zip(item_df['iid'].values, item_df['genres'].values)
}

genre_item_count = dict()
for genres_of_item in item_genre_dict.values():
    for genre in genres_of_item:
        genre_item_count[genre] = genre_item_count.get(genre, 0) + 1

# Most frequent genre first.
genre_count_sorted = sorted(genre_item_count.items(), key=lambda pair: pair[1], reverse=True)
genre_count_sorted

[('Drama', 5037),
 ('Comedy', 3742),
 ('Thriller', 2010),
 ('Romance', 1754),
 ('Action', 1720),
 ('Crime', 1250),
 ('Adventure', 1176),
 ('Horror', 1025),
 ('Sci-Fi', 909),
 ('Fantasy', 695),
 ('Children', 620),
 ('Mystery', 615),
 ('Documentary', 468),
 ('Animation', 459),
 ('War', 440),
 ('Musical', 401),
 ('Western', 219),
 ('IMAX', 164),
 ('Film-Noir', 112),
 ('(no genres listed)', 1)]

In [9]:
# For each genre: (number of interactions on items of that genre) divided by
# (number of items of that genre), ranked from highest to lowest.
genre_rating_count = dict()
itemIds = df['iid'].values
for itemId in itemIds:
    for genre in item_genre_dict[itemId]:
        genre_rating_count[genre] = genre_rating_count.get(genre, 0.) + 1.
genre_rating_count = {
    genre: total / (genre_item_count[genre] * 1.)
    for genre, total in genre_rating_count.items()
}
genre_count_sorted = sorted(genre_rating_count.items(), key=lambda pair: pair[1], reverse=True)
genre_count_sorted

[('Sci-Fi', 176.25302530253026),
 ('Adventure', 168.77636054421768),
 ('Action', 147.5732558139535),
 ('Film-Noir', 132.90178571428572),
 ('Fantasy', 130.9453237410072),
 ('Children', 128.81612903225806),
 ('War', 122.25454545454545),
 ('Thriller', 119.21094527363184),
 ('Musical', 118.9077306733167),
 ('Mystery', 118.81138211382114),
 ('Western', 113.77168949771689),
 ('Crime', 112.1328),
 ('Romance', 107.38768529076397),
 ('Comedy', 103.60208444681989),
 ('Animation', 96.04575163398692),
 ('Drama', 87.14592019058964),
 ('Horror', 84.64487804878048),
 ('IMAX', 24.78048780487805),
 ('Documentary', 16.337606837606838)]

In [10]:
# The genres that received at least one interaction (the keys of genre_rating_count).
genres_used = set(genre_rating_count)
genres_used

{'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [11]:
# filter items by genres
# Drop the items whose genre string is exactly the '(no genres listed)'
# placeholder, using a boolean mask instead of a row-by-row loop, and record
# the ids of the items that remain.
no_genre_mask = item_df['genres'] == '(no genres listed)'
item_df = item_df[~no_genre_mask].reset_index(drop=True)
keep_item_g = item_df['iid'].unique()

In [12]:
# Keep only the interactions whose item is listed in `keep_item_g`,
# then renumber the rows from 0.
has_genre = df['iid'].isin(keep_item_g)
df = df[has_genre].reset_index(drop=True)

In [13]:
# Keep only the feature rows whose item is listed in `keep_item_g`,
# then renumber the rows from 0.
feature_has_genre = feature_df['iid'].isin(keep_item_g)
feature_df = feature_df[feature_has_genre].reset_index(drop=True)

In [14]:
# Distinct items and users left in `df` after the feature/genre filtering.
item_set, user_set = (set(df[column].unique()) for column in ('iid', 'uid'))
for label, members in (('item num = ', item_set), ('user num = ', user_set)):
    print(label + str(len(members)))

item num = 3470
user num = 6040


In [15]:
# Work on a separate copy so the frequency filtering below leaves `df` untouched.
rdf = df.copy()

In [16]:
# iteratively remove items and users with less than 20 reviews
# Each pass drops the users below the threshold, then the items below the
# threshold. Passes repeat until one removes no rows, so the result satisfies
# both constraints. The pass always runs at least once, which guarantees that
# the item filter is applied and that the 'user_freq' / 'item_freq' helper
# columns exist even when every user already meets the threshold. An empty
# frame terminates immediately instead of failing in np.min.
min_count = 20
rdf.reset_index(drop=True, inplace=True)

while True:
    rows_before = len(rdf)
    rdf['user_freq'] = rdf.groupby('uid')['uid'].transform('count')
    rdf.drop(rdf.index[rdf['user_freq'] < min_count], inplace=True)
    rdf['item_freq'] = rdf.groupby('iid')['iid'].transform('count')
    rdf.drop(rdf.index[rdf['item_freq'] < min_count], inplace=True)
    rdf.reset_index(drop=True, inplace=True)
    if len(rdf) == rows_before:
        # Nothing was removed: both frequency columns describe the final data.
        break

In [17]:
# The frequency columns were only needed by the filtering step; remove them.
rdf = rdf.drop(columns=['user_freq', 'item_freq'])

In [18]:
# Distinct ids that survived the filtering, plus summary statistics.
item_list = rdf['iid'].unique()
user_list = rdf['uid'].unique()
n_movies = len(item_list)
n_users = len(user_list)
print('movie num = ' + str(n_movies))
print('user num = ' + str(n_users))
# The value printed as "sparsity" is the fraction of (user, item) pairs present in rdf.
print('sparsity: ' + str(len(rdf) * 1.0 / (n_users * n_movies)))

movie num = 3018
user num = 6018
sparsity: 0.05464058454193417


In [19]:
# Restrict item_df to the items that are still present in rdf (item_list),
# then renumber the rows from 0.
item_survives = item_df['iid'].isin(item_list)
item_df = item_df[item_survives].reset_index(drop=True)

In [20]:
# Restrict feature_df to the items that are still present in rdf (item_list),
# then renumber the rows from 0.
feature_survives = feature_df['iid'].isin(item_list)
feature_df = feature_df[feature_survives].reset_index(drop=True)

In [21]:
# Map each original user / item id to a contiguous integer id (0, 1, 2, ...),
# numbered in the order the ids appear in user_list / item_list.
user_old2new_id_dict = dict()
for old_uid in user_list:
    if old_uid not in user_old2new_id_dict:
        user_old2new_id_dict[old_uid] = len(user_old2new_id_dict)

item_old2new_id_dict = dict()
for old_iid in item_list:
    if old_iid not in item_old2new_id_dict:
        item_old2new_id_dict[old_iid] = len(item_old2new_id_dict)

In [22]:
# Replace every original item id in item_df with its new contiguous id.
# A direct dict lookup is used so that an unknown id raises KeyError.
old_item_ids = item_df['iid'].values
item_df['iid'] = np.array(
    [item_old2new_id_dict[old_id] for old_id in old_item_ids],
    dtype=old_item_ids.dtype,
)

In [23]:
# convert the original item ids in feature_df to the new contiguous int ids
# Series.map performs the lookup in one vectorized pass and returns a new
# Series, so the array backing feature_df is never mutated in place (the
# array returned by `.values` can alias the frame's data, and is read-only
# when pandas Copy-on-Write is enabled).
new_feature_iid = feature_df['iid'].map(item_old2new_id_dict)
if new_feature_iid.isna().any():
    # map() yields NaN for unknown keys; fail loudly like a dict lookup would.
    raise KeyError('feature_df contains item ids that are missing from item_old2new_id_dict')
feature_df['iid'] = new_feature_iid.astype('int64')

In [24]:
# Rewrite rdf so that users and items are identified by their new contiguous ids,
# then refresh item_list / user_list so they hold the new ids as well.
userIds = rdf['uid'].values
itemIds = rdf['iid'].values
userIdsNew = np.array([user_old2new_id_dict[old] for old in userIds], dtype=userIds.dtype)
itemIdsNew = np.array([item_old2new_id_dict[old] for old in itemIds], dtype=itemIds.dtype)
rdf['uid'] = userIdsNew
rdf['iid'] = itemIdsNew
user_list = rdf['uid'].unique()
item_list = rdf['iid'].unique()

In [25]:
# split off the cold-start items and their interactions
# 40% of all items are held out as cold items:
#   cold test = 3/4 of the cold items (about 30% of all items)
#   cold vali = the remaining cold items (about 10% of all items)
# A dedicated, seeded generator makes the split reproducible across kernel
# restarts without touching NumPy's global random state.
cold_rng = np.random.RandomState(0)
cold_item_idx = cold_rng.choice(np.arange(len(item_list)), int(len(item_list) * 0.40), replace=False).tolist()
cold_itemIds = item_list[cold_item_idx]
test_idx = cold_rng.choice(np.arange(len(cold_itemIds)), int(len(cold_itemIds) * 3. / 4.), replace=False).tolist()
# sorted() gives the validation positions a deterministic order.
vali_idx = sorted(set(range(len(cold_itemIds))) - set(test_idx))
cold_test_itemIds = cold_itemIds[test_idx]
cold_vali_itemIds = cold_itemIds[vali_idx]

# reset_index is chained (not in-place) so it never operates on a slice of rdf.
cold_test_df = rdf[rdf['iid'].isin(cold_test_itemIds)].reset_index(drop=True)
cold_vali_df = rdf[rdf['iid'].isin(cold_vali_itemIds)].reset_index(drop=True)

In [26]:
# split the warm (non-cold) interactions into train and validation sets
# warm vali = 10% of the warm interactions, train = the remaining 90%
# (no warm test set is produced by this notebook)
warm_df = rdf[~rdf['iid'].isin(cold_itemIds)].reset_index(drop=True)

train_ratio = 0.9
vali_ratio = 0.1

vali_size = int(vali_ratio * len(warm_df))

# A dedicated, seeded generator makes the split reproducible across kernel
# restarts without touching NumPy's global random state.
warm_rng = np.random.RandomState(1)
vali_idx = warm_rng.choice(np.arange(len(warm_df)), vali_size, replace=False).tolist()

# warm_df has a fresh 0..n-1 index, so the sampled positions are also labels.
warm_vali_df = warm_df.loc[vali_idx].reset_index(drop=True)
train_df = warm_df.drop(index=vali_idx).reset_index(drop=True)

In [27]:
# Restrict the warm validation set and both cold sets to users that also
# appear in the training set, renumbering the rows of each frame from 0.
train_user_list = train_df['uid'].unique()

warm_vali_df = warm_vali_df[warm_vali_df['uid'].isin(train_user_list)].reset_index(drop=True)
cold_vali_df = cold_vali_df[cold_vali_df['uid'].isin(train_user_list)].reset_index(drop=True)
cold_test_df = cold_test_df[cold_test_df['uid'].isin(train_user_list)].reset_index(drop=True)



In [28]:
# Restrict the warm validation set to items that also appear in the training
# set, renumbering its rows from 0.
train_item_list = train_df['iid'].unique()
vali_item_in_train = warm_vali_df['iid'].isin(train_item_list)
warm_vali_df = warm_vali_df[vali_item_in_train].reset_index(drop=True)




In [29]:
# Write the full interaction table and every split to CSV (row index omitted).
frames_to_save = (
    (rdf, './rdf.csv'),
    (train_df, './train_df.csv'),
    (warm_vali_df, './warm_vali_df.csv'),
    (cold_vali_df, './cold_vali_df.csv'),
    (cold_test_df, './cold_test_df.csv'),
)
for frame, csv_path in frames_to_save:
    frame.to_csv(csv_path, index=False)

In [30]:
# count the number for each genre and sort
# Build item id -> genres, keeping only the genres in `genres_used`.
# The genres keep the order they have in the item's '|'-separated string
# (duplicates removed), so the stored lists no longer depend on set
# iteration order and are identical from run to run.
item_genre_dict = dict()
for i in range(len(item_df)):
    genre_list = item_df.at[i, 'genres'].split('|')
    # dict.fromkeys de-duplicates while preserving first-seen order
    genre_overlap = list(dict.fromkeys(g for g in genre_list if g in genres_used))
    item_genre_dict[item_df.at[i, 'iid']] = genre_overlap

# Tally how many items carry each genre.
genre_item_count = dict()
for l in item_genre_dict:
    for g in item_genre_dict[l]:
        genre_item_count[g] = genre_item_count.get(g, 0) + 1

genre_count_sorted = sorted(genre_item_count.items(), key=operator.itemgetter(1), reverse=True)
genre_count_sorted

[('Drama', 1417),
 ('Comedy', 1096),
 ('Thriller', 569),
 ('Romance', 542),
 ('Action', 473),
 ('Adventure', 374),
 ('Crime', 326),
 ('Horror', 314),
 ('Sci-Fi', 273),
 ('Children', 256),
 ('Fantasy', 189),
 ('Mystery', 168),
 ('Musical', 131),
 ('War', 121),
 ('Animation', 103),
 ('Western', 69),
 ('Documentary', 57),
 ('Film-Noir', 40),
 ('IMAX', 5)]

In [31]:
# calculate feedback/item_count for each genre
# Count the training interactions per genre, then divide by the number of
# items of that genre.
genre_rating_count = dict()
itemIds = train_df['iid'].values
for itemId in itemIds:
    for g in item_genre_dict[itemId]:
        genre_rating_count[g] = genre_rating_count.get(g, 0) + 1
# Normalise only the genres that actually received training interactions.
# Iterating genre_item_count here would raise KeyError for a genre that
# exists in item_df but has no interaction in train_df.
for g in genre_rating_count:
    genre_rating_count[g] /= genre_item_count[g] * 1.
genre_count_sorted = sorted(genre_rating_count.items(), key=operator.itemgetter(1), reverse=True)
genre_count_sorted

[('IMAX', 531.4),
 ('Sci-Fi', 337.5274725274725),
 ('Action', 319.737843551797),
 ('Adventure', 316.24598930481284),
 ('War', 291.25619834710744),
 ('Crime', 251.76687116564418),
 ('Fantasy', 245.1216931216931),
 ('Thriller', 245.06854130052724),
 ('Mystery', 238.80357142857142),
 ('Animation', 223.2621359223301),
 ('Western', 208.30434782608697),
 ('Comedy', 203.46989051094891),
 ('Romance', 201.19741697416976),
 ('Film-Noir', 192.075),
 ('Musical', 186.3969465648855),
 ('Drama', 175.23570924488357),
 ('Children', 170.5859375),
 ('Horror', 126.45222929936305),
 ('Documentary', 79.29824561403508)]

In [32]:
# Renumber the rows of item_df from 0 and write it to CSV (row index omitted).
item_df = item_df.reset_index(drop=True)
item_df.to_csv('./item_df.csv', index=False)

In [33]:
# Serialize the item id -> genre list mapping with pickle.
with open('./item_genre_dict.pkl', 'wb') as genre_file:
    pickle.dump(item_genre_dict, genre_file)

In [34]:
# Serialize the number of distinct users and items with pickle.
dataset_info = {'num_user': len(user_list), 'num_item': len(item_list)}
with open('./info.pkl', 'wb') as info_file:
    pickle.dump(dataset_info, info_file)

In [35]:
# generate, for each user, the list of items they interacted with in the
# train, warm_vali, cold_vali and cold_test sets, and save each as a .npy file


def _items_of_user(pairs, uid):
    """Return the item ids (as int) of the rows of the (uid, iid) array `pairs` whose uid equals `uid`."""
    return pairs[pairs[:, 0] == uid, 1].astype(int)


def _as_object_array(per_user):
    """Pack a list of variable-length arrays into a 1-D object array, one slot per user.

    np.array() on a ragged list raises ValueError on NumPy >= 1.24, so each
    slot is filled explicitly. The result is 1-D even when every per-user
    array happens to have the same length.
    """
    packed = np.empty(len(per_user), dtype=object)
    for pos, liked in enumerate(per_user):
        packed[pos] = liked
    return packed


num_item = len(item_list)
num_user = len(user_list)

user_train_like = [[] for _ in range(num_user)]
user_warm_vali_like = [[] for _ in range(num_user)]
user_cold_test_like = [[] for _ in range(num_user)]
user_cold_vali_like = [[] for _ in range(num_user)]

train_array = train_df[['uid', 'iid']].values
warm_vali_array = warm_vali_df[['uid', 'iid']].values
cold_vali_array = cold_vali_df[['uid', 'iid']].values
cold_test_array = cold_test_df[['uid', 'iid']].values

# The user id is used directly as the list position below.
for u in tqdm(user_list):
    user_train_like[u] = _items_of_user(train_array, u)
    user_warm_vali_like[u] = _items_of_user(warm_vali_array, u)
    user_cold_vali_like[u] = _items_of_user(cold_vali_array, u)
    user_cold_test_like[u] = _items_of_user(cold_test_array, u)

# Object arrays are stored via pickle: load them with np.load(..., allow_pickle=True).
np.save('./user_train_like.npy', _as_object_array(user_train_like))
np.save('./user_warm_vali_like.npy', _as_object_array(user_warm_vali_like))
np.save('./user_cold_vali_like.npy', _as_object_array(user_cold_vali_like))
np.save('./user_cold_test_like.npy', _as_object_array(user_cold_test_like))

100%|██████████| 6018/6018 [00:04<00:00, 1318.94it/s]


In [36]:
# Build a dense (item, tag) relevance matrix from the long-format feature_df
# and save it. Tag ids are shifted down by one to become column positions.
iid_list = feature_df['iid'].values.astype(int)
tag_list = feature_df['tagId'].values.astype(int) - 1
score_list = feature_df['relevance'].values

content_shape = (len(item_list), np.max(tag_list) + 1)
sparse_content = coo_matrix((score_list, (iid_list, tag_list)), shape=content_shape)
feature_mat = sparse_content.toarray()
np.save('./item_content.npy', feature_mat)

In [37]:
# Audience size of each item = number of rows of rdf that reference it.
# The item id is used as the position in the output array.
item_counts = rdf['iid'].value_counts()
item_pop = np.array(item_counts)
item_pop_id = np.array(item_counts.index)
item_AS_list = np.zeros(len(rdf['iid'].unique()))
# value_counts() lists each item id once, so one indexed assignment fills every slot.
item_AS_list[item_pop_id] = item_pop

np.save('./item_audience_size_list.npy', item_AS_list)