In [1]:
import json
import numpy as np
import pandas as pd
import pickle
import copy

In [2]:
# Load the raw ratings. The '::' separator is longer than one character, so
# pandas treats it as a regular expression, which only the Python parsing
# engine supports. Requesting that engine explicitly avoids the ParserWarning
# that is emitted when the default C engine has to fall back to it.
rdf = pd.read_csv('./ratings.csv', sep='::', engine='python')
# Discard the 'time' column before any further processing.
rdf.drop(columns=['time'], inplace=True)

  """Entry point for launching an IPython kernel.


In [3]:
# Load the item metadata. As with the ratings file, the multi-character '::'
# separator is only supported by the Python parsing engine; naming it
# explicitly avoids the fallback ParserWarning.
item_df = pd.read_csv('./movies.csv', sep='::', engine='python')
item_df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Map every item_id to the list of genre names obtained by splitting the
# pipe-delimited 'genres' string found on the same row of item_df.
item_genre_dict = {
    movie_id: genres.split('|')
    for movie_id, genres in zip(item_df['item_id'].values, item_df['genres'].values)
}

In [5]:
# Distinct item and user ids that occur in the ratings frame.
item_set = set(rdf['item_id'].unique())
user_set = set(rdf['user_id'].unique())
print('item num = {}'.format(len(item_set)))
print('user num = {}'.format(len(user_set)))

item num = 10677
user num = 69878


In [6]:
# count the number for each genre and sort
import operator

# Tally how many items carry each genre label.
genre_count = dict()
for genres in item_genre_dict.values():
    for g in genres:
        genre_count[g] = genre_count.get(g, 0) + 1

# (genre, count) pairs, most frequent genre first.
genre_count_sorted = sorted(genre_count.items(), key=operator.itemgetter(1), reverse=True)
genre_count_sorted

[('Drama', 5339),
 ('Comedy', 3703),
 ('Thriller', 1706),
 ('Romance', 1685),
 ('Action', 1473),
 ('Crime', 1118),
 ('Adventure', 1025),
 ('Horror', 1013),
 ('Sci-Fi', 754),
 ('Fantasy', 543),
 ('Children', 528),
 ('War', 511),
 ('Mystery', 509),
 ('Documentary', 482),
 ('Musical', 436),
 ('Animation', 286),
 ('Western', 275),
 ('Film-Noir', 148),
 ('IMAX', 29),
 ('(no genres listed)', 1)]

In [7]:
key_genre = ['Comedy', 'Romance', 'Thriller', 'Action', 'Documentary', 'Sci-Fi', 'Animation', 'Horror']

# get the key_genre->item_list dict
# Every key genre starts with an empty list; an item is appended to the list
# of each key genre that appears among its genres.
key_genre_item = {k: [] for k in key_genre}
for item, genres in item_genre_dict.items():
    for g in genres:
        if g in key_genre_item:
            key_genre_item[g].append(item)

In [8]:
# collect all the items with key genres
# Union of the per-genre item lists: every item tagged with at least one key genre.
key_item_set = set().union(*key_genre_item.values())

# Rated items that carry none of the key genres.
nonkey_item_set = item_set - key_item_set

In [10]:
# remove the non-key genre items in rdf
# Collect the index labels of every rating whose item carries none of the key
# genres. One vectorised membership test replaces a full scan of rdf per
# non-key item. The labels now come out in index order rather than grouped by
# item, which is irrelevant because the list is only used to drop rows.
remove_list = rdf.index[rdf['item_id'].isin(list(nonkey_item_set))].tolist()

In [11]:
# Delete, in place, the rows whose index labels were collected in remove_list.
rdf.drop(labels=remove_list, axis=0, inplace=True)

In [12]:
# Renumber the remaining rows 0..n-1, then keep a copy of the frame under the
# name rating_df.
rdf.reset_index(drop=True, inplace=True)
rating_df = rdf.copy()

In [13]:
# Rebind rdf to a copy of rating_df, so rdf can be rebuilt from that frame
# by re-running this cell.
rdf = copy.copy(rating_df)

In [15]:
# Frequency filter, applied once (the steps are not repeated until convergence):
#   1. drop the ratings of users that have 70 or fewer ratings,
#   2. among what is left, drop the ratings of items that have 10 or fewer ratings,
#   3. recompute user_freq on the final frame (no rows are dropped in this step).
rdf.reset_index(drop=True, inplace=True)

rdf['user_freq'] = rdf.groupby('user_id')['user_id'].transform('count')
low_activity_rows = rdf.index[rdf['user_freq'] <= 70]
rdf.drop(low_activity_rows, inplace=True)
rdf.reset_index(drop=True, inplace=True)

rdf['item_freq'] = rdf.groupby('item_id')['item_id'].transform('count')
rare_item_rows = rdf.index[rdf['item_freq'] <= 10]
rdf.drop(rare_item_rows, inplace=True)
rdf.reset_index(drop=True, inplace=True)

rdf['user_freq'] = rdf.groupby('user_id')['user_id'].transform('count')
rdf.reset_index(drop=True, inplace=True)

# Number of remaining ratings per user, most active user first.
rdf['user_id'].value_counts()

59269    5167
67385    5125
14463    3814
3817     3602
27468    3299
68259    3272
19635    3138
58357    3024
63134    2803
8811     2552
6757     2482
19379    2425
30687    2409
42791    2409
31327    2398
47345    2353
62332    2336
56707    2286
47046    2280
27584    2274
38928    2251
7795     2223
30500    2214
59659    2194
58087    2182
1860     2137
59598    2127
43992    2122
17438    2115
30723    2107
         ... 
56681      71
24516      71
53934      71
12097      71
708        71
53980      71
4173       71
31700      71
58070      71
51158      71
35788      71
56929      71
66288      71
44652      71
11227      71
40549      71
726        71
42964      71
6698       71
67852      71
64055      71
17616      71
64723      71
37620      71
42518      71
67112      71
16707      71
27300      71
1231       71
62823      71
Name: user_id, Length: 30636, dtype: int64

In [16]:
# Distinct item and user ids left in rdf, in order of first appearance.
item_list = rdf['item_id'].unique()
user_list = rdf['user_id'].unique()
print('item num = %d' % len(item_list))
print('user num = %d' % len(user_list))

item num = 7129
user num = 30636


In [17]:
# get the user and item str id->int id dict
# Integer ids are handed out consecutively (0, 1, 2, ...) in the order the
# original ids appear in user_list / item_list; an id that is already present
# keeps the integer it received first.
user_id_dict = dict()
for raw_user in user_list:
    user_id_dict.setdefault(raw_user, len(user_id_dict))

item_id_dict = dict()
for raw_item in item_list:
    item_id_dict.setdefault(raw_item, len(item_id_dict))

In [18]:
# Number of ratings divided by the number of (user, item) cells, i.e. the
# fraction of the user-item matrix that is filled.
filled_ratio = float(len(rdf)) / (len(user_list) * len(item_list))
print('sparsity: ' + str(filled_ratio))

sparsity: 0.0322846448759


In [19]:
# get the df of train, vali, and test set
import numpy as np

train_ratio = 0.6
vali_ratio = 0.2
test_ratio = 0.2
num_all = len(rdf)
num_user = len(user_list)

# Dedicated, seeded generator: the split is reproducible across kernel
# restarts and NumPy's global random state is left untouched.
split_rng = np.random.RandomState(0)

# Index labels of every user's ratings, gathered in a single pass instead of
# scanning the whole frame once per user.
user_groups = rdf.groupby('user_id').groups

# Hold out (test_ratio + vali_ratio) of each user's ratings, at least one.
test_vali_idx = []
i = 0
for u in user_list:
    u_idx = np.asarray(user_groups[u])
    test_len = max(int(len(u_idx) * (test_ratio + vali_ratio)), 1)
    test_vali_idx += split_rng.choice(u_idx, size=test_len, replace=False).tolist()
    i += 1
    if i % 5000 == 0:
        print(str(i) + '/' + str(num_user))

# Cut the held-out pool into DISJOINT test and validation parts by shuffling
# it once and slicing. Sampling the two parts independently from the same pool
# lets them overlap: a rating drawn for both would be dropped from all three
# frames, and a held-out rating drawn for neither would fall back into train.
shuffled = split_rng.permutation(test_vali_idx)
test_len = int(len(shuffled) * test_ratio / (test_ratio + vali_ratio))
vali_len = int(len(shuffled) - test_len)
test_idx = shuffled[:test_len].tolist()
vali_idx = shuffled[test_len:].tolist()

test_set = set(test_idx)
vali_set = set(vali_idx)
train_set = set(rdf.index) - test_set - vali_set
train_idx = list(train_set)

# Each frame keeps only its own rows (original index labels are preserved).
train_df = rdf.drop(test_idx + vali_idx, axis=0)
test_df = rdf.drop(train_idx + vali_idx, axis=0)
vali_df = rdf.drop(train_idx + test_idx, axis=0)

# Every rating must land in exactly one of the three frames.
assert len(train_df) + len(test_df) + len(vali_df) == num_all

5000/30636
10000/30636
15000/30636
20000/30636
25000/30636
30000/30636


In [20]:
# get the matrix of train, vali and test set

train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)
vali_df.reset_index(drop=True, inplace=True)
rdf.reset_index(drop=True, inplace=True)


def _ratings_to_matrix(frame):
    """Return a dense (len(user_list), len(item_list)) array of the ratings in `frame`.

    Row and column positions are the integer ids from user_id_dict and
    item_id_dict; cells without a rating stay 0. The cells are filled with one
    vectorised assignment instead of a Python loop over every row.

    Assumes each (user, item) pair occurs at most once in `frame`.

    Raises:
        KeyError: if `frame` contains a user_id or item_id that has no integer id.
    """
    dense = np.zeros((len(user_list), len(item_list)))
    rows = frame['user_id'].map(user_id_dict)
    cols = frame['item_id'].map(item_id_dict)
    # map() yields NaN for unknown ids; fail loudly like a dict lookup would.
    if rows.isnull().any() or cols.isnull().any():
        raise KeyError('frame contains a user_id or item_id without an integer id')
    dense[rows.values.astype(int), cols.values.astype(int)] = frame['rating'].values
    return dense


train = _ratings_to_matrix(train_df)
test = _ratings_to_matrix(test_df)
vali = _ratings_to_matrix(vali_df)

In [21]:
# get the user int id-> str id list, and the same for item
# Position p of each list holds the original id whose integer id is p.
item_list = item_id_dict.keys()
item_idd_list = [''] * len(item_list)
for raw_item, int_item in item_id_dict.items():
    item_idd_list[int_item] = raw_item

user_list = user_id_dict.keys()
user_idd_list = [''] * len(user_list)
for raw_user, int_user in user_id_dict.items():
    user_idd_list[int_user] = raw_user

# get the item int id->genres list
item_idd_genre_list = [item_genre_dict[raw_item] for raw_item in item_idd_list]

In [22]:
# Remove the user_freq / item_freq helper columns, in place, from the three
# split frames and from rdf.
for frame in (train_df, vali_df, test_df, rdf):
    frame.drop('user_freq', axis=1, inplace=True)
    frame.drop('item_freq', axis=1, inplace=True)

In [23]:
# Serialize rdf to rdf.pkl using the highest available pickle protocol.
with open("rdf.pkl", "wb") as f:
    pickle.dump(rdf, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# get df for rdf, train, vali, test with int id for user and item
import pickle
import copy
# user_id_dict = pickle.load(open('./user_id_dict.pkl'))
# item_id_dict = pickle.load(open('./item_id_dict.pkl'))
# rdf = pickle.load(open('./rdf.pkl'))
rdf.reset_index(drop=True, inplace=True)

# rating_df is a copy of rdf whose user_id / item_id columns hold the integer
# ids. Each column is translated with one vectorised lookup instead of a
# Python loop over every row, so no progress counter is needed any more.
rating_df = copy.copy(rdf)
rating_df['user_id'] = rdf['user_id'].map(user_id_dict)
rating_df['item_id'] = rdf['item_id'].map(item_id_dict)
# map() yields NaN for unknown ids; fail loudly like a dict lookup would.
if rating_df[['user_id', 'item_id']].isnull().values.any():
    raise KeyError('rdf contains a user_id or item_id without an integer id')


In [None]:
# Serialize rating_df to rating_df.pkl using the highest available pickle protocol.
with open("rating_df.pkl", "wb") as f:
    pickle.dump(rating_df, f, pickle.HIGHEST_PROTOCOL)

In [None]:
import pickle
import copy
# user_id_dict = pickle.load(open('./user_id_dict.pkl'))
# item_id_dict = pickle.load(open('./item_id_dict.pkl'))
# train_df = pickle.load(open('./train_df.pkl'))

# training_df is a copy of train_df whose user_id / item_id columns hold the
# integer ids. The vectorised lookup replaces a per-row loop that addressed
# rows by position and therefore only worked on a freshly reset 0..n-1 index.
training_df = copy.copy(train_df)
training_df['user_id'] = train_df['user_id'].map(user_id_dict)
training_df['item_id'] = train_df['item_id'].map(item_id_dict)
# map() yields NaN for unknown ids; fail loudly like a dict lookup would.
if training_df[['user_id', 'item_id']].isnull().values.any():
    raise KeyError('train_df contains a user_id or item_id without an integer id')

In [None]:
import pickle
import copy
# user_id_dict = pickle.load(open('./user_id_dict.pkl'))
# item_id_dict = pickle.load(open('./item_id_dict.pkl'))
# vali_df = pickle.load(open('./vali_df.pkl'))
# test_df = pickle.load(open('./test_df.pkl'))


def _with_int_ids(frame):
    """Return a copy of `frame` whose user_id / item_id columns hold integer ids.

    The ids are looked up in user_id_dict and item_id_dict with one vectorised
    map per column, so the result does not depend on the frame's index labels.

    Raises:
        KeyError: if `frame` contains a user_id or item_id that has no integer id.
    """
    converted = copy.copy(frame)
    converted['user_id'] = frame['user_id'].map(user_id_dict)
    converted['item_id'] = frame['item_id'].map(item_id_dict)
    # map() yields NaN for unknown ids; fail loudly like a dict lookup would.
    if converted[['user_id', 'item_id']].isnull().values.any():
        raise KeyError('frame contains a user_id or item_id without an integer id')
    return converted


valiing_df = _with_int_ids(vali_df)
testing_df = _with_int_ids(test_df)

In [None]:
# Pickle the three integer-id split frames, each to its own file.
for path, frame in (
    ("training_df.pkl", training_df),
    ("valiing_df.pkl", valiing_df),
    ("testing_df.pkl", testing_df),
):
    with open(path, "wb") as f:
        pickle.dump(frame, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# generate the rating list for each key genre, get the genre->ratings dict
import numpy as np

# rdf = pickle.load(open('./rdf.pkl'))
# key_genre = pickle.load(open('./key_genre.pkl'))
# item_genre_dict = pickle.load(open('./item_genre_dict.pkl'))
rdf.reset_index(drop=True, inplace=True)

# Walk the ratings in row order; a rating is appended to the list of every
# key genre that its item belongs to.
key_genre_rating = {k: [] for k in key_genre}
for item, rating in zip(rdf['item_id'].values, rdf['rating'].values):
    gl = item_genre_dict[item]
    for k in key_genre:
        if k in gl:
            key_genre_rating[k].append(rating)

# generate the rating distribution for each genre
# Ten counters per genre; a rating r is counted in slot int((r - 0.5) / 0.5),
# so 0.5 goes to slot 0 and 5.0 to slot 9.
key_genre_rating_count = {k: np.zeros(10) for k in key_genre}
for k, ratings in key_genre_rating.items():
    histogram = key_genre_rating_count[k]
    for r in ratings:
        histogram[int((r - 0.5) / 0.5)] += 1

In [None]:
# Objects written with pickle, in this order. rating_df, training_df,
# valiing_df and testing_df are not written here.
pickled_objects = [
    ("item_genre_dict.pkl", item_genre_dict),
    ("key_genre.pkl", key_genre),
    ("key_genre_rating_count.pkl", key_genre_rating_count),
    ("user_id_dict.pkl", user_id_dict),
    ("item_id_dict.pkl", item_id_dict),
    ("rdf.pkl", rdf),
    ("item_idd_genre_list.pkl", item_idd_genre_list),
    ("item_idd_list.pkl", item_idd_list),
    ("user_idd_list.pkl", user_idd_list),
    ("key_genre_rating.pkl", key_genre_rating),
    ("train_df.pkl", train_df),
    ("vali_df.pkl", vali_df),
    ("test_df.pkl", test_df),
]
for path, obj in pickled_objects:
    with open(path, "wb") as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

# The dense rating matrices are written with np.save, i.e. in NumPy's own
# binary format even though the file names end in ".mat".
for path, matrix in (("train.mat", train), ("test.mat", test), ("vali.mat", vali)):
    with open(path, "wb") as f:
        np.save(f, matrix)

In [None]:
# count the number for each genre and sort
import pickle
from operator import itemgetter
item_list = rdf['item_id'].unique()

# Number of items still present in rdf per key genre; genres outside
# key_genre are ignored.
genre_count = dict()
for i in item_list:
    for g in item_genre_dict[i]:
        if g not in key_genre:
            continue
        genre_count[g] = genre_count.get(g, 0) + 1

with open("genre_count.pkl", "wb") as f:
    pickle.dump(genre_count, f, pickle.HIGHEST_PROTOCOL)

# (genre, count) pairs, most frequent genre first.
genre_count_sorted = sorted(genre_count.items(), key=itemgetter(1), reverse=True)
genre_count_sorted

In [None]:
import numpy as np
import pickle
import copy as copy

# Hold the per-item genre lists in a 1-D object array so they can be selected
# with a boolean mask. The array is filled element by element because
# np.array() on lists of different lengths raises ValueError on NumPy >= 1.24,
# and would build a 2-D array instead if every list had the same length.
genre_lists = np.empty(len(item_idd_genre_list), dtype=object)
for idx, genres in enumerate(item_idd_genre_list):
    genre_lists[idx] = genres
item_idd_genre_list = genre_lists

# mask[u, i] is 1.0 where user u has a positive rating for item i in train.
mask = 1.0 * (train > 0)

# For every user: start from the overall per-genre item counts and subtract
# one for each key genre of each item the user rated in train.
user_genre_count = list()
for u in range(train.shape[0]):
    temp_genre_count = copy.copy(genre_count)
    rated_genre_lists = item_idd_genre_list[mask[u, :] == 1.0]
    for gl in rated_genre_lists:
        for g in gl:
            if g in key_genre:
                temp_genre_count[g] -= 1
    user_genre_count.append(temp_genre_count)
with open("user_genre_count.pkl", "wb") as f:
    pickle.dump(user_genre_count, f, pickle.HIGHEST_PROTOCOL)