In [15]:
import pandas as pd
from collections import Counter
from sklearn.utils import shuffle
import pickle

In [2]:
data = pd.read_csv("../data/BX-Book-Ratings.csv",sep=";")

In [3]:
data

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [4]:
data["User-ID"].describe()

count    1.149780e+06
mean     1.403864e+05
std      8.056228e+04
min      2.000000e+00
25%      7.034500e+04
50%      1.410100e+05
75%      2.110280e+05
max      2.788540e+05
Name: User-ID, dtype: float64

In [3]:
def get_unique_idx(unique_values):
    unique_idx={}
    count=0
    for v in unique_values:
        unique_idx[v]=count
        count+=1
    return unique_idx

In [6]:
unique_user_ids = set(data["User-ID"].values)
unique_isbn = set(data["ISBN"].values)

In [7]:
print("Number of users", len(unique_user_ids))
print("Number of books", len(unique_isbn))

Number of users 105283
Number of books 340556


In [8]:
unique_user_idx = get_unique_idx(unique_user_ids)
unique_isbn_idx = get_unique_idx(unique_isbn)

In [9]:
data['user_idx'] = data["User-ID"].apply(lambda x: unique_user_idx[x])
data['isbn_idx'] = data["ISBN"].apply(lambda x: unique_isbn_idx[x])

In [10]:
data.to_csv("../data/processed_rating.csv")

In [11]:
U = data.user_idx.max()+1
B = data.isbn_idx.max()+1

In [13]:
user_count = Counter(data.user_idx)
isbn_count = Counter(data.isbn_idx)

In [14]:
# Now we will take 100000 user and 100000 books
u = 100000
b = 100000


# https://stackoverflow.com/questions/3594514/how-to-find-most-common-elements-of-a-list

user_ids = [u for u, c in user_count.most_common(u)]
book_ids = [b for b, c in isbn_count.most_common(b)]

data_updated = data[data["user_idx"].isin(user_ids) & data["isbn_idx"].isin(book_ids)].copy()

In [15]:
unique_user_idx = get_unique_idx(user_ids)
unique_isbn_idx = get_unique_idx(book_ids)

data_updated['user_idx'] = data_updated["user_idx"].apply(lambda x: unique_user_idx[x])
data_updated['isbn_idx'] = data_updated["isbn_idx"].apply(lambda x: unique_isbn_idx[x])

In [16]:
print("max user_id",data_updated["user_idx"].max())
print("max isbn_id",data_updated["isbn_idx"].max())

max user_id 99999
max isbn_id 99999


In [17]:
data_updated.to_csv("../data/ratings_100000.csv")

In [18]:
data_updated = pd.read_csv("../data/ratings_100000.csv")

In [19]:
U = data_updated.user_idx.max()+1
B = data_updated.isbn_idx.max()+1

In [20]:
data_updated = shuffle(data_updated)

In [21]:
cut_off = int(0.8*len(data_updated))

In [22]:
df_train = data_updated.iloc[:cut_off]
df_test = data_updated.iloc[cut_off:]

In [23]:
user_2_book = {}
book_2_user = {}
user_book_rating ={}

def populate_mapping_train(row):
    u = int(row.user_idx)
    b = int(row.isbn_idx)
    
    if u not in user_2_book:
        user_2_book[u] = [b]
    else:
        user_2_book[u].append(b)
        
    if b not in book_2_user:
        book_2_user[b] = [u]
    else:
        book_2_user[b].append(u)
        
    user_book_rating[(u,b)] = row["Book-Rating"]
        
df_train.apply(populate_mapping_train,axis=1)  

568769    None
380149    None
681335    None
712788    None
848619    None
          ... 
174843    None
288926    None
722942    None
317334    None
162734    None
Length: 689408, dtype: object

In [24]:
user_book_rating_test={}
def populate_mapping_test(row):
    u = int(row.user_idx)
    b = int(row.isbn_idx)
    
    user_book_rating_test[(u,b)] = row["Book-Rating"]

df_test.apply(populate_mapping_test,axis=1)  

605378    None
761742    None
277191    None
292198    None
848945    None
          ... 
244476    None
698964    None
69263     None
608230    None
791326    None
Length: 172352, dtype: object

In [25]:
user_2_book[17383]

[14417, 602, 443, 280, 424]

In [26]:
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [27]:
save_obj(user_2_book,"../data/user_2_book")
save_obj(book_2_user,"../data/book_2_user")
save_obj(user_book_rating,"../data/user_book_rating")
save_obj(user_book_rating_test,"../data/user_book_rating_test")
