In [1]:
import pandas as pd
from collections import Counter
from sklearn.utils import shuffle
import pickle

In [2]:
data = pd.read_csv("../data/BX-Book-Ratings.csv",sep=";")

In [3]:
data

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [4]:
data["User-ID"].describe()

count    1.149780e+06
mean     1.403864e+05
std      8.056228e+04
min      2.000000e+00
25%      7.034500e+04
50%      1.410100e+05
75%      2.110280e+05
max      2.788540e+05
Name: User-ID, dtype: float64

In [5]:
data = data[data["Book-Rating"]!=0].copy()

In [6]:
data

Unnamed: 0,User-ID,ISBN,Book-Rating
1,276726,0155061224,5
3,276729,052165615X,3
4,276729,0521795028,6
6,276736,3257224281,8
7,276737,0600570967,6
...,...,...,...
1149773,276704,0806917695,5
1149775,276704,1563526298,9
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [7]:
def get_unique_idx(unique_values):
    unique_idx={}
    count=0
    for v in unique_values:
        unique_idx[v]=count
        count+=1
    return unique_idx

In [8]:
unique_user_ids = set(data["User-ID"].values)
unique_isbn = set(data["ISBN"].values)

In [9]:
print("Number of users", len(unique_user_ids))
print("Number of books", len(unique_isbn))

Number of users 77805
Number of books 185973


In [10]:
unique_user_idx = get_unique_idx(unique_user_ids)
unique_isbn_idx = get_unique_idx(unique_isbn)

In [11]:
data['user_idx'] = data["User-ID"].apply(lambda x: unique_user_idx[x])
data['isbn_idx'] = data["ISBN"].apply(lambda x: unique_isbn_idx[x])

In [12]:
data.to_csv("../data/processed_rating.csv")

In [13]:
user_count = Counter(data.user_idx)
isbn_count = Counter(data.isbn_idx)

In [14]:
# Now we will take 50000 user and 50000 books
U = 10000
B = 50000


# https://stackoverflow.com/questions/3594514/how-to-find-most-common-elements-of-a-list

user_ids = [u for u, c in user_count.most_common(U)]
book_ids = [b for b, c in isbn_count.most_common(B)]

data_updated = data[(data["user_idx"].isin(user_ids) & data["isbn_idx"].isin(book_ids))].copy()

In [15]:
unique_user_idx = get_unique_idx(set(data_updated["user_idx"].values))
unique_isbn_idx = get_unique_idx(set(data_updated["isbn_idx"].values))

data_updated['user_idx'] = data_updated["user_idx"].apply(lambda x: unique_user_idx[x])
data_updated['isbn_idx'] = data_updated["isbn_idx"].apply(lambda x: unique_isbn_idx[x])

In [16]:
print("max user_id",data_updated["user_idx"].max())
print("max isbn_id",data_updated["isbn_idx"].max())

max user_id 9926
max isbn_id 47737


In [17]:
len(set(data_updated["user_idx"].values))

9927

In [18]:
data_updated.to_csv("../data/ratings_100000.csv")

In [19]:
data_updated = pd.read_csv("../data/ratings_100000.csv")

In [20]:
U = data_updated.user_idx.max()+1
B = data_updated.isbn_idx.max()+1

In [21]:
data_updated = shuffle(data_updated)

In [22]:
cut_off = int(0.8*len(data_updated))

In [23]:
df_train = data_updated.iloc[:cut_off]
df_test = data_updated.iloc[cut_off:]

df_train.to_csv("../data/train.csv")
df_test.to_csv("../data/test.csv")

In [24]:
user_2_book = {}
book_2_user = {}
user_book_rating ={}

def populate_mapping_train(row):
    u = int(row.user_idx)
    b = int(row.isbn_idx)
    
    if u not in user_2_book:
        user_2_book[u] = [b]
    else:
        user_2_book[u].append(b)
        
    if b not in book_2_user:
        book_2_user[b] = [u]
    else:
        book_2_user[b].append(u)
        
    user_book_rating[(u,b)] = row["Book-Rating"]
        
df_train.apply(populate_mapping_train,axis=1)  

115631    None
147507    None
165699    None
166945    None
97366     None
          ... 
190932    None
64012     None
197360    None
95929     None
174582    None
Length: 167010, dtype: object

In [25]:
user_2_book[1271]

[4758, 13445, 37824, 2242, 16366]

In [26]:
user_book_rating_test={}
def populate_mapping_test(row):
    u = int(row.user_idx)
    b = int(row.isbn_idx)
    
    user_book_rating_test[(u,b)] = row["Book-Rating"]

df_test.apply(populate_mapping_test,axis=1)  

34107     None
207469    None
53116     None
12528     None
198582    None
          ... 
159464    None
149697    None
186789    None
25266     None
99646     None
Length: 41753, dtype: object

In [27]:
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [28]:
save_obj(user_2_book,"../data/user_to_book")
save_obj(book_2_user,"../data/book_to_user")
save_obj(user_book_rating,"../data/user_book_rating")
save_obj(user_book_rating_test,"../data/user_book_rating_test")


In [29]:
user_book_rating

{(2335, 13231): 10,
 (568, 29423): 9,
 (6136, 29582): 7,
 (6470, 12358): 9,
 (1102, 24766): 5,
 (370, 45168): 8,
 (8688, 32619): 9,
 (4568, 40703): 10,
 (1175, 19685): 10,
 (7048, 47285): 10,
 (8141, 8458): 8,
 (5090, 17695): 8,
 (8694, 22100): 10,
 (3293, 38659): 6,
 (9700, 2095): 8,
 (3596, 30310): 9,
 (3913, 28382): 9,
 (344, 7947): 8,
 (5812, 3536): 10,
 (7267, 18058): 4,
 (9195, 14627): 9,
 (2102, 27441): 6,
 (6913, 18655): 9,
 (5972, 39768): 8,
 (901, 41000): 9,
 (377, 4862): 8,
 (3528, 37684): 6,
 (6314, 19238): 8,
 (7900, 27328): 6,
 (3913, 36772): 10,
 (5441, 27958): 10,
 (4515, 19670): 8,
 (2396, 5609): 10,
 (9188, 2709): 5,
 (2011, 32242): 10,
 (9343, 40546): 9,
 (9438, 42423): 8,
 (8373, 18022): 10,
 (5141, 28993): 9,
 (3528, 35363): 6,
 (5921, 12348): 9,
 (702, 3980): 5,
 (5986, 38865): 8,
 (3528, 39032): 9,
 (6204, 2243): 8,
 (1230, 41196): 1,
 (3837, 37489): 6,
 (2896, 25472): 10,
 (7606, 9676): 6,
 (8993, 36650): 8,
 (7608, 29096): 7,
 (3077, 5624): 8,
 (8373, 35229): 8