<h1>Goodereads<h1>

The dataset has
- 228M ratings
- 0.8M users
- 2.3M items
- Genres about books (genre tags are extracted from users' popular shelves by a simple keyword matching process)

In [16]:
import pandas as pd
from dask import dataframe as dd
import time
import pickle
import numpy as np



<h3>Exploring ratings first<h3>

In [35]:
start = time.time()
df_ratings = dd.read_csv("/home/diego/chat-reranking/dataset/goodreads/goodreads_interactions.csv")
end = time.time()
print(f"Read csv with dask: {end-start} sec")

Read csv with dask: 0.016644954681396484 sec


In [36]:
print(df_ratings.columns)
print(len(df_ratings))

Index(['user_id', 'book_id', 'is_read', 'rating', 'is_reviewed'], dtype='object')
228648342


In [37]:
df_ratings = df_ratings.rename(columns={"book_id": "item_id"})
print(df_ratings.columns)

Index(['user_id', 'item_id', 'is_read', 'rating', 'is_reviewed'], dtype='object')


In [38]:
df_ratings = df_ratings[['user_id', 'item_id', 'rating']]
print(df_ratings.columns)

Index(['user_id', 'item_id', 'rating'], dtype='object')


In [39]:
df_ratings = df_ratings[df_ratings["rating"] != 0]
len(df_ratings)

104551549

In [40]:
len(df_ratings["item_id"].unique())

2325541

In [41]:
len(df_ratings["user_id"].unique())

816371

In [42]:
upper_bound = 300
lower_bound = 70
to_keep = list(df_ratings['user_id'].value_counts()[lambda x: (x>lower_bound) & (x<upper_bound)].index)

In [43]:
df_ratings = df_ratings[df_ratings["user_id"].isin(to_keep)]
len(df_ratings)

39048860

In [44]:
len(df_ratings["user_id"].unique())

266891

In [45]:
len(df_ratings["item_id"].unique())

1593470

In [46]:
pandas_ratings = df_ratings.compute()
type(pandas_ratings)

pandas.core.frame.DataFrame

In [47]:
grouped_items = pandas_ratings.groupby("item_id")["item_id"].count()

In [48]:
# check how many ratings you keep by filtering out unpopular items
np.sum(grouped_items.sort_values(ascending=False)[:20000].values)

25489453

In [49]:
remaining_items = grouped_items.sort_values(ascending=False)[:20000].index
len(remaining_items)

20000

In [50]:
pandas_ratings = pandas_ratings[pandas_ratings["item_id"].isin(remaining_items)]
len(pandas_ratings)

25489453

In [51]:
len(pandas_ratings["user_id"].unique())

266561

In [52]:
# random.seed(895)
# n_to_sample = 300000
# users = random.sample(list(df_ratings['user_id'].unique()), n_to_sample)

In [53]:
# df_ratings = df_ratings[df_ratings["user_id"].isin(users)]
# len(df_ratings)

In [54]:
pandas_ratings['rating'].value_counts(normalize=True)

rating
5    0.379890
4    0.350483
3    0.197756
2    0.052855
1    0.019016
Name: proportion, dtype: float64

Around 28% of the ratings are negative. Now the dataset is around 18M

In [51]:
df_remaining = pd.DataFrame(data={"item_id": remaining_items})
df_remaining.to_csv("/home/diego/chat-reranking/dataset/goodreads/remaining_items.csv", index=False)

In [52]:
df_remaining = pd.read_csv("/home/diego/chat-reranking/dataset/goodreads/remaining_items.csv")
df_remaining.head(1)

Unnamed: 0,item_id
0,943


<h3>Exploring books' titles<h3>

In [9]:
import gzip
import datetime
import json

remaining_items = pd.read_csv("/home/diego/chat-reranking/dataset/goodreads/remaining_items.csv")["item_id"].values
print(f"# of remaining items: {len(remaining_items)}")
books = {"item_id": [], "title": [], "title_without_series": [], "publication_year": []}
counter = 0
with gzip.open("/home/diego/chat-reranking/dataset/goodreads/goodreads_books.json.gz", 'r') as f:
    while True:
        counter += 1
        if counter % 100000 == 0:
            # print(f"{print(datetime.datetime.now())}: line {counter}")
            print(f"len of dict: {len(books['item_id'])}")
        line = f.readline()
        if not line:
            break
        try:
            parsed_line = json.loads(line)
            book_id = int(parsed_line["book_id"])
            if book_id in remaining_items:
                books["item_id"].append(book_id)

                title, title_no_series, year = None, None, None
                if parsed_line["title"]:
                    title = parsed_line["title"]
                if parsed_line["title_without_series"]:
                    title_no_series = parsed_line["title_without_series"]
                if parsed_line["publication_year"]:
                    year = parsed_line["publication_year"]
                books["title"].append(title)
                books["title_without_series"].append(title_no_series)
                books["publication_year"].append(year)

        except Exception:
            print("exxxx")
            books["item_id"].append(None)
            books["title"].append(None)
            books["title_without_series"].append(None)
            books["publication_year"].append(None)
            continue

df_books = pd.DataFrame.from_dict(books)
df_books.to_csv("/home/diego/chat-reranking/dataset/goodreads/filtered_books.csv", index=False)
print(datetime.datetime.now())

# of remaining items: 20000
len of dict: 416
len of dict: 763
len of dict: 1126
len of dict: 1513
len of dict: 1873
len of dict: 2263
len of dict: 2603
len of dict: 2960
len of dict: 3326
len of dict: 3703
len of dict: 4040
len of dict: 4427
len of dict: 4802
len of dict: 5161
len of dict: 5506
len of dict: 5906
len of dict: 6318
len of dict: 6724
len of dict: 7067
len of dict: 7469
len of dict: 7824
len of dict: 8211
len of dict: 8579
2023-11-10 11:20:26.122989


In [11]:
df_books = pd.read_csv("/home/diego/chat-reranking/dataset/goodreads/filtered_books.csv")
print(f"Len of df {len(df_books)}")
df_books.head(1)

Len of df 8799


Unnamed: 0,item_id,title,title_without_series,publication_year
0,89377,Penny from Heaven,Penny from Heaven,2006.0


In [12]:
df_books.drop_duplicates(inplace=True)
df_books.drop_duplicates(subset="item_id", inplace=True)
df_books.drop_duplicates(subset="title", inplace=True)
print(f"Len of df {len(df_books)}")

Len of df 8349


In [13]:
# drop any book with no title
df_books.dropna(subset="title", inplace=True)
print(f"Len of df {len(df_books)}")

Len of df 8349


In [14]:
print(f"Most recent year of publication: {max(df_books['publication_year'].values)}")

Most recent year of publication: 2017.0


<h3>Exploring books genres <h3>

In [17]:
df_genres = dd.read_json("/home/diego/chat-reranking/dataset/goodreads/goodreads_book_genres_initial.json")
df_genres = df_genres.rename(columns={"book_id": "item_id"}).compute()

In [18]:
print(len(df_genres))
df_genres.head(1)

2360655


Unnamed: 0,item_id,genres
0,5333265,"{'history, historical fiction, biography': 1}"


In [19]:
genres = []
for i, row in df_genres.iterrows():
    if len(list(row["genres"].keys())) == 0:
        genres.append(None)
    else:
        genres.append(list(row["genres"].keys()))
len(genres)

2360655

In [20]:
df_genres["genres"] = genres
df_genres.dropna(subset="genres", axis=0, inplace=True)
len(df_genres)

1951142

In [21]:
def format_genres(list_gs):
    ggs = []
    for gg in list_gs:
        splitted = gg.split(", ")
        for s in splitted:
            ggs.append(s)
    return ggs
            
df_genres["formatted_genres"] = df_genres.apply(lambda r: format_genres(r["genres"]), axis=1)
df_genres.head(1)

Unnamed: 0,item_id,genres,formatted_genres
0,5333265,"[history, historical fiction, biography]","[history, historical fiction, biography]"


In [22]:
genres = set()
for g in df_genres["formatted_genres"].values:
    genres = genres.union(g)
print(len(genres))

16


In [23]:
print(genres)

{'romance', 'children', 'fiction', 'history', 'young-adult', 'fantasy', 'crime', 'historical fiction', 'poetry', 'comics', 'thriller', 'biography', 'graphic', 'paranormal', 'non-fiction', 'mystery'}


In [24]:
df_genres = df_genres[df_genres["item_id"].isin(remaining_items)]
len(df_genres)

8558

In [25]:
df_genres.dropna(subset="genres", axis=0, inplace=True)
len(df_genres)

8558

In [26]:
final_items = list(set(df_books["item_id"]).intersection(df_genres["item_id"]))
len(final_items)

8109

In [27]:
# df_books and df_genres ARE NOT aligned!
set(df_books["item_id"]) == set(df_genres["item_id"])

False

In [28]:
df_books = df_books[df_books["item_id"].isin(final_items)]
df_genres = df_genres[df_genres["item_id"].isin(final_items)]

In [29]:
# df_books and df_genres ARE aligned!
set(df_books["item_id"]) == set(df_genres["item_id"])

True

In [30]:
df_items = pd.merge(df_books, df_genres, on='item_id', how='outer')

In [31]:
print(df_items.shape)
df_items.head(1)

(8109, 6)


Unnamed: 0,item_id,title,title_without_series,publication_year,genres,formatted_genres
0,89377,Penny from Heaven,Penny from Heaven,2006.0,"[fiction, history, historical fiction, biograp...","[fiction, history, historical fiction, biograp..."


In [32]:
df_items.drop(columns=["genres"], inplace=True)

In [33]:
df_items.rename(columns={"formatted_genres": "genres"}, inplace=True)
df_items.columns

Index(['item_id', 'title', 'title_without_series', 'publication_year',
       'genres'],
      dtype='object')

In [34]:
df_items.rename(columns={"title_name": "item_name"}, inplace=True)
df_items.columns

Index(['item_id', 'title', 'title_without_series', 'publication_year',
       'genres'],
      dtype='object')

<h3>Craft the final datastet<h3>

In [55]:
df_interactions = pandas_ratings[pandas_ratings["item_id"].isin(df_items["item_id"].values)]
# remove users with less than 30 ratings
to_keep = list(df_interactions['user_id'].value_counts()[lambda x: x >=30].index)
df_interactions = df_interactions[df_interactions["user_id"].isin(to_keep)]

print(f"# of ratings: {len(df_interactions)}")
print(f"# of users: {len(df_interactions['user_id'].unique())}")
print(f"# of items: {len(df_interactions['item_id'].unique())}")

# of ratings: 8452752
# of users: 166481
# of items: 8109


In [56]:
genres = set()
for g in df_items["genres"].values:
    genres = genres.union(g)
print(len(genres))

16


In [57]:
print(genres)

{'romance', 'children', 'fiction', 'history', 'young-adult', 'fantasy', 'crime', 'historical fiction', 'poetry', 'comics', 'thriller', 'biography', 'graphic', 'non-fiction', 'paranormal', 'mystery'}


DATASET STATISTICS:
- 8M ratings
- 8k items
- 166k users
- 50 ratings per user
- 1042 ratings per item
- 16 genres
- 99.4% sparsity 

In [58]:
out_dir = "/home/diego/chat-reranking/experiments/goodreads/"

In [62]:
df_items.rename(columns={"title":"item_name"}, inplace=True)
df_items.head(1)

Unnamed: 0,item_id,item_name,title_without_series,publication_year,genres
0,89377,Penny from Heaven,Penny from Heaven,2006.0,"[fiction, history, historical fiction, biograp..."


In [63]:
df_items.to_csv(f"{out_dir}df_items.csv", sep=",", index=False)

In [64]:
itemid_to_name = {}
for i, n in zip(df_items["item_id"].values, df_items["item_name"].values):
    itemid_to_name[i] = n
itemname_to_id = {v: k for k, v in itemid_to_name.items()}
with open(f"{out_dir}itemid_to_name.pkl", 'wb') as fp:
    pickle.dump(itemid_to_name, fp)
with open(f"{out_dir}itemname_to_id.pkl", 'wb') as fp:
    pickle.dump(itemname_to_id, fp)

In [65]:
# prepare genres file
out_string = ""
for i, row in df_items.iterrows():
    for genre in row["genres"]:
        if row['item_name'] in itemname_to_id:
            out_string += f"{itemname_to_id[row['item_name']]}\t{genre}\n"
        else:
            print(row)

with open(f"{out_dir}genres_file.txt", "w") as text_file:
    text_file.write(out_string)

In [66]:
itemid_to_namegenres = {}
itemnamegenres_to_id = {}
for i, row in df_items.iterrows():
    item_id = row["item_id"]
    genres_str = ""
    item_name = row["item_name"]
    for g in row["genres"]:
        genres_str += f"{g}, "
    itemid_to_namegenres[item_id] = f"{item_name} ({genres_str[:-2]})"
    
    # to convert item names to id, the dict accepts both names only or names with genres
    itemnamegenres_to_id[f"{item_name} ({genres_str[:-2]})"] = item_id
    itemnamegenres_to_id[item_name] = item_id

In [67]:
with open(f"{out_dir}itemid_to_namegenres.pkl", 'wb') as fp:
    pickle.dump(itemid_to_namegenres, fp)
with open(f"{out_dir}itemnamegenres_to_id.pkl", 'wb') as fp:
    pickle.dump(itemnamegenres_to_id, fp)

In [68]:
out_dir = "/home/diego/chat-reranking/dataset/goodreads/"

In [69]:
df_interactions.to_csv(f"{out_dir}ratings8M.csv", index=False)

In [3]:
import pandas as pd
df_items = pd.read_csv("/home/diego/chat-reranking/experiments/goodreads/df_items.csv")
df_items.head(1)

Unnamed: 0,item_id,item_name,title_without_series,publication_year,genres
0,89377,Penny from Heaven,Penny from Heaven,2006.0,"['fiction', 'history', 'historical fiction', '..."


In [4]:
df_items[df_items["item_id"] == 536]

Unnamed: 0,item_id,item_name,title_without_series,publication_year,genres
7979,536,The Lovely Bones,The Lovely Bones,2006.0,"['fiction', 'mystery', 'thriller', 'crime', 'y..."


In [5]:
df_items[df_items["item_id"] == 536]["genres"]

7979    ['fiction', 'mystery', 'thriller', 'crime', 'y...
Name: genres, dtype: object

In [7]:
print(len(df_items))
df_items.drop_duplicates(subset="item_id", inplace=True)
print(len(df_items))

8558
8558


In [8]:
print(len(df_items))
df_items.drop_duplicates(subset="item_name", inplace=True)
print(len(df_items))

8558
8111
