# Prepare the goodbooks-10k dataset

Ratings of books by users, reshaped into a user-by-book matrix and stored as an AnnData object.

In [None]:
# Make the parent directory importable from this notebook
import os
import sys

module_path = os.path.abspath(os.pardir)
# append only when absent so re-running the cell does not duplicate the entry
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import anndata
import scanpy as sc
import numpy as np
import pandas as pd
import scipy.sparse as sparse

In [None]:
# On CETO
save_dir = 'path/to/data'
data_dir = 'path/to/goodbooks-10k/'
ratings_path = os.path.join(data_dir, 'ratings.csv')
books_path = os.path.join(data_dir, 'books.csv')


# Long-format ratings table; the pivot further down uses its
# user_id, book_id and rating columns.
dat = pd.read_csv(ratings_path)
# A bare expression in the middle of a cell is evaluated and thrown away
# (only the last expression of a cell is rendered), so print the rating
# distribution explicitly.
print(dat.rating.value_counts())
dat

In [None]:
# Reshape tall -> wide: one row per user_id, one column per book_id, cells hold
# the rating. (user, book) pairs without a rating come out as NaN and are set to 0.
dat = dat.pivot(index='user_id', columns='book_id', values='rating').fillna(0)
# keep the axis labels; they become the obs / var indices of the AnnData object
user_ids, book_ids = dat.index.values, dat.columns.values

In [None]:
# Patch an already-written goodreads.h5ad so that adata.var carries the book title.
# NOTE: this cell reads the file from disk, so it only works once
# <save_dir>/goodreads/goodreads.h5ad exists (e.g. from an earlier run).
books = pd.read_csv(books_path)
# shift book_id down by one so it matches the zero-based ids used for adata.var
books['book_id'] = books['book_id'] - 1
cols = ['book_id', 'title', 'authors', 'original_publication_year']
books = books[cols]

# only the title is attached in this cell
books = books.set_index('book_id')[['title']]

out_dir = os.path.join(save_dir, 'goodreads')
adata = sc.read(os.path.join(out_dir, 'goodreads.h5ad'))


# Rebuild var on a copy instead of mutating adata.var in place.
var_df = adata.var.copy()
var_df['book_id'] = var_df.index.values.astype(int)
var_df = var_df.reset_index(drop=True)
# A file that already has a title column (e.g. from a previous run of this
# cell) would make join() fail on the overlapping column name; replace it.
var_df = var_df.drop(columns=['title'], errors='ignore')
adata.var = var_df.join(books, on='book_id')
# assert there is no nan title
assert adata.var.title.isna().sum() == 0, 'There are nan titles'
adata.write(os.path.join(out_dir, 'goodreads.h5ad'))

In [None]:
# load the book metadata that is later attached to adata.var
books = pd.read_csv(books_path)
# Report missing titles explicitly: a filter expression in the middle of a cell
# is evaluated and discarded, so these checks previously showed nothing.
missing_original_title = books[books.original_title.isna()]
missing_title = books[books.title.isna()]
# original titles that are the literal string 'nan' rather than a real missing value
literal_nan_original_title = books[books.original_title == 'nan']
print('books with missing original_title:', len(missing_original_title))
print('books with missing title:', len(missing_title))
print("books with original_title == 'nan':", len(literal_nan_original_title))

cols = ['book_id', 'title', 'authors', 'original_publication_year']
# copy so the book_id shift below writes to an independent frame
books = books[cols].copy()
# ensure book_ids are in books
assert np.all(np.isin(book_ids, books.book_id.values))

# now make them both start from zero
books['book_id'] = books['book_id'] - 1
book_ids = book_ids - 1

# assert that they are sorted the same way
# (array_equal also copes with a length mismatch, which the subset check above
# does not rule out; elementwise == would not give a clean boolean there)
assert np.array_equal(books.book_id.values, book_ids)


In [None]:
# create an anndata object: users are the observations, books the variables
X = sparse.csr_matrix(dat.values)
adata = anndata.AnnData(X=X, obs=pd.DataFrame(index=user_ids), var=pd.DataFrame(index=book_ids))
# keep a separate copy of the rating matrix as the 'counts' layer
adata.layers['counts'] = adata.X.copy()
# Random A/B/C label per user, drawn from a seeded generator so the saved
# dataset is identical from one run to the next.
rng = np.random.default_rng(0)
adata.obs['labels'] = rng.choice(['A', 'B', 'C'], size=adata.shape[0])
# convert to int
adata.var['book_id'] = adata.var.index.values.astype(int)
# add book metadata
adata.var = adata.var.join(books.set_index('book_id'), on='book_id')
# set index to book_id
adata.var.set_index('book_id', inplace=True, drop=True)

In [None]:
# persist the AnnData object, creating the output directory if needed
out_dir = os.path.join(save_dir, 'goodreads')
os.makedirs(out_dir, exist_ok=True)
h5ad_path = os.path.join(out_dir, 'goodreads.h5ad')
adata.write(h5ad_path)