In [None]:
import pandas as pd
from pathlib import Path

# The review statistics live under the current user's home directory.
BASE_DIR = Path.home().joinpath("SEED_DATA/impact_and_fiction")
reviews_path = BASE_DIR.joinpath("reviews-stats.tsv")

# Tab-separated input; the bare name on the last line displays the frame.
reviews_db = pd.read_csv(reviews_path, sep="\t")
reviews_db

## Read the Book Table

In [None]:
# Book table: the first column of the TSV becomes the index.
book_db = pd.read_csv("data/book_topic.tsv", sep="\t", index_col=0)
# Cast the index labels to strings and name the index 'isbn' in one step.
book_db.index = book_db.index.astype(str).rename('isbn')
book_db

## Add ISBN Index to the Reviews Table
And keep only reviews that are from the "valid" books

In [None]:
import json

# Load the work-id -> ISBN lookup table. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open until garbage
# collection; UTF-8 is requested explicitly so the result does not depend on
# the platform's default encoding.
with open("data/workId2isbn.json", encoding="utf-8") as mapping_file:
    workId2isbn = json.load(mapping_file)

def get_isbn(work_id):
    """Look up ``work_id`` in the module-level ``workId2isbn`` mapping.

    Returns the mapped value, or ``None`` when ``work_id`` is not a key.

    NOTE(review): if the mapping was loaded from JSON its keys are strings,
    so a non-string ``work_id`` would never match -- confirm the dtype of the
    values passed in.
    """
    isbn = workId2isbn.get(work_id, None)
    return isbn

# Resolve an ISBN for every review through its work id.
reviews_db['isbn'] = reviews_db['work_id'].apply(get_isbn)

# Keep only the rows whose work id resolved to an ISBN, then index by ISBN.
has_isbn = reviews_db['isbn'].notna()
reviews_db = reviews_db.loc[has_isbn].set_index('isbn')

reviews_db

In [None]:
## Check intersection between reviews_db and book_db
# Index labels (ISBNs) that occur in both tables.
shared_isbns = reviews_db.index.intersection(book_db.index)
shared_isbns

## Merge Relevant Book Columns 

This is so that everything we need is already at hand in a single table.

In [None]:
# Book columns carried over onto every review row.
book_valid_columns = [
    'title', 'author', 'publisher', 'nur_names',
    'topic_id', 'doc_x', 'doc_y', 'genre',
]

# Left join on the shared ISBN index; the suffixes only take effect for
# column names that exist in both tables.
# NOTE(review): a left join leaves NaN in 'topic_id' for any review ISBN that
# is absent from book_db, and casting NaN to int raises -- confirm that every
# review ISBN is present in book_db.
all_together = (
    reviews_db
    .join(book_db[book_valid_columns], how='left', lsuffix='_rev', rsuffix='_book')
    .astype({'topic_id': int})
)
all_together.to_csv("../content/topic_model_viz/all_valid_reviews_viz.tsv", sep="\t")
all_together

## VIEW: Most Popular Books

In [None]:
import numpy as np

def calculate_mode(series):
    """Return the mode of ``series``, collapsing ties into a single number.

    Parameters
    ----------
    series : pandas.Series
        Values to summarise.

    Returns
    -------
    scalar
        * the single most frequent value when exactly one mode exists;
        * the arithmetic mean of all modes when several values tie;
        * ``numpy.nan`` when ``series.mode()`` is empty (for example an
          empty series, or one holding only missing values).
    """
    mode_values = series.mode()
    if mode_values.empty:
        # Averaging an empty list would also give NaN, but only after NumPy
        # emits "Mean of empty slice" RuntimeWarnings; return NaN directly.
        return np.nan
    if len(mode_values) == 1:
        # Positional access: does not rely on the result being labelled 0.
        return mode_values.iloc[0]
    return np.mean(mode_values.tolist())


# One group of reviews per ISBN.
per_book_reviews = reviews_db.groupby(['isbn'])

# Output column name -> aggregation applied to each book's ratings.
rating_summary = {
    'num_ratings': 'count',
    'average_rating': 'mean',
    'median_rating': 'median',
    'stdev_rating': 'std',
    'mode_rating': calculate_mode,
}

# Most-rated books first.
ratings_per_book = (
    per_book_reviews['rating']
    .agg(**rating_summary)
    .sort_values('num_ratings', ascending=False)
)
ratings_per_book

In [None]:
# Attach title and author to the per-book rating statistics via the ISBN index.
title_author = book_db.loc[:, ['title', 'author']]
most_popular_books = ratings_per_book.join(title_author)

# Persist the table as TSV and display it.
most_popular_books.to_csv("data/most_popular_books.tsv", sep="\t")
most_popular_books