In [None]:
import os
import sys

# Directory two levels above the current working directory, added to the import
# search path so later cells can import project modules from it.
# NOTE(review): presumably this is where `defaults` lives — confirm.
MAIN_MODULE_PATH = os.path.join(os.getcwd(), '..', '..')
# Guard the append so re-running this cell does not pile up duplicate entries.
if MAIN_MODULE_PATH not in sys.path:
    sys.path.append(MAIN_MODULE_PATH)

In [None]:
from collections import Counter

import pandas as pd
import seaborn as sns

from defaults import BOOK_RATINGS, BOOKS, USERS

In [None]:
# Load the three semicolon-separated tables. Books are indexed by ISBN and users by User-ID.
ratings = pd.read_csv(BOOK_RATINGS, sep=';')
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` keeps the same behaviour: malformed rows are skipped and reported.
books = pd.read_csv(BOOKS, sep=';', on_bad_lines='warn', index_col='ISBN')
users = pd.read_csv(USERS, sep=';', index_col='User-ID')

# Keep only the books whose publication year parses as a number and store it as int.
# A boolean mask is used instead of label-based `.loc[index]` so that a repeated ISBN
# label cannot duplicate rows, and the already-parsed values are reused rather than
# casting the raw column a second time.
year_of_publication = pd.to_numeric(books['Year-Of-Publication'], errors='coerce')
has_valid_year = year_of_publication.notna()
books = books.loc[has_valid_year].copy()
books['Year-Of-Publication'] = year_of_publication[has_valid_year].astype(int).to_numpy()

In [None]:
# Report how many rows each loaded table holds.
for label, table in (("#ratings: ", ratings), ("#books: ", books), ("#users: ", users)):
    print(label, len(table))

# Book ratings

In [None]:
# Peek at the first five rating records.
ratings.head(n=5)

In [None]:
# A rating of 0 is counted as "missing" here; report its share of all rating rows.
n_zero_ratings = ratings['Book-Rating'].eq(0).sum()
missing_rating_ratio = n_zero_ratings / len(ratings)
print(f"Missing rating ratio: {missing_rating_ratio :0.3f}")

In [None]:
# Histogram of every individual rating value.
# `discrete=True` gives each integer rating its own unit-wide bar centred on the value.
# With a fixed `bins=10`, the two highest values fall into the same closing bin whenever
# the scale spans 11 integers (e.g. 0-10), which misrepresents the top of the scale.
# NOTE(review): assumes 'Book-Rating' is integer-valued — confirm.
ax = sns.histplot(data=ratings, x='Book-Rating', discrete=True)
ax.set_title("global histogram of rating");

# Books
- book titles are duplicated (perhaps due to different editions); ISBN is unique

In [None]:
# Attach book metadata to every rating row (left join keeps ratings whose ISBN is unknown).
books_ratings_joined = pd.merge(ratings, books, left_on='ISBN', right_on='ISBN', how='left')

# Per-ISBN rating count and mean; assignment aligns them with `books` on its ISBN index.
ratings_per_book = books_ratings_joined.groupby('ISBN')['Book-Rating']
books['n_ratings'] = ratings_per_book.size()
books['avg_rating'] = ratings_per_book.mean()

# Books that never appear in `ratings` get 0 for both statistics.
# Assign the filled column back instead of `books[col].fillna(..., inplace=True)`:
# the chained in-place form warns on pandas 2.x and has no effect under Copy-on-Write.
books['n_ratings'] = books['n_ratings'].fillna(0)
books['avg_rating'] = books['avg_rating'].fillna(0)

In [None]:
# Peek at the first five books, including the rating statistics added to the frame.
books.head(n=5)

In [None]:
# Basic sanity figures for the book catalogue.
n_books = len(books)
n_duplicated_titles = books['Book-Title'].duplicated().sum()
n_unique_authors = books['Book-Author'].nunique()
n_year_zero = books['Year-Of-Publication'].eq(0).sum()

print(f"#books: {n_books}")
print(f"#duplicated titles: {n_duplicated_titles}")
print(f"#unique_authors: {n_unique_authors}")
print(f"#year_of_pub_is_zero: {n_year_zero}")

In [None]:
# Tell apart books with no ratings at all from rated books whose mean rating is zero.
book_is_unrated = books['n_ratings'].eq(0)
book_is_rated = books['n_ratings'].gt(0)
book_mean_is_zero = books['avg_rating'].eq(0)

print(f"#books without a rating: {book_is_unrated.sum()}")
print(f"#books with avg_rating zero: {book_mean_is_zero.sum()}")
print(f"#books with avg_rating zero and user: {(book_mean_is_zero & book_is_rated).sum()}")

In [None]:
# Distribution of the per-book mean rating, drawn with 20 bins.
ax = sns.histplot(
    x='avg_rating',
    bins=20,
    data=books,
)
ax.set_title("Per book mean rating")

In [None]:
# Restrict to publication years strictly between 1950 and 2005, then plot their distribution.
published_after_1950 = books['Year-Of-Publication'].gt(1950)
published_before_2005 = books['Year-Of-Publication'].lt(2005)
recent_books = books[published_before_2005 & published_after_1950]
sns.histplot(x='Year-Of-Publication', data=recent_books)

# Users

In [None]:
# Attach the user profile to every rating row (`users` is indexed by User-ID).
users_ratings_joined = pd.merge(ratings, users, left_on='User-ID', right_index=True, how='left')

# Third comma-separated field of Location, stripped of surrounding whitespace.
# NOTE(review): presumably Location is "city, state, country" — confirm.
users['country'] = users['Location'].str.split(',', expand=True)[2].str.strip()

# Per-user rating count and mean rating.
# 'Book-Rating' is selected before `.mean()`: a bare `groupby(...).mean()` returns a
# whole DataFrame (and raises on the string columns in pandas 2.x), so it cannot be
# assigned to the single column 'avg_rating'.
ratings_per_user = users_ratings_joined.groupby('User-ID')['Book-Rating']
users['n_ratings'] = ratings_per_user.size()
users['avg_rating'] = ratings_per_user.mean()

# Users that never appear in `ratings` get 0 for both statistics.
# Assign the filled column back instead of chained `fillna(..., inplace=True)`, which
# warns on pandas 2.x and has no effect under Copy-on-Write.
users['n_ratings'] = users['n_ratings'].fillna(0)
users['avg_rating'] = users['avg_rating'].fillna(0)
users.head()

In [None]:
# Share of user records that have no Age value.
n_missing_age = users['Age'].isna().sum()
missing_age_ratio = n_missing_age / len(users)
print(f"Missing age ratio:{missing_age_ratio :0.3f}")

In [None]:
# Tell apart users with no ratings at all from active users whose mean rating is zero.
user_has_no_ratings = users['n_ratings'].eq(0)
user_has_ratings = users['n_ratings'].gt(0)
user_mean_is_zero = users['avg_rating'].eq(0)

print(f"#users without a book: {user_has_no_ratings.sum()}")
print(f"#users with avg_rating zero: {user_mean_is_zero.sum()}")
print(f"#users with avg_rating zero and book: {(user_mean_is_zero & user_has_ratings).sum()}")

In [None]:
# Tally the users per extracted country value and show the ten most frequent.
country_counts = Counter(users['country'])
top_countries = country_counts.most_common(10)
print("Most common countries:\n", top_countries)

In [None]:
# Distribution of the per-user mean rating, drawn with 20 bins.
sns.histplot(
    x='avg_rating',
    bins=20,
    data=users,
)

In [None]:
# Scatter each user's mean rating against their age.
ax = sns.scatterplot(
    x='Age',
    y='avg_rating',
    data=users,
)