In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from bitdotio_pandas import BitDotIOPandas
import seaborn as sns
import os
# Global figure styling: seaborn "whitegrid" theme with fonts scaled 1.5x,
# and matplotlib's font family set to "sans serif".
sns.set(style='whitegrid', font_scale=1.5)
plt.rcParams.update({"font.family": "sans serif"})

# Data folders (relative paths): raw downloads and processed outputs
RDATA, PDATA = (os.path.join('data', stage) for stage in ('raw', 'processed'))

# Style configuration
# Named colours
BLUE = '#0059ff'
GOLD = '#fdbd28'
GREEN = '#28D9AA'
RED = '#EE5149'
DBLUE = '#060F41'
GREY = '#788995'
DARK_GREY = DBLUE  # same hex value as DBLUE
BLACK = '#000000'
WHITE = '#FFFFFF'

# Ordered palette; the first six entries reuse the named colours above
COLORS = [BLUE, GOLD, GREEN, RED, DBLUE, GREY, '#FF69B4', '#7F00FF']

# Line and text sizing
LINEWIDTH = 5
LINESPACING = 1.25
FS_SUPTITLE = 30
FS_CAPTION = FS_LABEL = 24
FS_FOOTNOTE = 20

## Part 1: Summer Reading Pipeline

Caution: the raw dataset is too large to download on Deepnote. This notebook is provided for reference, or to download and run locally.

The purpose of this notebook is to download raw checkout records from the Seattle Public Library and publish prepared datasets back to bit.io.

### Download data from SPL

In [7]:
# Make sure the raw and processed data folders exist before anything is written to them.
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race
# of testing os.path.exists() first.
for directory in ['data/raw', 'data/processed']:
    os.makedirs(directory, exist_ok=True)

In [8]:
# This step will take a while, the raw file is ~8.3 GB
# The leading `!` hands the line to the system shell. curl's -o option saves the response
# body to data/raw/checkouts_by_title.csv, so the data/raw folder must already exist.
# Nothing here checks for an existing file: re-running the cell downloads everything again.
! curl -o data/raw/checkouts_by_title.csv "https://data.seattle.gov/api/views/tmmm-ytt6/rows.csv?accessType=DOWNLOAD"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 7913M    0 7913M    0     0  5716k      0 --:--:--  0:23:37 --:--:-- 5006k0     0  4566k      0 --:--:--  0:00:04 --:--:-- 4565k   0 --:--:--  0:00:20 --:--:-- 4491k  0  5949k      0 --:--:--  0:00:57 --:--:-- 6478k:09 --:--:-- 5922k 0     0  5965k      0 --:--:--  0:01:10 --:--:-- 6023k   0  5898k      0 --:--:--  0:01:29 --:--:-- 4882k993k      0 --:--:--  0:02:30 --:--:-- 3047k    0  4761k      0 --:--:--  0:02:52 --:--:-- 4626k-:--:-- 4827k9k      0 --:--:--  0:03:57 --:--:-- 4599k0  4576k      0 --:--:--  0:04:02 --:--:-- 4460k 4583k      0 --:--:--  0:04:04 --:--:-- 4900k      0 --:--:--  0:04:16 --:--:-- 5659kM    0     0  4588k      0 --:--:--  0:04:23 --:--:-- 3938k:--:--  0:04:32 --:--:-- 5685k --:--:--  0:05:06 --:--:-- 4269k 0  4759k      0 --:--:--  0:05:59 --:--:-- 5638k--:--:--  0:06:35 --:--:-- 5136kM    0     

### Load data into pandas

In [9]:
# Read the downloaded checkout records from the raw data folder and preview the first rows
raw_csv_path = os.path.join(RDATA, 'checkouts_by_title.csv')
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,UsageClass,CheckoutType,MaterialType,CheckoutYear,CheckoutMonth,Checkouts,Title,Creator,Subjects,Publisher,PublicationYear
0,Physical,Horizon,BOOK,2008,12,1,The dream : how I learned the risks and reward...,"Chahal, Gurbaksh","Success in business, Entrepreneurship, Interne...","Palgrave Macmillan,",2008.0
1,Physical,Horizon,BOOK,2008,12,1,Vim vinegar,,"Vinegar, Cookery Vinegar, House cleaning",,
2,Physical,Horizon,BOOK,2008,12,1,biology of violence how understanding the brai...,,"Brain chemistry, Violence Physiological aspect...",,
3,Physical,Horizon,BOOK,2008,12,2,Courage the joy of living dangerously,,Courage Religious aspects,,
4,Physical,Horizon,BOOK,2008,12,1,Cherries in the snow a novel of lust love loss...,,"Love stories, Loss Psychology Fiction, Female ...",,


### Filtering the data  

I will look at the last ~5 years of ebook and audiobook checkouts.

In [None]:
# Keep only e-book / audiobook rows checked out in 2016 or later
wanted_types = ['EBOOK', 'AUDIOBOOK']
is_wanted_type = df['MaterialType'].isin(wanted_types)
is_recent = df['CheckoutYear'].ge(2016)
df = df.loc[is_wanted_type & is_recent].copy()
df.head()

### Getting the top subjects  

Each title can have many subject labels. I will find the most popular subject labels for both ebooks and audiobooks.

In [None]:
# Tally total checkouts per subject label, separately for audiobooks and e-books.
# A row's Subjects string is split on ', ' and the row's Checkouts is added to every
# label it lists; rows whose Subjects is not a string (missing) are skipped.
subject_counts_ab = defaultdict(int)
subject_counts_eb = defaultdict(int)
# Route each row to the tally for its material type; other types are ignored
counts_by_type = {'EBOOK': subject_counts_eb, 'AUDIOBOOK': subject_counts_ab}

# Iterate over just the three needed columns instead of df.iterrows(), which builds
# a full Series for every row and is far slower on a frame of this size.
n_rows = df.shape[0]
rows = zip(df['Subjects'], df['MaterialType'], df['Checkouts'])
for i, (subjects, material_type, checkouts) in enumerate(rows, start=1):
    counts = counts_by_type.get(material_type)
    if counts is not None and isinstance(subjects, str):
        for subject in subjects.split(', '):
            counts[subject] += checkouts
    # Progress report every million rows
    if i % 1000000 == 0:
        print(f'{100*i/n_rows:.3f}%')

In [None]:
# Wrap each tally in a Series (index: subject label, values: checkout totals)
sca_series, sce_series = (
    pd.Series(tally) for tally in (subject_counts_ab, subject_counts_eb)
)

In [None]:
# The 15 audiobook subjects with the most checkouts, highest first
ranked_audiobook_subjects = sca_series.sort_values(ascending=False)
top_subjects_a = ranked_audiobook_subjects.head(15).index.tolist()
top_subjects_a

In [None]:
# The 15 e-book subjects with the most checkouts, highest first
ranked_ebook_subjects = sce_series.sort_values(ascending=False)
top_subjects_e = ranked_ebook_subjects.head(15).index.tolist()
top_subjects_e

In [None]:
# Subjects that appear in both the audiobook and the e-book top lists
top_subjects = set(top_subjects_a) & set(top_subjects_e)
top_subjects

In [None]:
# Create convenience columns for top subjects of interest, further denormalizing the dataset.
# regex=False matches each label as literal text; with the default (regex=True) a label
# containing characters such as '(' or '+' would be interpreted as a pattern.
# Rows with a missing Subjects value keep a missing (NaN) flag.
# NOTE(review): this is a substring test against the whole Subjects string, so a label
# that is contained in a longer label is also flagged for rows carrying only the longer
# one, whereas the counts above used exact labels. Confirm this is intended.
for subject in top_subjects:
    df[subject] = df['Subjects'].str.contains(subject, regex=False)

# Drop unneeded columns and clean up column names
df = df.drop(columns=['UsageClass', 'CheckoutType'])
df.columns = [col.lower() for col in df.columns]
# Only the multi-word columns listed here are converted to snake_case
df = df.rename(columns={'materialtype': 'material_type',
                        'checkoutyear': 'checkout_year',
                        'checkoutmonth': 'checkout_month',
                        'publicationyear': 'publication_year',
                        'juvenile fiction': 'juvenile_fiction',
                        'biography & autobiography': 'biography_autobiography',
                        'science fiction': 'science_fiction',
                        'historical fiction': 'historical_fiction'})
df.head()

In [None]:
# Split up the audiobook and ebook tables.
# The material_type column must still be present when the masks are built, so filter
# first and drop the column from each per-format table afterwards. Dropping it from df
# up front makes both filters raise KeyError.
# df itself is left unchanged, which keeps this cell safe to re-run.
is_audiobook = df['material_type'] == 'AUDIOBOOK'
is_ebook = df['material_type'] == 'EBOOK'
dfa = df.loc[is_audiobook].drop(columns='material_type')
dfe = df.loc[is_ebook].drop(columns='material_type')

### Store the cleaned-up data

To get an API key to connect to bit.io, go [here](https://bit.io/bitdotio/seattle_library) (sign up for a free account if needed, and click "connect" above the data preview).

In [None]:
# Open a bit.io connection to the bitdotio/seattle_library repo; you will need your own API key.
# NOTE(review): no key is passed explicitly here; presumably the client obtains it
# some other way (prompt or environment). Confirm against the bitdotio_pandas docs.
bpd = BitDotIOPandas(
    username="bitdotio",
    repo="seattle_library",
)

In [None]:
# Replace any existing copy of the audiobook table, then upload dfa in chunks (chunksize=50000).
# To write to bit.io yourself, point `bpd` at a repo that you can write to.
audiobook_table = 'audiobook_checkouts_by_title_test'
existing_tables = bpd.list_tables()
if audiobook_table in existing_tables:
    bpd.delete_table(audiobook_table)
bpd.to_table(dfa, audiobook_table, chunksize=50000)

In [None]:
# Upload e-book records in chunks (chunksize=50000), replacing any existing copy of the table.
# To write to bit.io yourself, point `bpd` at a repo that you can write to.
ebook_table = 'ebook_checkouts_by_title_test'
if ebook_table in bpd.list_tables():
    bpd.delete_table(ebook_table)
bpd.to_table(dfe, ebook_table, chunksize=50000)

### Next, we continue our work in summer_reading_analysis.ipynb