# Estimate the number of books missing from the dataset

We know that we have approximately 27 percent of the missing borrow events. Can we use this to determine how many books are missing from the dataset? Or how many books were in the S&Co library?

In [1]:
import pandas as pd
from datetime import datetime

from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

In [2]:
csv_urls = {
    'members': '../dataset_generator/data/SCoData_members_v1.1_2021-01.csv',
    'books': '../dataset_generator/data/SCoData_books_v1.1_2021-01.csv',
    'events': '../dataset_generator/data/SCoData_events_v1.1_2021-01.csv'

}

# load members, books, events as csv
members_df = pd.read_csv(csv_urls['members'])
books_df = pd.read_csv(csv_urls['books'])
events_df = pd.read_csv(csv_urls['events'])

In [16]:
# Calculation from 2-book-activity.ipynb
SURVIVING_BOOK_EVENT_ESTIMATE = 0.2734

# In the function psi of t, t is a multiple of the number of samples N.
#  To know what the number of books missing from the book events are, we need
#  to know the how many multiples of N we need to make our estimate.
total_book_event_multiplier = (1 - SURVIVING_BOOK_EVENT_ESTIMATE) / SURVIVING_BOOK_EVENT_ESTIMATE
total_book_event_multiplier

2.65764447695684

In [28]:
def stopping_criterion(U_is, delta):
    return len(U_is) >= 2 and U_is[-1] - U_is[-2] < delta

def psi_hat_at_infinity(N_ks, delta=0.0001):
    # Suggested algorithm by BBC, see Appendix B
    # Solving U ( 1 - exp(-N_1 / U)) = U_o
    ks_and_N_ks = [(i+1, v) for i, v in enumerate(N_ks)]
    U_o = sum([N_k*np.exp(-k) for k, N_k in ks_and_N_ks])
    U_is = [U_o]
    N_1 = N_ks[0]

    while not stopping_criterion(U_is, delta):
        if U_o >= N_1:
            U_i = 0
        U_i = U_o + U_is[-1] * np.exp(-N_1 / U_is[-1])
        U_is.append(U_i)
    
    return U_is[-1]

def psi_hat_t(t, N_ks):
    ks_and_N_ks = [(i+1, v) for i, v in enumerate(N_ks)]
    return sum([N_k*np.exp(-k) for k, N_k in ks_and_N_ks]) - sum([N_k*np.exp(-k*(1+t)) for k, N_k in ks_and_N_ks])

books_df = pd.read_csv(csv_urls['books'])
surviving_book_count = books_df.shape[0]

book_events_df = events_df[~events_df['item_uri'].isna()]
book_events_df = book_events_df[book_events_df['event_type'] == 'Borrow']
N_ks = book_events_df.groupby('item_uri')['item_uri'].count().sort_values(ascending=False).values
# N_ks = books_df['event_count'].sort_values(ascending=False).values

number_of_missing_book_event_books = psi_hat_t(total_book_event_multiplier, N_ks)
book_events_percent_surviving = round(surviving_book_count/(number_of_missing_book_event_books + surviving_book_count) * 100, 2)

number_of_missing_library_books = psi_hat_at_infinity(N_ks)
library_percent_surviving = round(surviving_book_count/(number_of_missing_library_books + surviving_book_count) * 100, 2)

print(f'Number of missing books from book events: {int(number_of_missing_book_event_books)}')
print(f'Percent surviving: {book_events_percent_surviving}')
print(f'Number of books missing from library: {int(number_of_missing_library_books)}')
print(f'Percent surviving: {library_percent_surviving}')

Number of missing books from book events: 59
Percent surviving: 99.03
Number of books missing from library: 82
Percent surviving: 98.64
