In [2]:
# In my experience, many librarians are skeptical about AI tools.
# In many cases, I agree with this skepticism, but I do think there
# are some good use cases. This is one simple example. Given a spreadsheet
# of books, it tries to guess whether each book is in a particular
# subject area or not (in this case, mathematics, statistics, or data
# science) based on the title. It is, of course, not perfect, but
# could be a great tool for a list which is too large to be worth
# the trouble for human review. In particular, this takes about a
# minute per 100 books, and this speed could be worth the decreased
# accuracy if it means the classification gets done rather than not
# at all. One could also experiment with different models
# and different setups to get better results.
import pandas as pd
from transformers import pipeline
import time

In [7]:
# Read the book spreadsheet and pull the 'Title' column out as a plain
# Python list; later cells iterate over `titles` and write scores to `df`.
df = pd.read_excel('AMSBooks.xlsx')
titles = df['Title'].to_list()

In [4]:
# Subject areas each title is scored against (one score column per label).
labels = ["mathematics", "statistics", "data science"]

# Zero-shot classifier built on the BART-large MNLI checkpoint.
classifier = pipeline(
    task='zero-shot-classification',
    model='facebook/bart-large-mnli',
)

In [13]:
# Time the classifier on a small sample of titles and extrapolate the
# cost of scoring the whole list.
sample_size = 10  # Adjust sample size for a quick test

# Blank spreadsheet cells come through as NaN, which the classifier cannot
# take as input, so they are left out of the timed sample. The slice may
# also be shorter than sample_size when the list itself is short.
sample_titles = [str(title) for title in titles[:sample_size] if not pd.isna(title)]

# perf_counter is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
for title in sample_titles:
    for label in labels:
        # One call per (title, label) pair, mirroring the full scoring loop.
        classifier(title, candidate_labels=[label])
end_time = time.perf_counter()

# Timing estimate
total_time = end_time - start_time
# Average over the titles actually timed, not the requested sample size,
# and avoid dividing by zero when nothing was timed.
average_time_per_title = total_time / len(sample_titles) if sample_titles else 0.0
estimated_total_time = average_time_per_title * len(titles)

print(f"Estimated total run time: {estimated_total_time / 60:.2f} minutes for {len(titles)} titles")

Estimated total run time: 9.94 minutes for 1285 titles


In [14]:
# Score every title against each subject label and store the results in
# one '<label> Score' column per label on df.
#
# Scores are collected in lists and assigned as whole columns afterwards.
# Column assignment from a list is positional, so it does not depend on
# df having the default 0..n-1 index the way df.at[i, ...] with an
# enumerate() counter does.
score_columns = {f'{label} Score': [] for label in labels}
for title in titles:
    # Blank spreadsheet cells come through as NaN; the classifier cannot
    # take them as input, so those rows get a NaN score instead of
    # aborting the whole run.
    has_title = not pd.isna(title)
    for label in labels:
        if has_title:
            # str() also covers titles that Excel stored as numbers.
            result = classifier(str(title), candidate_labels=[label])
            score = result['scores'][0]
        else:
            score = float('nan')
        score_columns[f'{label} Score'].append(score)

# Raises ValueError if titles and df have drifted out of step in length,
# rather than silently writing scores to the wrong rows.
for column_name, scores in score_columns.items():
    df[column_name] = scores

In [15]:
# Save the scored book list to a new workbook, leaving out the row index.
df.to_excel(excel_writer='classified_books.xlsx', index=False)