# Counting volumes published during an author's career

The goal here is to get a rough sense of how widely an author was purchased -- especially by libraries -- during his or her career. We don't necessarily want to punish authors for dying, so we don't need to mess with death dates. Instead we'll roughly define a "career" as a period thirty years before, and after, a "midcareer" date — which is simply the mean publication date recorded for that author in a set of clean metadata dated as close as possible to first publication.

In other words, say our "clean" metadata includes four volumes by Thomas Hardy: *Desperate Remedies,* dated to 1874, and three vols of *Tess,* dated to 1891. The mean date will be 1887, so we'll count volumes by Thomas Hardy published between 1857 and 1917. It's not a perfect system, but it's going to capture much of his contemporary popularity, without counting reprints that were driven more by twentieth-century academic fashion. We also set hard limits at 1835 and 1965.

The metadata I use is a list of fiction volumes in HathiTrust; since HathiTrust often includes multiple copies of an edition -- but by no means covers every copy in every library -- this ends up being somewhere between a count of *editions* and a count of *individual volumes.*

A lot of the effort below goes into capturing name variants. Messy details that have to be handled include pseudonyms and initials.


In [14]:
# Counting author occurrences is complicated by the Augean stable
# that is real-world metadata. Authors' names can be spelled differently,
# with initials or names spelled out,
# there can be pseudonyms, authors listed under their husband's name, etc.

from collections import Counter
import pandas as pd
import csv

def forceint(astring):
    try:
        intval = int(astring)
    except:
        intval = 0

    return intval

def start_the_same(oneauthor, anotherauthor):
    if len(oneauthor) < 5 or len(anotherauthor) < 5:
        return False
    elif oneauthor[0:4].lower() == anotherauthor[0:4].lower():
        return True
    else:
        return False

# I've got a database of about 1600 works that I've manually cleaned to
# be relatively consistent. But now I want to rediscover the mess, in order to
# create a dictionary of aliases.

aliases = dict()

# We get aliases from multiple sources. There is a special pseudonym file:

with open('pseudonyms.csv', encoding = 'utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        aliases[row['pseudonym']] = row['ourname']

# Another source of aliases is in the clean database itself. I have
# a column "othername" that records pseudonyms (or real names in cases
# where the pseudonym is more common).

# While gathering aliases from the clean data we will also 
# initialize some dictionaries to use later.

authorcounts = Counter()
# that's where we will ultimately gather our data
existingauthors = set()

title2name = dict()
docid2name = dict()
# these mappings will help us generate additional aliases

publicationdatesforauth = dict()

def harvest_info(df, existingauthors, title2name, docid2name, publicationdatesforauth, aliases):
    ''' Gathers information from a single file.
    '''
    for i in df.index:
        mainname = df.loc[i, 'author'].strip(', .')
        if '.' in mainname:
            spacedout = mainname.replace('.', '. ')
            aliases[spacedout] = mainname
            condensed = mainname.replace('. ', '.')
            aliases[condensed] = mainname
            
        existingauthors.add(mainname)
        othername = str(df.loc[i, 'othername'])

        title = df.loc[i,'title'].strip(', .')
        docid = df.loc[i, 'docid']

        date = forceint(df.loc[i, 'earliestdate'])
        if mainname not in publicationdatesforauth:
            publicationdatesforauth[mainname] = []
        publicationdatesforauth[mainname].append(date)

        docid2name[docid] = mainname
        if len(title) > 5:
            title2name[title] = mainname

        if len(othername) < 4:
            continue
        elif othername == 'Mrs':
            othername = mainname + ',' + 'Mrs'
            aliases[othername] = mainname.strip()
        elif ',' not in othername:
            # this is not in lastname, firstnames order yet
            words = othername.rpartition(' ')
            # that divides only on the *last* space
            # e.g.
            # 'Edgar Allan Poe'.rpartition(' ')
            # Out[13]: ('Edgar Allan', ' ', 'Poe')

            if len(words) < 2:
                aliases[othername] = mainname
            else:
                othername = words[1] + ", " + words[0]
                aliases[othername] = mainname
        else:
            # this is already in right order
            aliases[othername] = mainname

cleanmeta = pd.read_csv('../fiction/prestigeficmeta.csv')
bestsellermeta = pd.read_csv('../sales/bestsellermetadata.csv')

harvest_info(cleanmeta, existingauthors, title2name, docid2name, publicationdatesforauth, aliases)
harvest_info(bestsellermeta, existingauthors, title2name, docid2name, publicationdatesforauth, aliases)

# Now that we have mappings of docids and titles to clean author names, we can use
# a bigger messier database to find aliases.

# We'll go through a larger database of 100,000 vols, and
# every time we find a book that we know is by author X, with a subtly different author
# name (Y), we can create an alias: aliases[Y] = X.

print('getting aliases')

def find_alias(docid, auth, title, docid2name, title2name, aliases, existingauthors):
    ''' Using metadata drawn from a single row of a database, decides
    whether this author name (auth) should be considered an alias for another
    author we had in our clear metadata.
    '''
    if docid in docid2name and auth not in existingauthors:
        if auth != docid2name[docid] and len(auth) > 3:
            aliases[auth] = docid2name[docid]

    if title in title2name and auth not in existingauthors:
        wehadthisbookwrittenby = title2name[title]

        # We have a book with a title that also occurred in our
        # clean metadata. If it's listed under a variant version
        # of the same author, let's create an alias. That means we have to
        # ensure, first, that it's not exactly the same

        if auth == wehadthisbookwrittenby:
            # the book is listed under the same name in the clean metadata
            # no alias needed
            return

        # and second, that it's close enough to be the same person
        if start_the_same(auth, wehadthisbookwrittenby):
            aliases[auth] = wehadthisbookwrittenby
            
print(len(aliases))
            
with open('/Users/tunder/work/genre/metadata/ficmeta.csv', encoding = 'latin-1') as f:
    reader = csv.DictReader(f)
    for row in reader:
        docid = row['htid']
        auth = row['author'].strip(', .')
        title = row['title'].split('|')[0].strip(', .')
        find_alias(docid, auth, title, docid2name, title2name, aliases, existingauthors)
        
print(len(aliases))
        
with open('/Users/tunder/Dropbox/python/train20/subfiction/filteredfiction.csv') as f:
    reader = csv.DictReader(f)
    for row in reader:
        date = int(row['inferreddate'])
        if date > 1959:
            continue
        docid = row['docid']
        auth = row['author'].strip(', .')
        title = row['title'].split('|')[0].strip(', .')
        find_alias(docid, auth, title, docid2name, title2name, aliases, existingauthors)
        
print(len(aliases))

# Now we're going to actually count authors' works in univ. libraries.
# But we're only going to do that for a 50-year span we call the author's
# career.

careers = dict()

for authorname, pubdates in publicationdatesforauth.items():
    if len(pubdates) < 1:
        print("that shouldn't happen")
    thiscareer = dict()
    thiscareer['midpoint'] = sum(pubdates) / len(pubdates)
    thiscareer['start'] = thiscareer['midpoint'] - 30
    thiscareer['end'] = thiscareer['midpoint'] + 31
    
    # But we only allow careers to go ten years beyond our
    # timeline endpoints. We're placing this limit to echo the
    # constraints on our bestseller lists.
    
    if thiscareer['start'] < 1835:
        thiscareer['start'] = 1835
    if thiscareer['end'] > 1965:
        thiscareer['end'] = 1965
    
    careers[authorname] = thiscareer

# okay, now we can count references

# I know I'm not being super-efficient about disk access, but
# this is not a frequently-run process, and premature
# optimization is the root of all evil.

def checkrow(auth, date, existingauthors, careers, authorcounts, aliases):
    if auth in existingauthors:
        if date >= careers[auth]['start'] and date < careers[auth]['end']:
            authorcounts[auth] += 1
    elif auth in aliases:
        correct_author = aliases[auth]
        if date >= careers[correct_author]['start'] and date < careers[correct_author]['end']:
            authorcounts[correct_author] += 1

with open('/Users/tunder/work/genre/metadata/ficmeta.csv', encoding = 'latin-1') as f:
    reader = csv.DictReader(f)
    for row in reader:
        date = forceint(row['startdate'])
        auth = row['author'].strip(', .')
        checkrow(auth, date, existingauthors, careers, authorcounts, aliases)

with open('/Users/tunder/Dropbox/python/train20/subfiction/filteredfiction.csv') as f:
    reader = csv.DictReader(f)
    for row in reader:
        date = forceint(row['inferreddate'])
        auth = row['author'].strip(',. ')
        checkrow(auth, date, existingauthors, careers, authorcounts, aliases)

with open('/Users/tunder/Dropbox/pulp/pulpstories.csv', encoding = 'latin-1') as f:
    reader = csv.DictReader(f)
    for row in reader:
        date = forceint(row['magpubdate'])
        auth = row['authorname']
        checkrow(auth, date, existingauthors, careers, authorcounts, aliases)

with open('/Users/tunder/Dropbox/pulp/novelsupplement.csv', encoding = 'latin-1') as f:
    reader = csv.DictReader(f)
    for row in reader:
        date = forceint(row['serialend'])
        auth = row['authorname']
        checkrow(auth, date, existingauthors, careers, authorcounts, aliases)
    
print('Done.')
print(len(authorcounts))
with open('career_volumes.csv', mode = 'w', encoding = 'utf-8') as f:
    scribe = csv.DictWriter(f, fieldnames = ['author', 'raw_num_vols', 'midcareer'])
    scribe.writeheader()
    for author in existingauthors:
        if author == '<blank>':
            continue
        thisrow = dict()
        thisrow['author'] = author
        thisrow['raw_num_vols'] = authorcounts[author]
        thisrow['midcareer'] = int(careers[author]['midpoint'])
        scribe.writerow(thisrow)

getting aliases
302
343
386
Done.
1040
