In [1]:
# review & refine matches from Loc catalog
import pandas as pd

In [30]:
matches = pd.read_csv("shxco_loc_matches.csv")
total_matches = matches.shape[0]
print(f"Reviewing {total_matches:,} matches")

Reviewing 4,981 matches


In [16]:
# catalog record has trailing slash on the title, remove it
matches["title"] = matches.title.apply(lambda x: x.strip(" /"))

In [75]:
# identify matches where title and author match almost exactly; these are fine, no need to review those
import re
from slugify import slugify

# remove series names from titles; split on opening paren and return whatever is first
matches["sco_title_clean"] = matches.sco_title.apply(lambda x: x.split("(")[0])
# remove subtitles after semicolon or colon
matches["sco_title_nosub"] = matches.sco_title.apply(lambda x: re.split(r'[;:] ', x)[0])

# use slugify to ignore differences in case, accent, punctuation
matches["title_exact_match"] = matches.apply(lambda row: slugify(row.sco_title) == slugify(row.title), axis=1)
matches["title_nosub_exact_match"] = matches.apply(lambda row: slugify(row.sco_title_nosub) == slugify(row.title), axis=1)

# replace empty author NaNs with empty string to simplify comparison
matches.author = matches.author.fillna("")
matches.sco_author = matches.sco_author.fillna("")
# this check probably only works with single authors, but that's fine
matches["author_exact_match"] = matches.apply(lambda row: row.sco_author.lower() == row.author.lower(), axis=1)
title_exact = matches[matches.title_exact_match]
title_nosub_exact = matches[matches.title_nosub_exact_match]
author_exact = matches[matches.author_exact_match]
title_author_exact = matches[matches.title_exact_match & matches.author_exact_match]
title_author_exact_total = title_author_exact.shape[0]
title_author_exact_total = title_author_exact.shape[0]

title_or_titlenosub_author_exact = matches[(matches.title_exact_match | matches.title_nosub_exact_match) & matches.author_exact_match]
title_or_titlenosub_author_exact_total = title_or_titlenosub_author_exact.shape[0]

# reconcile search was based on author, so if author is set AT ALL and title matches with or without subtitle, we can include without review
title_or_titlenosub_with_author = matches[(matches.title_exact_match | matches.title_nosub_exact_match) & matches.author.notna()]
title_or_titlenosub_with_author_total = title_or_titlenosub_with_author.shape[0]

# another case: title matches and year matches
matches["sco_date"] = matches.date.astype("str")
title_or_titlenosub_with_year = matches[(matches.title_exact_match | matches.title_nosub_exact_match) & (matches.sco_date == matches.date)]
title_or_titlenosub_with_year_total = title_or_titlenosub_with_year.shape[0]


print(f"{title_exact.shape[0]:,} with exact title match")
print(f"{title_nosub_exact.shape[0]:,} with exact title match when ignoring subtitles")
print(f"{author_exact.shape[0]:,} with exact author match")
print(f"{title_author_exact_total:,} with exact title and author match")
print(f"{title_or_titlenosub_author_exact_total:,} with exact title and author match when ignoring subtitles")
print(f"{title_or_titlenosub_with_author_total:,} with exact title and author is present, matching title with or without subtitles")

print(f"{title_or_titlenosub_with_year_total:,} with title match (with or without subtitle) and year match")

print(f"{total_matches - title_or_titlenosub_with_author_total:,} matches to review based on author/title match")
print(f"{total_matches - title_or_titlenosub_with_year_total:,} matches to review based on title/year match")


4,019 with exact title match
3,451 with exact title match when ignoring subtitles
3,898 with exact author match
3,270 with exact title and author match
3,309 with exact title and author match when ignoring subtitles
4,065 with exact title and author is present, matching title with or without subtitles
4,056 with title match (with or without subtitle) and year match
916 matches to review based on author/title match
925 matches to review based on title/year match


In [76]:
# identify matches that shoudl be reviewed
# - remove matches where title or subtitle matches and author is present
review_matches = matches[~((matches.title_exact_match | matches.title_nosub_exact_match) & matches.author.notna())].copy()
# - exclude any matches where title or subtitle matches and year matches
review_matches = review_matches[~(review_matches.title_exact_match | review_matches.title_nosub_exact_match) & (review_matches.sco_date == review_matches.date)]
review_matches

Unnamed: 0,sco_id,sco_title,title,sco_author,author,sco_date,date,id,subject,title_exact_match,author_exact_match,sco_title_clean,sco_title_nosub,title_nosub_exact_match
2,https://shakespeareandco.princeton.edu/books/n...,Gorboduc,The tragedy of Gorboduc,"Norton, Thomas","norton, thomas;john davis batchelder collectio...",1736,1736,http://lccn.loc.gov/96195076,tragedies,False,False,Gorboduc,Gorboduc,False
4,https://shakespeareandco.princeton.edu/books/s...,The Shepheard's Calendar,The faerie queen ; The shepheards calendar : t...,"Spenser, Edmund","spenser, edmund;george fabyan collection (libr...",1617,1617,http://lccn.loc.gov/96133755,knights and knighthood;poetry;virtues,False,False,The Shepheard's Calendar,The Shepheard's Calendar,False
5,https://shakespeareandco.princeton.edu/books/g...,The Defence of Conny-Catching,"The blacke bookes messenger, 1592. Cuthbert Co...","Greene, Robert","greene, robert",1966,1966,http://lccn.loc.gov/66007098,social life and customs;england;london (englan...,False,True,The Defence of Conny-Catching,The Defence of Conny-Catching,False
6,https://shakespeareandco.princeton.edu/books/s...,A Midsummer's Night's Dream,A midsummer night's dream,"Shakespeare, William","lake, james h.;shakespeare, william;ford, john r.",2010,2010,http://lccn.loc.gov/2015460287,a midsummer night's dream;athens (greece);man-...,False,False,A Midsummer's Night's Dream,A Midsummer's Night's Dream,False
12,https://shakespeareandco.princeton.edu/books/d...,La géométrie,La géométrie de René Descartes.,"Descartes, René","descartes, rené",1886,1886,http://lccn.loc.gov/2001554104,"paris;france;geometry, analytic",False,True,La géométrie,La géométrie,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4976,https://shakespeareandco.princeton.edu/books/m...,Works of Marlowe,"The works of Christopher Marlowe,","Marlowe, Christopher","marlowe, christopher",1970,1970,http://lccn.loc.gov/73118945,,False,True,Works of Marlowe,Works of Marlowe,False
4977,https://shakespeareandco.princeton.edu/books/m...,[Morand Book],Paul Morand.,"Morand, Paul","morand, paul",1966,1966,http://lccn.loc.gov/66076909,,False,True,[Morand Book],[Morand Book],False
4978,https://shakespeareandco.princeton.edu/books/j...,[something by E. B. C. Jones],[J.E. Jones],"Jones, E. B. C.","jones, j. e.;hartman, george",1938-11-03,1938-11-03,http://www.loc.gov/item/wpalh000861/,field notes;local history;narratives;life hist...,False,False,[something by E. B. C. Jones],[something by E. B. C. Jones],False
4979,https://shakespeareandco.princeton.edu/books/s...,[Steinbeck's new one],La perla,"Steinbeck, John","steinbeck, john;bernal granados, gabriel",2019,2019,http://lccn.loc.gov/2018034092,parables,False,False,[Steinbeck's new one],[Steinbeck's new one],False


In [77]:
# use levenshtein distance to calculate how similar the title and matched titles are
from Levenshtein import distance

# lower case, since we don't care about differences in case at this point
review_matches["title_dist"] = review_matches.apply(lambda row: distance(row.sco_title_clean.lower(), row.title.lower()), axis=1)

In [78]:
review_matches = review_matches.sort_values("title_dist", ascending=False)
review_matches[["sco_title", "sco_title_clean", "title", "title_dist", "sco_author", "author", "title_dist"]]

Unnamed: 0,sco_title,sco_title_clean,title,title_dist,sco_author,author,title_dist.1
2074,May Fair,May Fair,May Fair: being an entertainment purporting to...,326,"Arlen, Michael","arlen, michael",326
14,Vulgar Errors (Pseudodoxia Epidemica),Vulgar Errors,"The works of the learned Sr Thomas Brown, Kt.,...",313,"Browne, Thomas","browne, thomas",313
397,The American Commonwealth,The American Commonwealth,The proposal to add six additional justices to...,270,"Bryce, James",printed ephemera collection (library of congre...,270
4021,The Flying Wasp,The Flying Wasp,The flying wasp; a laughing look-over of what ...,269,"O'Casey, Sean","o'casey, sean",269
2196,Complete History of the Lives and Robberies of...,Complete History of the Lives and Robberies of...,A complete history of the lives and robberies ...,248,"Smith, Alexander","smith, alexander",248
...,...,...,...,...,...,...,...
2335,The Silver Spoon (A Modern Comedy),The Silver Spoon,The silver spoon /,1,"Galsworthy, John","galsworthy, john",1
2084,Original Letters from India (1779 – 1815),Original Letters from India,Original letters from India,1,"Fay, Eliza","fay, eliza;forster, e. m. (edward morgan)",1
4631,The Trap (Pilgrimage 8),The Trap,"The trap,",1,"Richardson, Dorothy M.","richardson, dorothy m. (dorothy miller)",1
4781,Clear Horizon (Pilgrimage 11),Clear Horizon,Clear horizon,1,"Richardson, Dorothy M.","richardson, dorothy m. (dorothy miller)",1


In [81]:
# load partial review done so far
partial_review = pd.read_csv("shxco_loc_matches_review_partial.csv")
# keep just the sco id, keep, and notes columns
partial_review = partial_review[["sco_id", "keep", "notes"]]
partial_review

Unnamed: 0,sco_id,keep,notes
0,https://shakespeareandco.princeton.edu/books/g...,1.0,
1,https://shakespeareandco.princeton.edu/books/c...,1.0,
2,https://shakespeareandco.princeton.edu/books/t...,1.0,
3,https://shakespeareandco.princeton.edu/books/s...,1.0,
4,https://shakespeareandco.princeton.edu/books/p...,1.0,
...,...,...,...
1667,https://shakespeareandco.princeton.edu/books/g...,,
1668,https://shakespeareandco.princeton.edu/books/s...,,
1669,https://shakespeareandco.princeton.edu/books/o...,,
1670,https://shakespeareandco.princeton.edu/books/v...,,


In [83]:
review_merge = pd.merge(review_matches, partial_review, on="sco_id")
review_merge

Unnamed: 0,sco_id,sco_title,title,sco_author,author,sco_date,date,id,subject,title_exact_match,author_exact_match,sco_title_clean,sco_title_nosub,title_nosub_exact_match,title_dist,keep,notes
0,https://shakespeareandco.princeton.edu/books/a...,May Fair,May Fair: being an entertainment purporting to...,"Arlen, Michael","arlen, michael",1925,1925,http://lccn.loc.gov/25010419,fiction;great britain;george v;london (england...,False,True,May Fair,May Fair,False,326,,
1,https://shakespeareandco.princeton.edu/books/b...,Vulgar Errors (Pseudodoxia Epidemica),"The works of the learned Sr Thomas Brown, Kt.,...","Browne, Thomas","browne, thomas",1686,1686,http://lccn.loc.gov/20007180,physicians;gardening;urn burial;english prose ...,False,True,Vulgar Errors,Vulgar Errors (Pseudodoxia Epidemica),False,313,,
2,https://shakespeareandco.princeton.edu/books/b...,The American Commonwealth,The proposal to add six additional justices to...,"Bryce, James",printed ephemera collection (library of congre...,1937,1937,http://lccn.loc.gov/2020782636,,False,False,The American Commonwealth,The American Commonwealth,False,270,,
3,https://shakespeareandco.princeton.edu/books/o...,The Flying Wasp,The flying wasp; a laughing look-over of what ...,"O'Casey, Sean","o'casey, sean",1971,1971,http://lccn.loc.gov/70173866,theater;english drama;20th century;history and...,False,True,The Flying Wasp,The Flying Wasp,False,269,1.0,
4,https://shakespeareandco.princeton.edu/books/s...,Complete History of the Lives and Robberies of...,A complete history of the lives and robberies ...,"Smith, Alexander","smith, alexander",1926,1926,http://lccn.loc.gov/27013632,great britain;thieves;crime and criminals;brig...,False,True,Complete History of the Lives and Robberies of...,Complete History of the Lives and Robberies of...,False,248,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899,https://shakespeareandco.princeton.edu/books/g...,The Silver Spoon (A Modern Comedy),The silver spoon /,"Galsworthy, John","galsworthy, john",2007,2007,http://lccn.loc.gov/2009294787,forsyte family (fictitious characters);fiction,False,True,The Silver Spoon,The Silver Spoon (A Modern Comedy),False,1,,
900,https://shakespeareandco.princeton.edu/books/f...,Original Letters from India (1779 – 1815),Original letters from India,"Fay, Eliza","fay, eliza;forster, e. m. (edward morgan)",2010,2010,http://lccn.loc.gov/2009031818,correspondence;social life and customs;england...,False,False,Original Letters from India,Original Letters from India (1779 – 1815),False,1,,
901,https://shakespeareandco.princeton.edu/books/r...,The Trap (Pilgrimage 8),"The trap,","Richardson, Dorothy M.","richardson, dorothy m. (dorothy miller)",1925,1925,http://lccn.loc.gov/25019174,,False,False,The Trap,The Trap (Pilgrimage 8),False,1,,
902,https://shakespeareandco.princeton.edu/books/r...,Clear Horizon (Pilgrimage 11),Clear horizon,"Richardson, Dorothy M.","richardson, dorothy m. (dorothy miller)",1935,1935,http://lccn.loc.gov/36009943,autobiographical fiction;fiction;england;women,False,False,Clear Horizon,Clear Horizon (Pilgrimage 11),False,1,,


In [86]:
# how many left to review?
review_merge[~review_merge.keep.notna()]

Unnamed: 0,sco_id,sco_title,title,sco_author,author,sco_date,date,id,subject,title_exact_match,author_exact_match,sco_title_clean,sco_title_nosub,title_nosub_exact_match,title_dist,keep,notes
0,https://shakespeareandco.princeton.edu/books/a...,May Fair,May Fair: being an entertainment purporting to...,"Arlen, Michael","arlen, michael",1925,1925,http://lccn.loc.gov/25010419,fiction;great britain;george v;london (england...,False,True,May Fair,May Fair,False,326,,
1,https://shakespeareandco.princeton.edu/books/b...,Vulgar Errors (Pseudodoxia Epidemica),"The works of the learned Sr Thomas Brown, Kt.,...","Browne, Thomas","browne, thomas",1686,1686,http://lccn.loc.gov/20007180,physicians;gardening;urn burial;english prose ...,False,True,Vulgar Errors,Vulgar Errors (Pseudodoxia Epidemica),False,313,,
2,https://shakespeareandco.princeton.edu/books/b...,The American Commonwealth,The proposal to add six additional justices to...,"Bryce, James",printed ephemera collection (library of congre...,1937,1937,http://lccn.loc.gov/2020782636,,False,False,The American Commonwealth,The American Commonwealth,False,270,,
4,https://shakespeareandco.princeton.edu/books/s...,Complete History of the Lives and Robberies of...,A complete history of the lives and robberies ...,"Smith, Alexander","smith, alexander",1926,1926,http://lccn.loc.gov/27013632,great britain;thieves;crime and criminals;brig...,False,True,Complete History of the Lives and Robberies of...,Complete History of the Lives and Robberies of...,False,248,,
7,https://shakespeareandco.princeton.edu/books/g...,The Gold of [unclear],Original Unpublished Collection of Maps Relati...,,"martínez compañon y bujanda, baltazar jaime",1782,1782,http://www.loc.gov/item/2021667799/,missionaries;missions;codex;cities and towns;1...,False,False,The Gold of [unclear],The Gold of [unclear],False,225,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
898,https://shakespeareandco.princeton.edu/books/g...,The Man of Property (The Forsyte Saga),The man of property.,"Galsworthy, John","galsworthy, john",1921,1921,http://lccn.loc.gov/25022191,forsyte family (fictitious characters);fiction,False,True,The Man of Property,The Man of Property (The Forsyte Saga),False,1,,
899,https://shakespeareandco.princeton.edu/books/g...,The Silver Spoon (A Modern Comedy),The silver spoon /,"Galsworthy, John","galsworthy, john",2007,2007,http://lccn.loc.gov/2009294787,forsyte family (fictitious characters);fiction,False,True,The Silver Spoon,The Silver Spoon (A Modern Comedy),False,1,,
900,https://shakespeareandco.princeton.edu/books/f...,Original Letters from India (1779 – 1815),Original letters from India,"Fay, Eliza","fay, eliza;forster, e. m. (edward morgan)",2010,2010,http://lccn.loc.gov/2009031818,correspondence;social life and customs;england...,False,False,Original Letters from India,Original Letters from India (1779 – 1815),False,1,,
901,https://shakespeareandco.princeton.edu/books/r...,The Trap (Pilgrimage 8),"The trap,","Richardson, Dorothy M.","richardson, dorothy m. (dorothy miller)",1925,1925,http://lccn.loc.gov/25019174,,False,False,The Trap,The Trap (Pilgrimage 8),False,1,,


In [57]:
# save to csv for manual review; specify encoding so we get byte order mark for excel
review_matches.to_csv("shxco_loc_matches_review.csv", index=False, encoding='utf-8-sig')

In [87]:
# save to csv for manual review; specify encoding so we get byte order mark for excel
review_merge.to_csv("shxco_loc_matches_review.csv", index=False, encoding='utf-8-sig')

In [95]:
# manually reviewed the ones that were questionable and indicated keep yes or no
# use the manual review to filter the full set of matches

reviewed = pd.read_csv("shxco_loc_matches_reviewed.csv")
matches = pd.read_csv("shxco_loc_matches.csv")

final_matches = matches[~matches.sco_id.isin(reviewed[reviewed.keep == 0].sco_id)]
final_matches

Unnamed: 0,sco_id,sco_title,title,sco_author,author,sco_date,date,id,subject
0,https://shakespeareandco.princeton.edu/books/m...,Le Morte d'Arthur,Le morte d'Arthur /,"Malory, Thomas","malory, thomas",1485.0,1985,http://lccn.loc.gov/85240163,arthurian romances
1,https://shakespeareandco.princeton.edu/books/m...,Utopia,Utopia /,"More, Thomas","more, thomas",1516.0,1999,http://lccn.loc.gov/00268291,utopias;early works to 1800
2,https://shakespeareandco.princeton.edu/books/n...,Gorboduc,The tragedy of Gorboduc,"Norton, Thomas","norton, thomas;john davis batchelder collectio...",1561.0,1736,http://lccn.loc.gov/96195076,tragedies
3,https://shakespeareandco.princeton.edu/books/l...,Euphues: The Anatomy of Wit,Euphues. The anatomy of wit.,"Lyly, John","lyly, john",1578.0,1868,http://lccn.loc.gov/12008363,triangles (interpersonal relations);naples (it...
4,https://shakespeareandco.princeton.edu/books/s...,The Shepheard's Calendar,The faerie queen ; The shepheards calendar : t...,"Spenser, Edmund","spenser, edmund;george fabyan collection (libr...",1579.0,1617,http://lccn.loc.gov/96133755,knights and knighthood;poetry;virtues
...,...,...,...,...,...,...,...,...,...
4974,https://shakespeareandco.princeton.edu/books/j...,Work in Progress,Work in progress.,"Joyce, James","joyce, james",,1927,http://lccn.loc.gov/28001908,
4975,https://shakespeareandco.princeton.edu/books/l...,Works of Charles Lamb,The works of Charles Lamb.,"Lamb, Charles","lamb, charles",,1840,http://lccn.loc.gov/unk83030218,
4976,https://shakespeareandco.princeton.edu/books/m...,Works of Marlowe,"The works of Christopher Marlowe,","Marlowe, Christopher","marlowe, christopher",,1970,http://lccn.loc.gov/73118945,
4977,https://shakespeareandco.princeton.edu/books/m...,[Morand Book],Paul Morand.,"Morand, Paul","morand, paul",,1966,http://lccn.loc.gov/66076909,


In [96]:
final_matches.to_csv("shxco_loc_matches_final.csv", index=False)