In [1]:
import datasets
import polars as pl
from datetime import datetime


The datasets have been generated using this [fork](https://github.com/baberabb/cce-python). The main logic is,
for each registration and each matching renewal entry:

    1. check if date matches -> exact match
        2. if not then check if year matches
            3. if not then check if the normalized authors/title match


In [260]:
# Load four configurations of the "baber/pdbooks" dataset and convert the
# "train" split of each one to a pandas DataFrame. The loads run in the order
# listed; note that `registrations_unmatched` comes from the
# "registrations_not_renewed" configuration.
(
    renewals_unmatched,
    renewals_matched,
    registrations_all,
    registrations_unmatched,
) = (
    datasets.load_dataset("baber/pdbooks", config_name)["train"].to_pandas()
    for config_name in (
        "renewals_unmatched",
        "renewals_matched",
        "registrations_all",
        "registrations_not_renewed",
    )
)

We'll use polars, but any frame can be converted back to pandas with `.to_pandas()`.

In [384]:
# Convert each pandas frame to its polars counterpart (same order as loaded).
ren_unmatched, ren_matched, reg_all, reg_unmatched = map(
    pl.from_pandas,
    (
        renewals_unmatched,
        renewals_matched,
        registrations_all,
        registrations_unmatched,
    ),
)

# print(f"Total renewals unmatched: {len(ren_unmatched)}")
print(f"Total registrations remaining: {len(reg_unmatched)}")

Total registrations remaining: 560350


That is a lot of unmatched renewals, but they include non-book entries as well. Subsetting to just the registration numbers we have in the reg dataset and filtering the dates, we get ~8000 matched numbers with _all_ registrations and ~4000 with the registrations remaining (unmatched, not foreign, etc.). Looking at all registrations:

In [385]:
def filter_dates(x):
    """Build a boolean polars expression over the list-of-strings column ``x``.

    The expression is true for a row when any of the following holds:

    * at least one list entry, parsed with ``str.to_date(strict=False,
      exact=False)``, lies between 1929-01-01 and 1964-01-01;
    * the column value itself is null;
    * the per-list element count is below 1 (i.e. the list is empty).

    Args:
        x: Name of the column to test.

    Returns:
        A polars expression suitable for ``.filter(...)``.
    """
    dates = pl.col(x)
    window_start = datetime(1929, 1, 1)
    window_end = datetime(1964, 1, 1)

    # Evaluated once per list: does any entry parse to a date in the window?
    entry_in_window = (
        pl.element()
        .str.to_date(strict=False, exact=False)
        .is_between(window_start, window_end)
    )
    has_date_in_window = dates.list.eval(entry_in_window).list.any()

    # Evaluated once per list: compare the number of entries with 1.
    has_no_entries = dates.list.eval(pl.element().len() < 1).list.any()

    return has_date_in_window | dates.is_null() | has_no_entries

def normalize_regnums(x):
    """Return a polars expression for column ``x`` with every "-" removed.

    Args:
        x: Name of a string column.

    Returns:
        The column expression with all hyphens replaced by the empty string.
    """
    hyphen_free = pl.col(x).str.replace_all("-", "")
    return hyphen_free

In [398]:
from polars import col

# Registrations, reshaped lazily to one row per registration number.
# Step 1: turn empty strings into nulls in every string column.
reg_all_ = reg_all.lazy().with_columns(pl.col(pl.Utf8).replace("", None))
# Step 2: keep only the rows accepted by filter_dates on "reg_dates".
reg_all_ = reg_all_.filter(filter_dates("reg_dates"))
# Step 3: one row per entry of the "regnums" list, with hyphens stripped so the
# numbers can be compared with the renewals' "regnum" values.
reg_all_ = reg_all_.explode("regnums").with_columns(normalize_regnums("regnums"))

# Text describing each renewal ("prompt_ren"); the first rule that applies wins:
#   * title is null                       -> part of "full_text" before the first "©"
#   * author is null, title contains "by" -> the title on its own
#   * author is present                   -> "{title} by {author}"
#   * anything else                       -> "{title} by UNKNOWN"
# NOTE(review): str.contains("by") is a plain substring match, so it presumably
# also fires on titles where "by" is only part of a word - confirm this is intended.
prompt_ren_expr = (
    pl.when(col("title").is_null())
    .then(col("full_text").str.split("©").list.first())
    .when(col("author").is_null() & col("title").str.contains("by"))
    .then(pl.format("{}", col("title")))
    .when(col("author").is_not_null())
    .then(pl.format("{} by {}", col("title"), col("author")))
    .otherwise(pl.format("{} by {}", col("title"), pl.lit("UNKNOWN")))
)

# uuids of renewals present in the matched set; used to exclude them below.
already_matched_uuids = ren_matched.lazy().select("uuid")

ren_unmatched_ = (
    ren_unmatched.lazy()
    # empty strings -> null in every string column
    .with_columns(pl.col(pl.Utf8).replace("", None))
    .with_columns(prompt_ren=prompt_ren_expr)
    # one row per registration number, hyphens stripped
    .explode("regnum")
    .with_columns(normalize_regnums("regnum"))
    # drop renewals whose uuid appears in the matched set
    .join(already_matched_uuids, on="uuid", how="anti")
    .filter(filter_dates("reg_date"))
    # avoid a clash with the registrations' own "uuid" column when joining later
    .rename({"uuid": "uuid_ren"})
)

In [404]:
# Candidate (registration, renewal) pairs for the LLM: every registration row
# is paired with every unmatched renewal row that carries the same normalized
# registration number, and each pair gets a text description of the
# registration ("prompt_reg") next to the renewal's "prompt_ren".
#
# `final_df` stays a LazyFrame and is collected separately by later cells
# (display, parquet export, join with the LLM matches). `unique` therefore pins
# `keep="first"` and `maintain_order=True`: with the default `keep="any"` each
# collect is free to keep a different row per duplicate prompt pair, so the
# exported candidates and the frame later joined on uuids could disagree.
# The row kept is still only as stable as the row order produced upstream.
final_df = (
    reg_all_.join(
        ren_unmatched_, how="inner", left_on="regnums", right_on="regnum"
    )
    .with_columns(
        # this logic leaves out the case where there's only authors and publishers
        # 43 rows however none of them are a match
        prompt_reg=pl.when(col("authors").is_not_null())
        .then(
            pl.format(
                "{} by {}",
                col("title"),
                col("authors").list.join(" & "),
            )
        )
        # authors is necessarily null from here on, so only publishers is tested
        .when(col("publishers").is_not_null())
        .then(
            pl.format(
                "{} published by {}", col("title"), col("publishers").list.join(" & ")
            )
        )
        .otherwise(pl.format("{} by UNKNOWN", col("title")))
    )
    # one row per distinct (registration prompt, renewal prompt) pair
    .unique(["prompt_reg", "prompt_ren"], keep="first", maintain_order=True)
    .filter(col("prompt_reg").is_not_null())
)

In [410]:
# Execute the lazy pipeline and display the resulting DataFrame.
final_df.collect()

uuid,regnums,reg_dates,title,authors,publishers,disposition,year,group_title,group_uuid,notes,parent,new_matter_claimed,previous_publications,uuid_ren,reg_date,renewal_id,renewal_date,author,title_right,new_matter,see_also_renewal,see_also_registration,full_text,claimants,notes_right,prompt_ren,prompt_reg
str,str,list[str],str,list[str],list[str],str,str,str,str,list[str],str,list[str],list[str],str,list[str],str,str,str,str,str,list[null],list[null],str,str,str,str,str
"""FC1116C3-7454-…","""A181722""","[""1944-05-23""]","""Strangers in m…","[""Buirgy (Mary)""]",,"""Not renewed.""","""1944""",,,,,,,"""feedd024-b95d-…","[""1955-03-16""]","""RE173961""",,,"""Teacher's manu…",,[],[],,"""Holt, Rinehart…",,"""Teacher's manu…","""Strangers in m…"
"""063B6B35-7455-…","""A184743""","[""1944-11-30""]","""Year book of t…",,"[""Year book publishers, inc.""]","""Not renewed.""","""1944""",,,,,,,"""c9fca8e4-8a05-…","[""1954-09-23""]","""RE150894""",,,"""Smoke jumpers.…",,[],[],,"""Nels Jorgensen…",,"""Smoke jumpers.…","""Year book of t…"
"""3D6C9285-734A-…","""A176738""","[""1943-10-13""]","""Turquoise path…","[""Rousseau (Christine McConnell)""]","[""Broadman press""]","""Not renewed.""","""1943""",,,,,,,"""d0c17431-56e1-…","[""1955-02-23""]","""RE159740""",,,"""History of art…",,[],[],,"""Jean Anne Vinc…",,"""History of art…","""Turquoise path…"
"""428DAE0A-734A-…","""A170646""","[""1943-01-26""]","""American agric…","[""Barger (Harold)"", ""H. Barger"", ""Hans H. Landsberg""]","[""Natl. bureau of economic research, inc.""]","""Not renewed.""","""1943""",,,,,,,"""3586e70a-a649-…","[""1950-12-23""]","""RE7313""",,,"""For presenting…",,[],[],,"""Helen Seitter …",,"""For presenting…","""American agric…"
"""429007E2-734A-…","""A170652""","[""1943-01-18""]","""L’imitation de…","[""Lelen (J. M.)""]","[""Catholic book pub. co.""]","""Not renewed.""","""1943""",,,,,,,"""b08f6330-ef8f-…","[""1950-12-20""]","""RE7319""",,,"""Voices from th…",,[],[],,"""Helen Seitter …",,"""Voices from th…","""L’imitation de…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""5B4C5FCB-6E24-…","""A5865""",[],"""For all mankin…","[""Blum, Léon"", ""Léon Blum"", ""W. Pickles""]",,,"""1946""",,,,,,,"""e9d8c94e-c6db-…","[""1929-05-04""]","""R162097""","""1956-01-04""","""MILLARD, WILLI…","""The supplement…",,[],[],"""MILLARD, WILLI…","""William Barret…",,"""The supplement…","""For all mankin…"
"""620E6087-6E24-…","""AF1182""",[],"""… Histoire gén…",,,,"""1946""",,,"[""illus. (1 mounted col.; facsims., maps) plates (part mounted col.) 30×25cm. At head of title: Sous la directio. de mm. Maxims Gorce et Raoul Mortier. Contents.--[v. 1] Grèce. Rome.--[v. 2] Indo-Iraníens. Judaisme. Origines chrétianismes. Christianismes orientaux""]",,"[""v. 1""]",,"""2cdea580-668f-…","[""1944-06-30""]","""R511013""","""1971-08-31""","""MORTIER, RAOUL…","""Histoire gener…",,[],[],"""MORTIER, RAOUL…","""Mme Mortier, n…",,"""Histoire gener…","""… Histoire gén…"
"""632182AF-6E24-…","""A5117""",[],"""The wonderworl…","[""Knox, Warren William"", ""Warren Knox"", … ""Frank Hubbard""]","[""Charles Scribner's sons""]",,"""1946""",,,"[""Book 4 [5, 9]"", ""Book 9 by Morris Meistor, Ralph E. Keirstead and Lois M. Shoemaker""]",,"[""revisions, book 4, 5""]",,"""3e806bfa-24fe-…","[""1945-12-03""]","""R564543""","""1973-12-03""","""Alice Ormond C…","""With bated bre…",,[],[],"""R564543. With …","""Chita Ormond C…",,"""With bated bre…","""The wonderworl…"
"""26DAA602-6E16-…","""A""",[],"""Storm drift. […","[""Savage, Ethel Mary""]","[""E. M. Savage, St. Mary's, Sleepers Hill""]",,"""1930""",,,,,,,"""a98046c5-5a7d-…","[""1949-10-19""]","""R674241""","""1977-10-11""","""G. Paul Butler…","""Best sermons. …",,[],[],"""R674241. Best …","""G. Paul Butler…",,"""Best sermons. …","""Storm drift. […"


In [408]:
# Execute the lazy pipeline and save the candidate pairs as a parquet file.
# NOTE(review): the destination is an absolute path on one specific machine;
# consider deriving it from a configurable base directory.
llm_input_path = "/Users/baber/PycharmProjects/cce-python/llm/test_matching/renewals_unmatched_for_llm.parquet"
candidate_pairs = final_df.collect()
candidate_pairs.write_parquet(llm_input_path)

Get the matched UUIDs. We have matched 518 renewals and 185 registration UUIDs. There are a lot fewer registrations because most of them were "child" entries with a reg number but no uuid.
Matched 517 unique registration numbers.

In [412]:
import json

# Read the LLM match results: a newline-delimited JSON file with one object per
# line, parsed into the list `matches`.
# The encoding is given explicitly so that parsing does not depend on the
# platform's default locale (the catalog text contains non-ASCII characters).
with open(
    "/Users/baber/PycharmProjects/cce-python/output/FINAL-registration_matches_from_llm.ndjson",
    encoding="utf-8",
) as f:
    # Skip blank lines (e.g. a trailing newline at the end of the file), which
    # json.loads would otherwise reject.
    matches = [json.loads(entry) for entry in f if entry.strip()]

In [425]:
# Attach the full candidate rows to the LLM match records by inner-joining on
# the (registration uuid, renewal uuid) pair. join_nulls=True lets null keys
# match each other. After collecting, keep one row per registration number.
llm_matches = pl.from_dicts(matches).lazy()
confirmed_pairs = llm_matches.join(
    final_df,
    left_on=["reg_uuid", "ren_uuid"],
    right_on=["uuid", "uuid_ren"],
    how="inner",
    join_nulls=True,
)
confirmed_pairs.collect().unique("regnums")

reg_uuid,ren_uuid,regnums,reg_dates,title,authors,publishers,disposition,year,group_title,group_uuid,notes,parent,new_matter_claimed,previous_publications,reg_date,renewal_id,renewal_date,author,title_right,new_matter,see_also_renewal,see_also_registration,full_text,claimants,notes_right,prompt_ren,prompt_reg
str,str,str,list[str],str,list[str],list[str],str,str,str,str,list[str],str,list[str],list[str],list[str],str,str,str,str,str,list[null],list[null],str,str,str,str,str
"""28D8EBAE-7359-…","""6da685b8-be80-…","""A151239""","[""1941-03-04""]","""Pennsylvania G…","[""Pennsylvania German folklore soc""]",,"""Not renewed.""","""1941""",,,,,,,"[""1951-03-04""]","""R433307""","""1968-04-09""","""ZIEGLER, SAMUE…","""The Pennsylvan…",,[],[],"""ZIEGLER, SAMUE…","""Pennsylvania G…",,"""The Pennsylvan…","""Pennsylvania G…"
"""C7219039-7893-…","""d8670be3-dfa7-…","""A30335""","[""1948-11-21""]","""Modem wonders …","[""LEYSON, BURR WATKINS.""]","[""E. P. Dutton & Co., inc. New York""]","""Not renewed.""","""1949""",,,,,,,"[""1949-02-21""]","""R657956""","""1977-02-14""","""Burr W. Leyson…","""Modern wonders…",,[],[],"""R657956. Moder…","""E. P. Dutton|P…",,"""Modern wonders…","""Modem wonders …"
"""DB455811-728D-…","""468a7bb0-5e31-…","""A215427""","[""1955-12-01""]","""Pt.6-10.""",,,"""Not renewed.""","""1955""",,,,,,,"[""1952-06-06""]","""RE74054""",,,"""Immortal poems…",,[],[],,"""Gulf & Western…",,"""Immortal poems…","""Pt.6-10. by UN…"
"""18E952B8-6DC2-…","""35aa5636-148d-…","""A614541""","[""1962-02-20""]","""Western campsi…","[""SUNSET."", ""the editorial staff of Sunset books."", ""SUNSET.""]","[""Lane Book Co."", ""Lane Book Co.""]","""Not renewed.""","""1963""",,"""5066e611778449…",,,,,"[""1963-02-20""]","""RE542034""",,,"""Western campsi…",,[],[],,"""Sunset Publish…",,"""Western campsi…","""Western campsi…"
,"""ed9f857c-a0d3-…","""A106774""","[""1953-01-24"", ""1"", … ""7""]","""SUPERMAN.""","[""Wayne Boring.""]","[""National Comics Publications, inc.""]","""Not renewed.""",,,,,"""CD1CC570-6F17-…",,,"[""1951-01-24""]","""RE33061""",,,"""Superman. Rel…",,[],[],,"""Comics, Inc.|P…",,"""Superman. Rel…","""SUPERMAN. by W…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""BA260FBC-6C64-…","""c8178def-877b-…","""A69503""","[""1934-01-25""]","""Modern America…","[""Cal. R. Fisk""]","[""Modern American corporation"", ""Modern Amer. corporation.""]","""Not renewed.""","""1934""",,,"[""Paged continuously. Based on the New student’s reference work. Vol. 8 contains Lesson outlines and classified questions in nature-work, geography, history, biography, literature, the arts and sciences, industries and inventions, with page references.""]",,,,"[""1952-08-13""]","""RE57093""",,,"""The New Funk &…",,[],[],,"""Unicorn Press,…",,"""The New Funk &…","""Modern America…"
,"""db282bd1-3c8d-…","""A106740""","[""1953-05-17"", ""1"", … ""7""]","""SUPERMAN.""","[""Wayne Boring.""]","[""National Comics Publications, inc.""]","""Not renewed.""",,,,,"""CD1CC570-6F17-…",,,"[""1950-05-17""]","""RE5605""",,,"""Superman. Rel…",,[],[],,"""Comics, Inc.|P…",,"""Superman. Rel…","""SUPERMAN. by W…"
,"""4aa9b3f1-bfd2-…","""A266812""","[""1957-07-03"", ""1"", … ""9""]","""Donald Duck.""","[""DISNEY (WALT) PRODUCTIONS.""]","[""Walt Disney Productions.""]","""Not renewed.""",,,,"[""16-21Jul56""]","""293EB567-72C4-…",,,"[""1956-07-03""]","""RE199736""",,,"""Donald Duck. …",,[],[],,"""Walt Disney Pr…",,"""Donald Duck. …","""Donald Duck. b…"
,"""33e5e526-acdc-…","""A121032""","[""1954-11-17"", ""1"", … ""0""]","""SUPERMAN.""","[""Wayne Boring"", ""Superman""]","[""National Comics Publications, inc. Release for week of""]","""Not renewed.""",,,,"[""14Dec53""]","""DF9D4835-6F5B-…",,,"[""1953-11-17""]","""RE100901""",,,"""Superman. Rel…",,[],[],,"""Comics, Inc.|P…",,"""Superman. Rel…","""SUPERMAN. by W…"
