In [None]:
import difflib
import itertools
import re


# Read the earlier ACL matching log into memory; each list entry is one raw
# line with its trailing newline kept.
with open("../src/citeline/helpers/acl_20251219_113235.log", "r") as f:
    log_data = list(f)

# Captures the title text that sits between the fixed prefix and suffix of a
# "No close matches found" log message.
pattern = re.compile(
    r"No close matches found for title: (.*?) among fuzzy candidates"
)

In [16]:
# Load the later log run, one raw line per list entry.
with open("../src/citeline/helpers/acl_20251219_123521.log", "r") as f:
    log_data = list(f)

# Bucket the lines by log level. A line tagged ERROR is never also counted as a
# warning, and lines with neither tag are ignored.
error_lines = []
warning_lines = []
for line in log_data:
    if "- ERROR -" in line:
        error_lines.append(line)
        continue
    if "- WARNING -" in line:
        warning_lines.append(line)

print(f"Got {len(error_lines)} error lines")
print(f"Got {len(warning_lines)} warning")

Got 2 error lines


In [18]:
# Each "Multiple matches" warning names the title that was looked up and the
# candidate that was chosen for it; capture both as a (query, chosen) tuple.
pattern = re.compile(r"Multiple matches on '(.*?)'; choosing (.*?)$")
title_pairs = [m.groups() for line in warning_lines if (m := pattern.search(line))]

# Preview at most three pairs. Slicing, unlike indexing positions 0..2, does not
# raise IndexError when the log produced fewer than three such warnings.
for pair in title_pairs[:3]:
    print(pair)

('A Shallow Model of Backchannel Continuers in Spoken Dialogue', 'A model of back-channel acknowledgements in spoken dialogue')
('Going to the Roots of Dependency Parsing', 'Squibs: Going to the Roots of Dependency Parsing')
('DESIGN OF LMT: A PROLOG-BASED MACHINE TRANSLATION SYSTEMMichael C. McCord', 'Design of LMT: A Prolog-Based Machine Translation System')


In [28]:
# Reorder the (query, chosen) pairs in place from least to most similar,
# comparing the two titles case-insensitively.
title_pairs.sort(
    key=lambda pair: difflib.SequenceMatcher(a=pair[0].lower(), b=pair[1].lower()).ratio()
)

In [29]:
# Print every (query, chosen) pair together with its similarity ratio, in
# whatever order title_pairs currently has.
for query, chosen in title_pairs:
    # Case-insensitive SequenceMatcher ratio between the two titles.
    ratio = difflib.SequenceMatcher(None, query.lower(), chosen.lower()).ratio()
    print(f"Query: {query}\nChosen: {chosen}\nSimilarity ratio: {ratio:.4f}\n----")

Query: asherecgs.ut exas .edu
Chosen: Speech Data Base
Similarity ratio: 0.1579
----
Query: Grammar-based Corpus AnnotationStefanie DipperInstitut fiir maschinelle SprachverarbeitungUniversitat Stuttgart
Chosen: Grammar-Based Concept Alignment for Domain-Specific Machine Translation
Similarity ratio: 0.3077
----
Query: Inducing Temporal GraphsPhilip Bramsen Pawan Deshpande Yoong Keok Lee Regina BarzilayMIT CSAIL MIT CSAIL DSO National Laboratories MIT CSAIL
Chosen: Proceedings of TextGraphs-9: the workshop on Graph-based Methods for Natural Language Processing
Similarity ratio: 0.3390
----
Query: European Media Lab GmbHSchloss-Wolfsbrunnenweg 31cD-69118 Heidelberg, Germany
Chosen: Sequence Reducible Holdout Loss for Language Model Pretraining
Similarity ratio: 0.3453
----
Query: Cb or not Cb? Centering theory applied to NLGRodger KibbleInformation Technology Research InstituteUniversity of BrightonLewes RoadBrighton BN2 4GJ
Chosen: Proceedings of the First Workshop on Language-driven D

In [30]:
import string


# Deletion table built once at definition time rather than on every call; it
# maps each ASCII punctuation and whitespace character to None.
_NORMALIZE_DELETIONS = str.maketrans("", "", string.punctuation + string.whitespace)


def normalize(s: str) -> str:
    """Return ``s`` lowercased with ASCII punctuation and whitespace removed.

    Only the characters in ``string.punctuation`` and ``string.whitespace`` are
    dropped; any other character (for example a Unicode dash) is kept.

    Args:
        s: Text to normalize, e.g. a paper title.

    Returns:
        The normalized string; empty if ``s`` contains nothing but removed
        characters.
    """
    return s.translate(_NORMALIZE_DELETIONS).lower()


def front_half(s: str) -> str:
    """Return the leading half of ``s``.

    The split point is ``len(s) // 2``, so for odd lengths the middle
    character is not part of the result.
    """
    midpoint = len(s) // 2
    return s[:midpoint]


def back_half(s: str) -> str:
    """Return the trailing half of ``s``.

    The slice starts at ``len(s) // 2``, so for odd lengths the middle
    character is part of the result.
    """
    midpoint = len(s) // 2
    return s[midpoint:]

In [31]:
from acl_anthology import Anthology

# Build the Anthology object through its from_repo() constructor.
anthology = Anthology.from_repo()

# Index every paper under its normalized title. Papers whose titles normalize
# to the same key overwrite one another, so only the last one iterated is kept.
norm_titlemap = dict(
    (normalize(str(entry.title)), entry) for entry in anthology.papers()
)

In [32]:
# Example query title, with the run-together "fromComputer" kept as is.
title = "How Creative is Your Writing? A Linguistic Creativity Measure fromComputer Science and Cognitive Psychology Perspectives"
print(f"Processing title: {title} (normed to: {normalize(title)})")


# Fuzzy-match the normalized query against every normalized title in the index
# (iterating the dict yields its keys), keeping the five best at cutoff 0.45.
matches = difflib.get_close_matches(normalize(title), norm_titlemap, n=5, cutoff=0.45)
print("Fuzzy matches:")
for match in matches:
    print(f"Match: {norm_titlemap[match].title}")
print("----")

Processing title: How Creative is Your Writing? A Linguistic Creativity Measure fromComputer Science and Cognitive Psychology Perspectives (normed to: howcreativeisyourwritingalinguisticcreativitymeasurefromcomputerscienceandcognitivepsychologyperspectives)
Fuzzy matches:
Match: Austrian German – Linguistic, Normative and Political Perspectives
Match: Computational linguistics and mathematical logic from a computer science point of view
Match: Benchmarking Machine Reading Comprehension: A Psychological Perspective
Match: Predictive and Distinctive Linguistic Features in Schizophrenia-Bipolar Spectrum Disorders
Match: A Survey in Automatic Irony Processing: Linguistic, Cognitive, and Multi-X Perspectives
----


In [33]:
normed_title = normalize(title)
print(f"Normed title (len: {len(normed_title)}): {normed_title}")
normed_title_front = front_half(normed_title)
normed_title_back = back_half(normed_title)
print(f"Front: {normed_title_front}, Back: {normed_title_back}")

# A candidate key matches when either half of the query occurs inside it, or
# either half of the key occurs inside the query. Empty halves are skipped:
# "" is a substring of every string, so a title that normalizes to zero or one
# character would otherwise match (or be matched by) everything. The generator
# form also stops at the first hit instead of evaluating all four tests.
# Very short keys can still match through a one- or two-character half.
matches = [
    t
    for t in norm_titlemap
    if any(half and half in t for half in (normed_title_front, normed_title_back))
    or any(half and half in normed_title for half in (front_half(t), back_half(t)))
]
print(f"Substring matches: {matches}")

Normed title (len: 105): howcreativeisyourwritingalinguisticcreativitymeasurefromcomputerscienceandcognitivepsychologyperspectives
Front: howcreativeisyourwritingalinguisticcreativitymeasure, Back: fromcomputerscienceandcognitivepsychologyperspectives
Substring matches: ['esprit', 'atlas', 'lmt', 'gms', 'emis', 'sdl', 'liwp', 'ttc', 'passives', 'ucb', 'linguisticcorpussearch', 'tinlap3', 'linguisticdistances', 'howcreativeisyourwriting', 'mice']


In [37]:
# Rank the entries of `matches` by similarity to the full normalized title,
# keeping up to five at cutoff 0.3. `matches` is whatever the most recently run
# cell assigned — presumably the substring candidates; confirm execution order.
difflib.get_close_matches(normed_title, matches, n=5, cutoff=0.3)

['howcreativeisyourwriting']

In [None]:
# Normalize a second title and test whether it is an exact key in the index.
actual_title = "Analysis of statistical and morphological classes to generate weightedreordering hypotheses on a Statistical Machine Translation system"
norm_actual_title = normalize(actual_title)
print(f"Actual title normed: {norm_actual_title}")
# NOTE(review): the label says "matches", but the membership test is against
# norm_titlemap, not the `matches` list.
print(f"Normed title in matches: {norm_actual_title in norm_titlemap}")

In [None]:
# Fuzzy-match normed_title against every indexed key, keeping up to three hits
# at cutoff 0.6.
# NOTE(review): this uses normed_title, not norm_actual_title — confirm that is
# the title intended here.
difflib.get_close_matches(normed_title, norm_titlemap.keys(), n=3, cutoff=0.6)

In [None]:
# Sanity check of get_close_matches with an empty candidate list (expected to
# yield no matches rather than raise — confirm by running).
difflib.get_close_matches("foobar", [], n=3, cutoff=0.8)

In [None]:
normed_title = normalize(title)
front_norm, back_norm = front_half(normed_title), back_half(normed_title)

# Report every indexed key that shares a half with the query, in either
# direction. Empty halves are skipped because "" is a substring of every
# string: without the guard, a query or key that normalizes to zero or one
# character would match everything.
for key in norm_titlemap:
    if any(half and half in key for half in (front_norm, back_norm)) or any(
        half and half in normed_title for half in (front_half(key), back_half(key))
    ):
        print(f"Substring match found: {key} -> {norm_titlemap[key].title}")

In [None]:
import csv

# newline="" leaves line-ending handling to the csv module, as its
# documentation requires for file objects; without it, newlines embedded in
# quoted fields may be misread.
with open("../data/datasets/acl200_global/context_dataset.csv", "r", newline="") as f:
    reader = csv.DictReader(f)
    # One dict per data row, keyed by the header names.
    records = list(reader)

In [None]:
from pprint import pprint

# Report every record whose target or citing title begins with a stray "}".
for i, record in enumerate(records):
    if not any(record[field].startswith("}") for field in ("target_title", "citing_title")):
        continue
    print(f"Found malformed record {i}:")
    pprint(record)
    # The abstract is printed again on its own line after the pretty-printed dict.
    print(record["citing_abstract"])
    print("----")

In [None]:
# Exact-key membership test for this title after normalization.
normalize("Sentence Comparison using Robust Minimal Recursion Semantics and an Ontology") in norm_titlemap

In [None]:
# Fetch the indexed paper for the same title and show its title as a string.
# Raises KeyError if the normalized title is not a key of norm_titlemap.
str(norm_titlemap[normalize("Sentence Comparison using Robust Minimal Recursion Semantics and an Ontology")].title)

In [39]:
# Get paper by id
paper = anthology.get_paper("J09-4011")
# Show the Paper object itself, then just its title.
print(paper)
print(paper.title)

Paper(id='11', bibkey='blunsom-2009-book', title=<MarkupText 'Book Review: Learning Machine Translation by Cyril Goutte, Nicola Cancedda, Marc Dymetman, and <span class="acl-fixed-case">G</span>eorge Foster (editors)'>, authors=[NameSpecification(name=Name(first='Phil', last='Blunsom'), id=None, orcid=None, affiliation=None, variants=[])], abstract=None)
Book Review: Learning Machine Translation by Cyril Goutte, Nicola Cancedda, Marc Dymetman, and George Foster (editors)


In [45]:
# Peek at the abstracts of the first five papers. islice stops after five
# items, so the whole papers() iterable is never copied into a list just to
# take a slice of it.
for paper in itertools.islice(anthology.papers(), 5):
    print(paper.abstract)
    

None
None
None
None
None


In [46]:
# List the attributes available on a Paper object. `paper` is whatever an
# earlier cell last bound it to, so this depends on execution order.
dir(paper)

['__annotations__',
 '__attrs_attrs__',
 '__attrs_own_setattr__',
 '__attrs_props__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_id',
 'abstract',
 'address',
 'attachments',
 'authors',
 'awards',
 'bibkey',
 'bibtype',
 'citeproc_dict',
 'collection_id',
 'csltype',
 'deletion',
 'doi',
 'editors',
 'errata',
 'from_frontmatter_xml',
 'from_xml',
 'full_id',
 'full_id_tuple',
 'get_editors',
 'get_events',
 'get_ingest_date',
 'get_issue',
 'get_journal_title',
 'id',
 'ingest_date',
 'is_deleted',
 'is_frontmatter',
 'issue',
 'journal',
 'language',
 'language_name',
 'month',
 'note