In [44]:
from acl_anthology import Anthology

anthology = Anthology.from_repo()  # pulls metadata from the official repo

title = "Utilizing Statistical Dialogue Act".lower()
# title = "Stylistic Variation in Multilingual Instructions".lower()

candidates = []
for paper in anthology.papers():
    if title in str(paper.title).lower():
        candidates.append((paper.id, str(paper.title), paper))

print(candidates)  # list of (ACL_ID, title)

[('16', 'Utilizing Statistical Dialogue Act Processing in Verbmobil', Paper(id='16', bibkey='reithinger-maier-1995-utilizing', title=<MarkupText 'Utilizing Statistical Dialogue Act Processing in <span class="acl-fixed-case">V</span>erbmobil'>, authors=[NameSpecification(name=Name(first='Norbert', last='Reithinger'), id=None, orcid=None, affiliation=None, variants=[]), NameSpecification(name=Name(first='Elisabeth', last='Maier'), id=None, orcid=None, affiliation=None, variants=[])], abstract=None))]


In [51]:
# Get by ACL id
p = anthology.get("W04-0842")
print(p.pdf.url)

https://aclanthology.org/W04-0842.pdf


In [4]:
title_to_paper = {str(p.title): p for p in anthology.papers()}

In [50]:
paper = title_to_paper["WSD system based on specialized Hidden Markov Model (upv-shmm-eaw)"]
print(paper.full_id)
print(dir(paper))

W04-0842
['__annotations__', '__attrs_attrs__', '__attrs_own_setattr__', '__attrs_props__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__match_args__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_check_id', 'abstract', 'address', 'attachments', 'authors', 'awards', 'bibkey', 'bibtype', 'citeproc_dict', 'collection_id', 'csltype', 'deletion', 'doi', 'editors', 'errata', 'from_frontmatter_xml', 'from_xml', 'full_id', 'full_id_tuple', 'get_editors', 'get_events', 'get_ingest_date', 'get_issue', 'get_journal_title', 'id', 'ingest_date', 'is_deleted', 'is_frontmatter', 'issue', 'journal', 'language', 'language_name', 'month', 'note', 'pages', 'paperswithcode', 'parent', 'pdf', 'publisher'

In [8]:
q = "system precision recall"
candidates = [title for title in title_to_paper if title.lower().startswith(q)]
for candidate in candidates:
    print(f"Title: {candidate}")


In [134]:
for title, paper in title_to_paper.items():
    authors = paper.authors
    for author in authors:
        if author.name.last.lower().startswith('cozman'):
            print(f"Author: {author.name.last}, {author.name.first}; Title: {title}")

Author: Cozman, Fabio; Title: DaMata: A Robot-Journalist Covering the Brazilian Amazon Deforestation
Author: Cozman, Fabio; Title: BLAB Reporter: Automated journalism covering the Blue Amazon
Author: Cozman, Fabio; Title: Imaginary Numbers! Evaluating Numerical Referring Expressions by Neural End-to-End Surface Realization Systems
Author: Cozman, Fabio; Title: A Simple Audio and Text Collection-Annotation Tool Targeted to Brazilian Indigenous Language Native Speakers


In [None]:
ORIGINAL: identifying english and hungarian light verb constructions:a contrastive approach, MATCH: identifying english and hungarian light verb constructions: a contrastive approach
original = "identifying english and hungarian light verb constructions:a contrastive approach"
actual = "identifying english and hungarian light verb constructions: a contrastive approach"

In [60]:
import difflib
matches = difflib.get_close_matches(
    "Generating Semantically Precise Scene Graphs from Textual Descriptionsfor Improved Image Retrieval",
    titles,
    n=3,
    cutoff=0.6,
)
print(matches)

['generating semantically precise scene graphs from textual descriptions for improved image retrieval']


In [6]:
print(candidates[0][1])
print(candidates[0][1].full_id)

Paper(id='6', bibkey='dymetman-etal-2012-optimization', title=<MarkupText 'Optimization and Sampling for <span class="acl-fixed-case">NLP</span> from a Unified Viewpoint'>, authors=[NameSpecification(name=Name(first='Marc', last='Dymetman'), id=None, orcid=None, affiliation=None, variants=[]), NameSpecification(name=Name(first='Guillaume', last='Bouchard'), id=None, orcid=None, affiliation=None, variants=[]), NameSpecification(name=Name(first='Simon', last='Carter'), id=None, orcid=None, affiliation=None, variants=[])], abstract=None)
W12-6106


In [35]:
paper.month

from datetime import datetime
dt = datetime.strptime(f"{paper.year} {paper.month}", "%Y %B")
print(f"{dt.year}{dt.month:02d}{dt.day:02d}")

19990201


In [12]:
import csv

# Find out if the set of citing_titles and target_titles are disjoint or if they have any intersection
citing_titles = set()
target_titles = set()
path_to_data = "../data/datasets/acl200_global/context_dataset.csv"
with open(path_to_data, "r") as f:
    reader = csv.DictReader(f)
    for row in reader:
        citing_titles.add(row["citing_title"].lower().strip())
        target_titles.add(row["target_title"].lower().strip())



In [30]:
print(f"Number of citing titles: {len(citing_titles)}")
print(f"Number of target titles: {len(target_titles)}")
intersection = citing_titles.intersection(target_titles)
union = citing_titles.union(target_titles)
print(f"Number of titles in union: {len(union)}")
print(f"Number of titles in intersection: {len(intersection)}")
print(f"Set difference citing titles that aren't also target titles: {len(citing_titles - target_titles)}")

Number of citing titles: 11786
Number of target titles: 5192
Number of titles in union: 13222
Number of titles in intersection: 3756
Set difference citing titles that aren't also target titles: 8030


In [36]:
train_titles = set()
path_to_data = "../data/datasets/acl200_global/context_dataset_train.csv"
with open(path_to_data, "r") as f:
    reader = csv.DictReader(f)
    for row in reader:
        train_titles.add(row["citing_title"].lower().strip())
        train_titles.add(row["target_title"].lower().strip())

eval_titles = set()
path_to_data = "../data/datasets/acl200_global/context_dataset_eval.csv"
with open(path_to_data, "r") as f:
    reader = csv.DictReader(f)
    for row in reader:
        eval_titles.add(row["citing_title"].lower().strip())
        eval_titles.add(row["target_title"].lower().strip())

train_eval_union = train_titles.union(eval_titles)
print(f"Number of titles in train/eval union: {len(train_eval_union)}")

Number of titles in train/eval union: 13222


In [39]:
paper.pdf.url

'https://aclanthology.org/Y99-1036.pdf'

In [40]:
paper.doi

'http://hdl.handle.net/2065/12135'

In [66]:
titles = [str(p.title).lower() for p in anthology.papers()]
print(len(titles))
print(len(set(titles)))

118474
117400


In [68]:
dupes = {}
for paper in anthology.papers():
    title = str(paper.title).lower()
    if title not in dupes:
        dupes[title] = paper
    else:
        print(f"Duplicate title found: {title} -> {dupes[title].full_id}, {paper.full_id}")

Duplicate title found: discussion -> 1957.earlymt-1.8, 1957.earlymt-1.16
Duplicate title found: discussion -> 1957.earlymt-1.8, 1957.earlymt-1.22
Duplicate title found: summation by chairman -> 1960.earlymt-nsmt.8, 1960.earlymt-nsmt.15
Duplicate title found: research in machine translation -> 1957.earlymt-1.0, 1960.earlymt-nsmt.18
Duplicate title found: summation by chairman -> 1960.earlymt-nsmt.8, 1960.earlymt-nsmt.21
Duplicate title found: introduction -> 1957.earlymt-1.1, 1960.earlymt-nsmt.22
Duplicate title found: summation by chairman -> 1960.earlymt-nsmt.8, 1960.earlymt-nsmt.25
Duplicate title found: introduction -> 1957.earlymt-1.1, 1960.earlymt-nsmt.30
Duplicate title found: introduction -> 1957.earlymt-1.1, 1960.earlymt-nsmt.36
Duplicate title found: introduction -> 1957.earlymt-1.1, 1960.earlymt-nsmt.42
Duplicate title found: introduction -> 1957.earlymt-1.1, 1960.earlymt-nsmt.47
Duplicate title found: introduction -> 1957.earlymt-1.1, 1960.earlymt-nsmt.53
Duplicate title fou

In [88]:
title = "A Computational Lexicography of Multi-Word Units: How Efficient Can It Be?".lower()
candidates = {}
for paper in anthology.papers():
    if title in str(paper.title).lower():
        candidates[str(paper.title)] = paper
for c in candidates:
    print(c)

In [84]:
difflib.get_close_matches("Bilingual Correspondence Recursive Autoencodersfor Statistical Machine Translation".lower(),
candidates.keys(), n=5, cutoff=0.8)

['Bilingual Correspondence Recursive Autoencoder for Statistical Machine Translation']