# ACL PyPI Package

In [1]:
from acl_anthology import Anthology
# Create the Anthology handle that the lookup cells in this notebook query.
# NOTE(review): from_repo() presumably fetches the metadata from the official
# acl-org/acl-anthology repository (network access needed) — confirm in the acl_anthology docs.
anthology = Anthology.from_repo()

In [7]:
# Look up one paper by its Anthology ID; D10-1001 is an EMNLP 2010 paper
# (https://aclanthology.org/D10-1001/).
paper = anthology.get("D10-1001")

# Basic Dataset Exploration

### ACL Anthology 
- [GitHub](https://github.com/acl-org/acl-anthology/tree/master)

In [8]:
# Collect the Name object of every author, and the title as a plain string,
# then print both on separate lines.
authors = [person.name for person in paper.authors]
title = str(paper.title)
print(f'Title: {title}\nAuthors: {authors}')

Title: On Dual Decomposition and Linear Programming Relaxations for Natural Language Processing
Authors: [Name(first='Alexander M.', last='Rush'), Name(first='David', last='Sontag'), Name(first='Michael', last='Collins'), Name(first='Tommi', last='Jaakkola')]


In [9]:
# Display the paper's metadata as a citeproc-style dictionary
# (id, title, type, author, editor, publisher, issued, URL, page, container-title).
paper.citeproc_dict

{'id': 'rush-etal-2010-dual',
 'title': 'On Dual Decomposition and Linear Programming Relaxations for Natural Language Processing',
 'type': 'paper-conference',
 'author': [{'family': 'Rush', 'given': 'Alexander M.'},
  {'family': 'Sontag', 'given': 'David'},
  {'family': 'Collins', 'given': 'Michael'},
  {'family': 'Jaakkola', 'given': 'Tommi'}],
 'editor': [{'family': 'Li', 'given': 'Hang'},
  {'family': 'Màrquez', 'given': 'Lluís'}],
 'publisher': 'Association for Computational Linguistics',
 'publisher-place': 'Cambridge, MA',
 'issued': {'date-parts': [['2010']]},
 'URL': 'https://aclanthology.org/D10-1001/',
 'page': '1–11',
 'container-title': 'Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing'}

### ACL Anthology Corpus 
- [GitHub](https://github.com/shauryr/ACL-anthology-corpus), [HuggingFace](https://huggingface.co/datasets/WINGNUS/ACL-OCL)
- `AAC_path`: path to the 489 MB DataFrame (parquet) version from the GitHub repository

In [None]:
import pandas as pd
import pyarrow
# Parquet file holding the extracted metadata and the full text of every
# paper in the collection (about 489M on disk).
AAC_path = "data/acl-publication-info.74k.parquet"

# Read the file with the pyarrow engine and preview the first five rows.
df = pd.read_parquet(AAC_path, engine="pyarrow")
df.head(n=5)

Basic information about the dataset: shape, column dtypes, summary statistics, missing values, and duplicate rows.

In [None]:
# Data shape
print(f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns.")
print('\n------------------\n')

# Column names + data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() would append a stray "None" line.
df.info()
print('\n------------------\n')

# Descriptive statistics for numeric columns
print(df.describe())
print('\n------------------\n')

# Check for missing values (count of nulls in each column)
missing_values = df.isnull().sum()
# sep="\n" keeps the label on its own line so the per-column counts stay aligned.
print("Missing values per column:", missing_values, sep="\n")
print('\n------------------\n')

# Check for duplicate entries (rows identical to an earlier row in every column)
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")
print('\n------------------\n')


Line Plot: Year vs Publication Counts

In [4]:
import matplotlib.pyplot as plt

# Count the papers per year and order the counts by the year value.
yearly_publications = df['year'].value_counts().sort_index()

# Use explicit figure/axes handles rather than the implicit pyplot state
# machine, so every call below names the axes it draws on.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(yearly_publications.index, yearly_publications.values, marker='o')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Publications')
ax.set_title('Publication Trend Over Time')
# Rotate the year labels so neighbouring ticks do not overlap.
ax.tick_params(axis='x', labelrotation=60)
ax.grid(True)
plt.show()

`full_text` column: this will probably be essential, since it is the text from which we would derive our citation network.

In [None]:
# Take the full text of the row labelled 0 as a working sample.
sample_full_text = df.loc[0, 'full_text']
# Normalise the sample: lowercase it, then trim leading/trailing whitespace.
# NOTE(review): assumes the value is a str; a missing full_text (None/NaN)
# would raise AttributeError here — confirm the row has text.
sample = sample_full_text.lower().strip()