Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issues when indexing AclAnthology #2084

Merged
merged 9 commits into from
Apr 22, 2023
7 changes: 7 additions & 0 deletions docs/acl-anthology.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ mkdir -p build/data

Generate cleaned YAML data:

1. Add the following lines to `bin/create_hugo_yaml.py`, before the `export_anthology` function:
```python
# Prevent yaml from creating aliases which can't be parsed by anserini
Dumper.ignore_aliases = lambda self, data: True
```

2. Execute the following script:
```bash
python bin/create_hugo_yaml.py
```
Expand Down
21 changes: 17 additions & 4 deletions src/main/java/io/anserini/collection/AclAnthology.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
import org.yaml.snakeyaml.LoaderOptions;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

Expand Down Expand Up @@ -53,11 +54,17 @@ public AclAnthology(Path path) {
this.path = Paths.get(path.toString(), "/papers"); // Path containing files to iterate
this.allowedFileSuffix = Set.of(".yaml");

ObjectMapper mapper = new ObjectMapper(new YAMLFactory());
LoaderOptions loaderOptions = new LoaderOptions();
loaderOptions.setCodePointLimit(10 * 1024 * 1024); // 10 MB
YAMLFactory yamlFactory = YAMLFactory.builder()
.loaderOptions(loaderOptions)
.build();

ObjectMapper mapper = new ObjectMapper(yamlFactory);
try {
this.volumes = mapper.readValue(new File(path.toString(), "/volumes.yaml"), JsonNode.class);
} catch (IOException e) {
LOG.error("Unable to open volumes.yaml");
LOG.error(e);
return;
}
}
Expand Down Expand Up @@ -86,7 +93,13 @@ public Segment(Path path) throws IOException {

// read YAML file into JsonNode format
bufferedReader = new BufferedReader(new FileReader(path.toString()));
ObjectMapper mapper = new ObjectMapper(new YAMLFactory());
LoaderOptions loaderOptions = new LoaderOptions();
loaderOptions.setCodePointLimit(10 * 1024 * 1024); // 10 MB
YAMLFactory yamlFactory = YAMLFactory.builder()
.loaderOptions(loaderOptions)
.build();

ObjectMapper mapper = new ObjectMapper(yamlFactory);
MappingIterator<JsonNode> iterator = mapper.readerFor(JsonNode.class).readValues(bufferedReader);

if (iterator.hasNext()) {
Expand Down Expand Up @@ -155,7 +168,7 @@ public Document(Map.Entry<String, JsonNode> jsonEntry) {

// Process venue facets
venues = new ArrayList<>();
ArrayNode venuesNode = (ArrayNode) volume.get("venues");
ArrayNode venuesNode = (ArrayNode) paper.get("venue");
venuesNode.elements().forEachRemaining(node -> venues.add(node.asText()));

// Process SIG facets
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,13 @@ private enum AclAnthologyField {
AclAnthologyField.THUMBNAIL.name);

public static final List<String> NUMERIC_FIELD_NAMES = List.of(
AclAnthologyField.YEAR.name,
AclAnthologyField.PAGE_FIRST.name,
AclAnthologyField.PAGE_LAST.name);
AclAnthologyField.YEAR.name);

public static final List<String> FIELDS_WITHOUT_STEMMING = List.of(
AclAnthologyField.AUTHOR_STRING.name,
AclAnthologyField.PUBLISHER.name,
AclAnthologyField.PAGE_FIRST.name,
AclAnthologyField.PAGE_LAST.name,
AclAnthologyField.MONTH.name);

public AclAnthologyGenerator(IndexCollection.Args args) {
Expand Down
18 changes: 9 additions & 9 deletions src/test/java/io/anserini/collection/AclAnthologyTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,13 @@ public void setUp() throws Exception {
doc1.put("booktitle", "COLING 2000 Volume 1: The 18th International Conference on Computational Linguistics");
doc1.put("paper_id", "3");
doc1.put("parent_volume_id", "C00-1");
doc1.put("pdf", "https://www.aclweb.org/anthology/C00-1003.pdf");
doc1.put("thumbnail", "https://www.aclweb.org/anthology/thumb/C00-1003.jpg");
doc1.put("pdf", "https://aclanthology.org/C00-1003.pdf");
doc1.put("thumbnail", "https://aclanthology.org/thumb/C00-1003.jpg");
doc1.put("title", "Selectional Restrictions in HPSG");
doc1.put("url", "https://www.aclweb.org/anthology/C00-1003");
doc1.put("url", "https://aclanthology.org/C00-1003");
doc1.put("contents", "Selectional Restrictions in HPSG ");
doc1.put("sigs", "");
doc1.put("venues", "COLING");
doc1.put("venues", "coling");
expected.put("C00-1003", doc1);

HashMap<String, String> doc2 = new HashMap<>();
Expand All @@ -67,7 +67,7 @@ public void setUp() throws Exception {
doc2.put("title", "Exploiting a Probabilistic Hierarchical Model for Generation");
doc2.put("contents", "Exploiting a Probabilistic Hierarchical Model for Generation ");
doc2.put("sigs", "");
doc2.put("venues", "COLING");
doc2.put("venues", "coling");
expected.put("C00-1007", doc2);

HashMap<String, String> doc3 = new HashMap<>();
Expand All @@ -85,11 +85,11 @@ public void setUp() throws Exception {
doc3.put("page_last", "34");
doc3.put("paper_id", "3");
doc3.put("parent_volume_id", "E17-1");
doc3.put("pdf", "https://www.aclweb.org/anthology/E17-1003.pdf");
doc3.put("pdf", "https://aclanthology.org/E17-1003.pdf");
doc3.put("publisher", "Association for Computational Linguistics");
doc3.put("thumbnail", "https://www.aclweb.org/anthology/thumb/E17-1003.jpg");
doc3.put("thumbnail", "https://aclanthology.org/thumb/E17-1003.jpg");
doc3.put("title", "Exploring Different Dimensions of Attention for Uncertainty Detection");
doc3.put("url", "https://www.aclweb.org/anthology/E17-1003");
doc3.put("url", "https://aclanthology.org/E17-1003");
doc3.put("contents", "Exploring Different Dimensions of Attention for Uncertainty Detection " +
"Neural networks with attention have proven effective for many natural " +
"language processing tasks. In this paper, we develop attention mechanisms for " +
Expand All @@ -102,7 +102,7 @@ public void setUp() throws Exception {
"perform similar to the state-of-the-art model on a biomedical benchmark which " +
"uses a large set of linguistic features.");
doc3.put("sigs", "");
doc3.put("venues", "EACL");
doc3.put("venues", "eacl");
expected.put("E17-1003", doc3);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,12 @@ protected void setCheckIndexGroundTruth() {
"raw",
"Exploiting a Probabilistic Hierarchical Model for Generation "));

fieldNormStatusTotalFields = 13;
termIndexStatusTermCount = 241;
termIndexStatusTotFreq = 288;
fieldNormStatusTotalFields = 21;
termIndexStatusTermCount = 330;
termIndexStatusTotFreq = 411;
storedFieldStatusTotalDocCounts = 3;
termIndexStatusTotPos = 339;
storedFieldStatusTotFields = 67;
termIndexStatusTotPos = 470;
storedFieldStatusTotFields = 83;
}

@Override
Expand Down
30 changes: 24 additions & 6 deletions src/test/resources/sample_docs/acl/papers/segment1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,22 @@ C00-1003:
Linguistics'
booktitle_html: '<span class="acl-fixed-case">COLING</span> 2000 Volume 1: The 18th
International Conference on Computational Linguistics'
citation: '[Selectional Restrictions in HPSG](https://aclanthology.org/C00-1003)
(Androutsopoulos & Dale, COLING 2000)'
citation_acl: 'Ion Androutsopoulos and Robert Dale. 2000. <a href="https://aclanthology.org/C00-1003">Selectional
Restrictions in HPSG</a>. In <i>COLING 2000 Volume 1: The 18th International Conference
on Computational Linguistics</i>.'
events: []
language: null
paper_id: '3'
parent_volume_id: C00-1
pdf: https://www.aclweb.org/anthology/C00-1003.pdf
thumbnail: https://www.aclweb.org/anthology/thumb/C00-1003.jpg
pdf: https://aclanthology.org/C00-1003.pdf
thumbnail: https://aclanthology.org/thumb/C00-1003.jpg
title: Selectional Restrictions in HPSG
title_html: Selectional Restrictions in <span class="acl-fixed-case">HPSG</span>
url: https://www.aclweb.org/anthology/C00-1003
url: https://aclanthology.org/C00-1003
venue:
- coling
year: '2000'
C00-1007:
author:
Expand All @@ -40,11 +49,20 @@ C00-1007:
Linguistics'
booktitle_html: '<span class="acl-fixed-case">COLING</span> 2000 Volume 1: The 18th
International Conference on Computational Linguistics'
citation: '[Exploiting a Probabilistic Hierarchical Model for Generation](https://aclanthology.org/C00-1007)
(Bangalore & Rambow, COLING 2000)'
citation_acl: 'Srinivas Bangalore and Owen Rambow. 2000. <a href="https://aclanthology.org/C00-1007">Exploiting
a Probabilistic Hierarchical Model for Generation</a>. In <i>COLING 2000 Volume
1: The 18th International Conference on Computational Linguistics</i>.'
events: []
language: null
paper_id: '7'
parent_volume_id: C00-1
pdf: https://www.aclweb.org/anthology/C00-1007.pdf
thumbnail: https://www.aclweb.org/anthology/thumb/C00-1007.jpg
pdf: https://aclanthology.org/C00-1007.pdf
thumbnail: https://aclanthology.org/thumb/C00-1007.jpg
title: Exploiting a Probabilistic Hierarchical Model for Generation
title_html: Exploiting a Probabilistic Hierarchical Model for Generation
url: https://www.aclweb.org/anthology/C00-1007
url: https://aclanthology.org/C00-1007
venue:
- coling
year: '2000'
20 changes: 17 additions & 3 deletions src/test/resources/sample_docs/acl/papers/segment2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,30 @@ E17-1003:
for Computational Linguistics: Volume 1, Long Papers'
booktitle_html: 'Proceedings of the 15th Conference of the <span class="acl-fixed-case">E</span>uropean
Chapter of the Association for Computational Linguistics: Volume 1, Long Papers'
citation: "[Exploring Different Dimensions of Attention for Uncertainty Detection](https://aclanthology.org/E17-1003)
(Adel & Sch\xFCtze, EACL 2017)"
citation_acl: "Heike Adel and Hinrich Sch\xFCtze. 2017. <a href=\"https://aclanthology.org/E17-1003\">Exploring
Different Dimensions of Attention for Uncertainty Detection</a>. In <i>Proceedings
of the 15th Conference of the European Chapter of the Association for Computational
Linguistics: Volume 1, Long Papers</i>, pages 22\u201334, Valencia, Spain. Association
for Computational Linguistics."
events: []
language: null
month: April
page_first: '22'
page_last: '34'
pages: "22\u201334"
paper_id: '3'
parent_volume_id: E17-1
pdf: https://www.aclweb.org/anthology/E17-1003.pdf
pdf: https://aclanthology.org/E17-1003.pdf
publisher: Association for Computational Linguistics
thumbnail: https://www.aclweb.org/anthology/thumb/E17-1003.jpg
pwcdataset:
- name: SST
url: https://paperswithcode.com/dataset/sst
thumbnail: https://aclanthology.org/thumb/E17-1003.jpg
title: Exploring Different Dimensions of Attention for Uncertainty Detection
title_html: Exploring Different Dimensions of Attention for Uncertainty Detection
url: https://www.aclweb.org/anthology/E17-1003
url: https://aclanthology.org/E17-1003
venue:
- eacl
year: '2017'
17 changes: 12 additions & 5 deletions src/test/resources/sample_docs/acl/volumes.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
C00-1:
events: []
has_abstracts: false
meta_date: '2000'
papers:
Expand All @@ -9,8 +10,11 @@ C00-1:
Linguistics'
title_html: '<span class="acl-fixed-case">COLING</span> 2000 Volume 1: The 18th
International Conference on Computational Linguistics'
url: https://aclanthology.org/C00-1
venue:
- coling
venues:
- COLING
- coling
year: '2000'
E17-1:
address: Valencia, Spain
Expand All @@ -27,19 +31,22 @@ E17-1:
full: Alexander Koller
id: alexander-koller
last: Koller
events: []
has_abstracts: true
meta_date: 2017/4
month: April
papers:
- E17-1003
pdf: https://www.aclweb.org/anthology/E17-1.pdf
- E17-1000
pdf: https://aclanthology.org/E17-1.pdf
publisher: Association for Computational Linguistics
sigs: []
title: 'Proceedings of the 15th Conference of the European Chapter of the Association
for Computational Linguistics: Volume 1, Long Papers'
title_html: 'Proceedings of the 15th Conference of the <span class="acl-fixed-case">E</span>uropean
Chapter of the Association for Computational Linguistics: Volume 1, Long Papers'
url: https://www.aclweb.org/anthology/E17-1
url: https://aclanthology.org/E17-1
venue:
- eacl
venues:
- EACL
- eacl
year: '2017'
Loading