Skip to content

Commit

Permalink
Fix URL parse for Cord19Generator (#1221)
Browse files Browse the repository at this point in the history
  • Loading branch information
edwinzhng committed May 25, 2020
1 parent 355a33b commit 1de3274
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 2 deletions.
4 changes: 4 additions & 0 deletions docs/experiments-cord19.md
Expand Up @@ -234,3 +234,7 @@ All versions of pre-built indexes:
| 2020-03-20 | Full-Text | 2.6G | [[Dropbox]](https://www.dropbox.com/s/w74nmpmvdgw7o00/lucene-index-covid-full-text-2020-03-20.tar.gz) | `30cae90b85fa8f1b53acaa62413756e3`
| 2020-03-20 | Paragraph | 2.9G | [[Dropbox]](https://www.dropbox.com/s/evnhj2ylo02m03f/lucene-index-covid-paragraph-2020-03-20.tar.gz) | `4c78e9ede690dbfac13e25e634c70ae4`

## Known Issues

2020-05-19
- URLs are missing for several articles due to a known issue with the CORD-19 dataset release.
Expand Up @@ -53,10 +53,11 @@ public enum CovidField {
JOURNAL("journal"),
PUBLISH_TIME("publish_time"),
YEAR("year"),
LICENSE("license"),
PMC_ID("pmcid"),
PUBMED_ID("pubmed_id"),
LICENSE("license"),
MICROSOFT_ID("mag_id"),
S2_ID("s2_id"),
WHO("who_covidence_id"),
URL("url");

Expand Down Expand Up @@ -145,12 +146,15 @@ public Document createDocument(Cord19BaseDocument covidDoc) throws GeneratorExce
covidDoc.record().get(CovidField.PUBMED_ID.name), Field.Store.YES));
doc.add(new StringField(CovidField.MICROSOFT_ID.name,
covidDoc.record().get(CovidField.MICROSOFT_ID.name), Field.Store.YES));
doc.add(new StringField(CovidField.S2_ID.name,
covidDoc.record().get(CovidField.S2_ID.name), Field.Store.YES));
doc.add(new StringField(CovidField.PUBLISH_TIME.name,
covidDoc.record().get(CovidField.PUBLISH_TIME.name), Field.Store.YES));
doc.add(new StringField(CovidField.LICENSE.name,
covidDoc.record().get(CovidField.LICENSE.name), Field.Store.YES));
// default to first URL in metadata
doc.add(new StringField(CovidField.URL.name,
covidDoc.record().get(CovidField.URL.name), Field.Store.YES));
covidDoc.record().get(CovidField.URL.name).split("; ")[0], Field.Store.YES));

if (covidDoc instanceof TrialstreamerCollection.Document) {
TrialstreamerCollection.Document tsDoc = (TrialstreamerCollection.Document) covidDoc;
Expand Down

0 comments on commit 1de3274

Please sign in to comment.