From 5b10339b55ea04e097a960fd722e8553573daccf Mon Sep 17 00:00:00 2001 From: nisalanirmana Date: Mon, 22 Jun 2015 00:09:17 +0530 Subject: [PATCH 01/10] added HAdr and HGeo Extractors support --- .../java/org/apache/any23/vocab/VCard.java | 5 + .../html/microformats2/HAdrExtractor.java | 120 ++++++++++++++++++ .../microformats2/HAdrExtractorFactory.java | 57 +++++++++ .../html/microformats2/HGeoExtractor.java | 84 ++++++++++++ .../microformats2/HGeoExtractorFactory.java | 57 +++++++++ .../microformats2/annotations/Includes.java | 41 ++++++ .../annotations/package-info.java | 24 ++++ .../html/microformats2/package-info.java | 24 ++++ .../html/microformats2/example-mf2-h-adr.html | 27 ++++ .../html/microformats2/example-mf2-h-geo.html | 22 ++++ .../html/microformats2/HAdrExtractorTest.java | 37 ++++++ .../html/microformats2/HGeoExtractorTest.java | 47 +++++++ .../any23/vocab/RDFSchemaUtilsTest.java | 4 +- .../microformats2/h-adr/h-adr-test.html | 33 +++++ .../microformats2/h-geo/h-geo-test.html | 33 +++++ 15 files changed, 613 insertions(+), 2 deletions(-) create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractor.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorFactory.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractor.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorFactory.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/annotations/Includes.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/annotations/package-info.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/package-info.java create mode 100644 core/src/main/resources/org/apache/any23/extractor/html/microformats2/example-mf2-h-adr.html create mode 100644 
core/src/main/resources/org/apache/any23/extractor/html/microformats2/example-mf2-h-geo.html create mode 100644 core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java create mode 100644 core/src/test/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorTest.java create mode 100644 test-resources/src/test/resources/microformats2/h-adr/h-adr-test.html create mode 100644 test-resources/src/test/resources/microformats2/h-geo/h-geo-test.html diff --git a/api/src/main/java/org/apache/any23/vocab/VCard.java b/api/src/main/java/org/apache/any23/vocab/VCard.java index f43c5eb48..10d3c948c 100644 --- a/api/src/main/java/org/apache/any23/vocab/VCard.java +++ b/api/src/main/java/org/apache/any23/vocab/VCard.java @@ -58,6 +58,11 @@ public static VCard getInstance() { */ public final URI agent = createProperty("agent"); + /** + * The altitude of a geographic location. + */ + public final URI altitude = createProperty("altitude"); + /** * The birthday of a person. */ diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractor.java new file mode 100644 index 000000000..022bf4712 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractor.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.TagSoupExtractionResult; +import org.apache.any23.extractor.html.microformats2.annotations.Includes; +import org.apache.any23.vocab.VCard; +import org.openrdf.model.BNode; +import org.openrdf.model.vocabulary.RDF; +import org.w3c.dom.Node; +import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; +import org.apache.any23.extractor.html.HTMLDocument; + +/** + * Extractor for the h-adr + * microformat. + * + * @author Nisala Nirmana + */ +@Includes( extractors = HGeoExtractor.class ) +public class HAdrExtractor extends EntityBasedMicroformatExtractor { + + private static final VCard vVCARD = VCard.getInstance(); + + private static final String[] addressFields = { + "p-street-address", + "p-extended-address", + "p-locality", + "p-region", + "p-postal-code", + "p-country-name", + "p-geo" + }; + + protected String getBaseClassName() { + return "h-adr"; + } + + @Override + protected void resetExtractor() { + // Empty. 
+ } + + protected boolean extractEntity(Node node, ExtractionResult out) { + if (null == node) return false; + final HTMLDocument document = new HTMLDocument(node); + BNode adr = getBlankNodeFor(node); + out.writeTriple(adr, RDF.TYPE, vVCARD.Address); + final String extractorName = getDescription().getExtractorName(); + for (String field : addressFields) { + HTMLDocument.TextField[] values = document.getPluralTextField(field); + for (HTMLDocument.TextField val : values) { + if(!field.equals("p-geo")) { + conditionallyAddStringProperty( + val.source(), + adr, vVCARD.getProperty(field.replaceFirst("p-", "")), val.value() + ); + }else { + String[] composed = val.value().split(";"); + if (composed.length == 3){ + conditionallyAddStringProperty( + val.source(), + adr, vVCARD.latitude, composed[0] + ); + conditionallyAddStringProperty( + val.source(), + adr, vVCARD.longitude, composed[1] + ); + conditionallyAddStringProperty( + val.source(), + adr, vVCARD.altitude, composed[2] + ); + + }else if (composed.length == 2){ + conditionallyAddStringProperty( + val.source(), + adr, vVCARD.latitude, composed[0] + ); + conditionallyAddStringProperty( + val.source(), + adr, vVCARD.longitude, composed[1] + ); + }else { + //we discard if only length is 1 + } + + } + + } + } + + final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult(); + tser.addResourceRoot( document.getPathToLocalRoot(), adr, this.getClass() ); + + return true; + } + + @Override + public ExtractorDescription getDescription() { + return HAdrExtractorFactory.getDescriptionInstance(); + } + +} diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorFactory.java new file mode 100644 index 000000000..3b28fb54d --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorFactory.java @@ -0,0 +1,57 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import java.util.Arrays; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.PopularPrefixes; +import org.apache.any23.rdf.Prefixes; + +/** + * @author Nisala Nirmana + * + */ +public class HAdrExtractorFactory extends SimpleExtractorFactory implements + ExtractorFactory { + + public static final String NAME = "html-mf2-h-adr"; + + public static final Prefixes PREFIXES = PopularPrefixes.createSubset("rdf", "vcard"); + + private static final ExtractorDescription descriptionInstance = new HAdrExtractorFactory(); + + public HAdrExtractorFactory() { + super( + HAdrExtractorFactory.NAME, + HAdrExtractorFactory.PREFIXES, + Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), + "example-mf2-h-adr.html"); + } + + @Override + public HAdrExtractor createExtractor() { + return new HAdrExtractor(); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } +} diff --git 
a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractor.java new file mode 100644 index 000000000..4a1fbfd9a --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractor.java @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.TagSoupExtractionResult; +import org.apache.any23.vocab.VCard; +import org.openrdf.model.BNode; +import org.openrdf.model.vocabulary.RDF; +import org.w3c.dom.Node; +import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; +import org.apache.any23.extractor.html.HTMLDocument; +/** + * Extractor for the h-geo + * microformat. 
+ * + * @author Nisala Nirmana + */ +public class HGeoExtractor extends EntityBasedMicroformatExtractor { + + private static final VCard vVCARD = VCard.getInstance(); + + @Override + public ExtractorDescription getDescription() { + return HGeoExtractorFactory.getDescriptionInstance(); + } + + protected String getBaseClassName() { + return "h-geo"; + } + + @Override + protected void resetExtractor() { + // Empty. + } + + protected boolean extractEntity(Node node, ExtractionResult out) { + if (null == node) return false; + final HTMLDocument document = new HTMLDocument(node); + HTMLDocument.TextField latNode = document.getSingularTextField("p-latitude"); + HTMLDocument.TextField lonNode = document.getSingularTextField("p-longitude"); + HTMLDocument.TextField altNode = document.getSingularTextField("p-altitude"); + String lat = latNode.value(); + String lon = lonNode.value(); + String alt = altNode.value(); + BNode geo = getBlankNodeFor(node); + out.writeTriple(geo, RDF.TYPE, vVCARD.Location); + final String extractorName = getDescription().getExtractorName(); + conditionallyAddStringProperty( + latNode.source(), + geo, vVCARD.latitude , lat + ); + conditionallyAddStringProperty( + lonNode.source(), + geo, vVCARD.longitude, lon + ); + conditionallyAddStringProperty( + altNode.source(), + geo, vVCARD.altitude, alt + ); + + final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult(); + tser.addResourceRoot( document.getPathToLocalRoot(), geo, this.getClass() ); + + return true; + } + +} diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorFactory.java new file mode 100644 index 000000000..5b60b7dc2 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorFactory.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import java.util.Arrays; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.PopularPrefixes; +import org.apache.any23.rdf.Prefixes; + +/** + * @author Nisala Nirmana + * + */ +public class HGeoExtractorFactory extends SimpleExtractorFactory implements + ExtractorFactory { + + public static final String NAME = "html-mf2-h-geo"; + + public static final Prefixes PREFIXES = PopularPrefixes.createSubset("rdf", "vcard"); + + private static final ExtractorDescription descriptionInstance = new HGeoExtractorFactory(); + + public HGeoExtractorFactory() { + super( + HGeoExtractorFactory.NAME, + HGeoExtractorFactory.PREFIXES, + Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), + "example-mf2-h-geo.html"); + } + + @Override + public HGeoExtractor createExtractor() { + return new HGeoExtractor(); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } +} diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/annotations/Includes.java 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/annotations/Includes.java new file mode 100644 index 000000000..ff9d738cc --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/annotations/Includes.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2.annotations; + +import org.apache.any23.extractor.html.MicroformatExtractor; + +import java.lang.annotation.Documented; +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +/** + * This annotation can be used to decorate a {@link MicroformatExtractor} to + * declare which of the other Microformats it can nest. 
+ * + * @author Davide Palmisano ( dpalmisano@gmail.com ) + */ +@Documented +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface Includes { + + Class[] extractors(); + +} diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/annotations/package-info.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/annotations/package-info.java new file mode 100644 index 000000000..3311c9801 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/annotations/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This package contains the annotations needed to describe the + * single nesting relations among different Microformats. 
+ * + * @see org.apache.any23.extractor.html.MicroformatExtractor + */ +package org.apache.any23.extractor.html.microformats2.annotations; \ No newline at end of file diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/package-info.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/package-info.java new file mode 100644 index 000000000..b9613738d --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/package-info.java @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * + * All the various {@link org.apache.any23.extractor.Extractor} needed to distill RDF + * from Microformats in HTML pages are contained in this package. + * + */ +package org.apache.any23.extractor.html.microformats2; \ No newline at end of file diff --git a/core/src/main/resources/org/apache/any23/extractor/html/microformats2/example-mf2-h-adr.html b/core/src/main/resources/org/apache/any23/extractor/html/microformats2/example-mf2-h-adr.html new file mode 100644 index 000000000..d6f2c061b --- /dev/null +++ b/core/src/main/resources/org/apache/any23/extractor/html/microformats2/example-mf2-h-adr.html @@ -0,0 +1,27 @@ + + +
+ 349/B + Batagama,North + Jaela + Western + 11325 + SL + 349/B,Jaela + 51.526421;-0.081067;25 +
diff --git a/core/src/main/resources/org/apache/any23/extractor/html/microformats2/example-mf2-h-geo.html b/core/src/main/resources/org/apache/any23/extractor/html/microformats2/example-mf2-h-geo.html new file mode 100644 index 000000000..c8b2607eb --- /dev/null +++ b/core/src/main/resources/org/apache/any23/extractor/html/microformats2/example-mf2-h-geo.html @@ -0,0 +1,22 @@ + + +
+ Latitude
7.066622
+ Longitude
79.903048
+ Altitude
15
+
diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java new file mode 100644 index 000000000..0fb362562 --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.html.AbstractExtractorTestCase; +import org.junit.Test; +import org.openrdf.repository.RepositoryException; +import org.openrdf.rio.RDFHandlerException; + +public class HAdrExtractorTest extends AbstractExtractorTestCase { + protected ExtractorFactory getExtractorFactory() { + return new HAdrExtractorFactory(); + } + + @Test + public void testModelNotEmpty() throws RepositoryException , RDFHandlerException { + assertExtract("/microformats2/h-adr/h-adr-test.html"); + assertModelNotEmpty(); + assertStatementsSize(null, null, null, 10); + } +} diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorTest.java new file mode 100644 index 000000000..eba89de9d --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorTest.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.html.AbstractExtractorTestCase; +import org.apache.any23.extractor.html.microformats2.HGeoExtractorFactory; +import org.apache.any23.vocab.VCard; +import org.junit.Assert; +import org.junit.Test; +import org.openrdf.model.Resource; +import org.openrdf.model.vocabulary.RDF; +import org.openrdf.repository.RepositoryException; +import org.openrdf.rio.RDFHandlerException; + +import java.util.List; + + +public class HGeoExtractorTest extends AbstractExtractorTestCase { + + protected ExtractorFactory getExtractorFactory() { + return new HGeoExtractorFactory(); + } + + @Test + public void testModelNotEmpty() throws RepositoryException , RDFHandlerException { + assertExtract("/microformats2/h-geo/h-geo-test.html"); + assertModelNotEmpty(); + assertStatementsSize(null, null, null, 4); + } + +} diff --git a/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java b/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java index 39711912d..b4f8b7a5c 100644 --- a/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java +++ b/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java @@ -43,7 +43,7 @@ public class RDFSchemaUtilsTest { */ @Test public void testSerializeVocabulariesNTriples() { - serializeVocabularies(RDFFormat.NTRIPLES, 1918); + serializeVocabularies(RDFFormat.NTRIPLES, 1920); } /** @@ -53,7 +53,7 @@ public void testSerializeVocabulariesNTriples() { */ @Test public void testSerializeVocabulariesRDFXML() { - serializeVocabularies(RDFFormat.RDFXML, 4987); // Effective lines + separators. + serializeVocabularies(RDFFormat.RDFXML, 4992); // Effective lines + separators. 
} private void serializeVocabularies(RDFFormat format, int expectedLines) { diff --git a/test-resources/src/test/resources/microformats2/h-adr/h-adr-test.html b/test-resources/src/test/resources/microformats2/h-adr/h-adr-test.html new file mode 100644 index 000000000..b5c095a85 --- /dev/null +++ b/test-resources/src/test/resources/microformats2/h-adr/h-adr-test.html @@ -0,0 +1,33 @@ + + + + + + +
+ 349/B + Batagama,North + Jaela + Western + 11325 + SL + 51.526421;-0.081067;25 +
+ + + diff --git a/test-resources/src/test/resources/microformats2/h-geo/h-geo-test.html b/test-resources/src/test/resources/microformats2/h-geo/h-geo-test.html new file mode 100644 index 000000000..c0181fe26 --- /dev/null +++ b/test-resources/src/test/resources/microformats2/h-geo/h-geo-test.html @@ -0,0 +1,33 @@ + + + + + + + +

+ + 54.155278, + -2.249722 + 694 + +

+ + + + From ff816027510f731f3e3f6a3c410feb5c48ffd972 Mon Sep 17 00:00:00 2001 From: Nisala Nirmana Date: Sun, 28 Jun 2015 22:33:29 +0530 Subject: [PATCH 02/10] changes according to mentor michele feedback --- .../html/microformats2/HAdrExtractor.java | 69 +++++++++---------- .../html/microformats2/HGeoExtractor.java | 52 ++++++++------ .../microformats2/Microformats2Prefixes.java | 26 +++++++ .../html/microformats2/HAdrExtractorTest.java | 2 +- .../html/microformats2/HGeoExtractorTest.java | 2 +- .../microformats2/h-adr/h-adr-test.html | 21 +++--- .../microformats2/h-geo/h-geo-test.html | 8 +-- 7 files changed, 105 insertions(+), 75 deletions(-) create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractor.java index 022bf4712..d0d9257d3 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HAdrExtractor.java @@ -17,12 +17,14 @@ package org.apache.any23.extractor.html.microformats2; +import org.apache.any23.extractor.ExtractionException; import org.apache.any23.extractor.ExtractionResult; import org.apache.any23.extractor.ExtractorDescription; import org.apache.any23.extractor.TagSoupExtractionResult; import org.apache.any23.extractor.html.microformats2.annotations.Includes; import org.apache.any23.vocab.VCard; import org.openrdf.model.BNode; +import org.openrdf.model.Resource; import org.openrdf.model.vocabulary.RDF; import org.w3c.dom.Node; import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; @@ -40,17 +42,23 @@ public class HAdrExtractor extends EntityBasedMicroformatExtractor { private static final VCard vVCARD = VCard.getInstance(); private static final String[] addressFields = { - "p-street-address", - 
"p-extended-address", - "p-locality", - "p-region", - "p-postal-code", - "p-country-name", - "p-geo" + "street-address", + "extended-address", + "locality", + "region", + "postal-code", + "country-name", + "geo" + }; + + private static final String[] geoFields = { + "latitude", + "longitude", + "altitude" }; protected String getBaseClassName() { - return "h-adr"; + return Microformats2Prefixes.CLASS_PREFIX+"adr"; } @Override @@ -58,60 +66,45 @@ protected void resetExtractor() { // Empty. } - protected boolean extractEntity(Node node, ExtractionResult out) { + protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException { if (null == node) return false; final HTMLDocument document = new HTMLDocument(node); BNode adr = getBlankNodeFor(node); out.writeTriple(adr, RDF.TYPE, vVCARD.Address); final String extractorName = getDescription().getExtractorName(); for (String field : addressFields) { - HTMLDocument.TextField[] values = document.getPluralTextField(field); + HTMLDocument.TextField[] values = document.getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX+field); for (HTMLDocument.TextField val : values) { - if(!field.equals("p-geo")) { + if(!field.equals("geo")) { conditionallyAddStringProperty( val.source(), - adr, vVCARD.getProperty(field.replaceFirst("p-", "")), val.value() + adr, vVCARD.getProperty(field), val.value() ); }else { String[] composed = val.value().split(";"); - if (composed.length == 3){ - conditionallyAddStringProperty( - val.source(), - adr, vVCARD.latitude, composed[0] - ); + for(int counter=0;counterh-geo * microformat. 
@@ -36,13 +39,19 @@ public class HGeoExtractor extends EntityBasedMicroformatExtractor { private static final VCard vVCARD = VCard.getInstance(); + private static final String[] geoFields = { + "latitude", + "longitude", + "altitude" + }; + @Override public ExtractorDescription getDescription() { return HGeoExtractorFactory.getDescriptionInstance(); } protected String getBaseClassName() { - return "h-geo"; + return Microformats2Prefixes.CLASS_PREFIX+"geo"; } @Override @@ -53,31 +62,32 @@ protected void resetExtractor() { protected boolean extractEntity(Node node, ExtractionResult out) { if (null == node) return false; final HTMLDocument document = new HTMLDocument(node); - HTMLDocument.TextField latNode = document.getSingularTextField("p-latitude"); - HTMLDocument.TextField lonNode = document.getSingularTextField("p-longitude"); - HTMLDocument.TextField altNode = document.getSingularTextField("p-altitude"); - String lat = latNode.value(); - String lon = lonNode.value(); - String alt = altNode.value(); BNode geo = getBlankNodeFor(node); out.writeTriple(geo, RDF.TYPE, vVCARD.Location); final String extractorName = getDescription().getExtractorName(); - conditionallyAddStringProperty( - latNode.source(), - geo, vVCARD.latitude , lat - ); - conditionallyAddStringProperty( - lonNode.source(), - geo, vVCARD.longitude, lon - ); - conditionallyAddStringProperty( - altNode.source(), - geo, vVCARD.altitude, alt - ); - + ArrayList geoNodes = new ArrayList(); + for(String field : geoFields){ + geoNodes.add(document.getSingularTextField(Microformats2Prefixes.PROPERTY_PREFIX+field)); + } + if(geoNodes.get(0).source()==null){ + String[] composed = document.getSingularUrlField(Microformats2Prefixes.CLASS_PREFIX +"geo") + .value().split(";"); + for(int counter=0;counter getExtractorFactory() { public void testModelNotEmpty() throws RepositoryException , RDFHandlerException { assertExtract("/microformats2/h-adr/h-adr-test.html"); assertModelNotEmpty(); - assertStatementsSize(null, 
null, null, 10); + assertStatementsSize(null, null, null, 11); } } diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorTest.java index eba89de9d..0d29fdaa8 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HGeoExtractorTest.java @@ -41,7 +41,7 @@ protected ExtractorFactory getExtractorFactory() { public void testModelNotEmpty() throws RepositoryException , RDFHandlerException { assertExtract("/microformats2/h-geo/h-geo-test.html"); assertModelNotEmpty(); - assertStatementsSize(null, null, null, 4); + assertStatementsSize(null, null, null, 8); } } diff --git a/test-resources/src/test/resources/microformats2/h-adr/h-adr-test.html b/test-resources/src/test/resources/microformats2/h-adr/h-adr-test.html index b5c095a85..5438b908e 100644 --- a/test-resources/src/test/resources/microformats2/h-adr/h-adr-test.html +++ b/test-resources/src/test/resources/microformats2/h-adr/h-adr-test.html @@ -18,16 +18,17 @@ - -
- 349/B - Batagama,North - Jaela - Western - 11325 - SL - 51.526421;-0.081067;25 -
+ +
+ 349/B + Batagama,North + Jaela + Western + 11325 + SL + 51.526421;-0.081067;25 + Home +
diff --git a/test-resources/src/test/resources/microformats2/h-geo/h-geo-test.html b/test-resources/src/test/resources/microformats2/h-geo/h-geo-test.html index c0181fe26..38d906fd0 100644 --- a/test-resources/src/test/resources/microformats2/h-geo/h-geo-test.html +++ b/test-resources/src/test/resources/microformats2/h-geo/h-geo-test.html @@ -20,13 +20,13 @@ -

- + 54.155278, -2.249722 694 - -

+ + + 51.513458;-0.14812;50 From 1616c17cb6497bcdf7947ee1048027f1b6d83a9f Mon Sep 17 00:00:00 2001 From: Nisala Nirmana Date: Mon, 6 Jul 2015 01:50:42 +0530 Subject: [PATCH 03/10] added extractors HRecipe and HItem --- .../java/org/apache/any23/vocab/HItem.java | 30 +++ .../html/microformats2/HItemExtractor.java | 85 ++++++++ .../microformats2/HItemExtractorFactory.java | 40 ++++ .../html/microformats2/HRecipeExtractor.java | 189 ++++++++++++++++++ .../HRecipeExtractorFactory.java | 57 ++++++ .../microformats2/HItemExtractorTest.java | 38 ++++ .../microformats2/HRecipeExtractorTest.java | 39 ++++ .../microformats2/h-item/h-item-test.html | 27 +++ .../microformats2/h-recipe/h-recipe-test.html | 71 +++++++ 9 files changed, 576 insertions(+) create mode 100644 api/src/main/java/org/apache/any23/vocab/HItem.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractor.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HRecipeExtractor.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HRecipeExtractorFactory.java create mode 100644 core/src/test/java/org/apache/any23/extractor/html/microformats2/HItemExtractorTest.java create mode 100644 core/src/test/java/org/apache/any23/extractor/html/microformats2/HRecipeExtractorTest.java create mode 100644 test-resources/src/test/resources/microformats2/h-item/h-item-test.html create mode 100644 test-resources/src/test/resources/microformats2/h-recipe/h-recipe-test.html diff --git a/api/src/main/java/org/apache/any23/vocab/HItem.java b/api/src/main/java/org/apache/any23/vocab/HItem.java new file mode 100644 index 000000000..db54e65ae --- /dev/null +++ b/api/src/main/java/org/apache/any23/vocab/HItem.java @@ -0,0 +1,30 @@ +package org.apache.any23.vocab; + +import org.openrdf.model.URI; + +/** + * 
Vocabulary to map the h-item microformat. + * + * @author Nisala Nirmana + */ +public class HItem extends Vocabulary { + + public static final String NS = SINDICE.NS + "hitem/"; + + private static HItem instance; + + public static HItem getInstance() { + if(instance == null) { + instance = new HItem(); + } + return instance; + } + + public URI Item = createClass(NS, "Item"); + public URI name = createProperty(NS, "name"); + public URI url = createProperty(NS, "url"); + public URI photo = createProperty(NS, "photo"); + private HItem() { + super(NS); + } +} diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractor.java new file mode 100644 index 000000000..19ed75724 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractor.java @@ -0,0 +1,85 @@ +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.TagSoupExtractionResult; +import org.apache.any23.vocab.HItem; +import org.openrdf.model.BNode; +import org.openrdf.model.URI; +import org.openrdf.model.vocabulary.RDF; +import org.w3c.dom.Node; +import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; +import org.apache.any23.extractor.html.HTMLDocument; + +/** + * Extractor for the h-item + * microformat. 
+ * + * @author Nisala Nirmana + */ +public class HItemExtractor extends EntityBasedMicroformatExtractor { + + private static final HItem vHITEM = HItem.getInstance(); + + private static final String[] itemFields = { + "name", + "url", + "photo" + }; + + @Override + public ExtractorDescription getDescription() { + return HItemExtractorFactory.getDescriptionInstance(); + } + + protected String getBaseClassName() { + return Microformats2Prefixes.CLASS_PREFIX+"item"; + } + + @Override + protected void resetExtractor() { + // Empty. + } + + protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException{ + if (null == node) return false; + final HTMLDocument document = new HTMLDocument(node); + BNode item = getBlankNodeFor(node); + out.writeTriple(item, RDF.TYPE, vHITEM.Item); + final String extractorName = getDescription().getExtractorName(); + addName(document,item); + addPhotos(document,item); + addUrls(document,item); + final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult(); + tser.addResourceRoot(document.getPathToLocalRoot(), item, this.getClass()); + return true; + } + + private void mapFieldWithProperty(HTMLDocument fragment, BNode item, String fieldClass, URI property) { + HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass); + conditionallyAddStringProperty( + title.source(),item, property, title.value() + ); + } + + private void addName(HTMLDocument fragment, BNode item) { + mapFieldWithProperty(fragment, item, Microformats2Prefixes.PROPERTY_PREFIX+itemFields[0], vHITEM.name); + } + + private void addPhotos(HTMLDocument fragment, BNode item) throws ExtractionException { + final HTMLDocument.TextField[] photos = fragment.getPluralUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX+itemFields[2]); + for(HTMLDocument.TextField photo : photos) { + addURIProperty(item, vHITEM.photo, fragment.resolveURI(photo.value())); + } + } + + private void addUrls(HTMLDocument fragment, 
BNode item) throws ExtractionException { + HTMLDocument.TextField[] links = fragment.getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX+ + itemFields[1]); + for (HTMLDocument.TextField link : links) { + conditionallyAddResourceProperty(item, vHITEM.url, getHTMLDocument().resolveURI(link.value())); + } + } +} diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java new file mode 100644 index 000000000..842368675 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java @@ -0,0 +1,40 @@ +package org.apache.any23.extractor.html.microformats2; + +import java.util.Arrays; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.PopularPrefixes; +import org.apache.any23.rdf.Prefixes; + +/** + * @author Nisala Nirmana + * + */ +public class HItemExtractorFactory extends SimpleExtractorFactory implements + ExtractorFactory { + + public static final String NAME = "html-mf2-h-item"; + + public static final Prefixes PREFIXES = PopularPrefixes.createSubset("rdf", "vcard"); + + private static final ExtractorDescription descriptionInstance = new HItemExtractorFactory(); + + public HItemExtractorFactory() { + super( + HItemExtractorFactory.NAME, + HItemExtractorFactory.PREFIXES, + Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), + "example-mf2-h-item.html"); + } + + @Override + public HItemExtractor createExtractor() { + return new HItemExtractor(); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } +} \ No newline at end of file diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HRecipeExtractor.java 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HRecipeExtractor.java new file mode 100644 index 000000000..d4bf12eed --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HRecipeExtractor.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.vocab.HRecipe; +import org.openrdf.model.BNode; +import org.openrdf.model.URI; +import org.openrdf.model.vocabulary.RDF; +import org.w3c.dom.Node; +import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; +import org.apache.any23.extractor.html.HTMLDocument; + +/** + * Extractor for the hRecipe + * microformat. 
+ * + * @author Nisala Nirmana + */ +public class HRecipeExtractor extends EntityBasedMicroformatExtractor { + + private static final HRecipe vHRECIPE = HRecipe.getInstance(); + + private static final String[] recipeFields = { + "name", + "ingredient", + "yield", + "instructions", + "duration", + "photo", + "summary", + "author", + "published", + "nutrition" + }; + + @Override + public ExtractorDescription getDescription() { + return HRecipeExtractorFactory.getDescriptionInstance(); + } + + @Override + protected String getBaseClassName() { + return Microformats2Prefixes.CLASS_PREFIX+"recipe"; + } + + @Override + protected void resetExtractor() { + // Empty. + } + + @Override + protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException { + final BNode recipe = getBlankNodeFor(node); + conditionallyAddResourceProperty(recipe, RDF.TYPE, vHRECIPE.Recipe); + final HTMLDocument fragment = new HTMLDocument(node); + addName(fragment, recipe); + addIngredients(fragment, recipe); + addYield(fragment, recipe); + addInstructions(fragment, recipe); + addDurations(fragment, recipe); + addPhoto(fragment, recipe); + addSummary(fragment, recipe); + addAuthors(fragment, recipe); + addPublished(fragment, recipe); + addNutritions(fragment, recipe); + return true; + } + + private void mapFieldWithProperty(HTMLDocument fragment, BNode recipe, String fieldClass, URI property) { + HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass); + conditionallyAddStringProperty( + title.source(), recipe, property, title.value() + ); + } + + private void addName(HTMLDocument fragment, BNode recipe) { + mapFieldWithProperty(fragment, recipe, Microformats2Prefixes.PROPERTY_PREFIX + recipeFields[0], vHRECIPE.fn); + } + + private void addIngredients(HTMLDocument fragment, BNode recipe) { + final HTMLDocument.TextField[] ingredients = fragment.getPluralTextField + (Microformats2Prefixes.PROPERTY_PREFIX+recipeFields[1]); + for(HTMLDocument.TextField 
ingredient : ingredients) { + conditionallyAddStringProperty( + ingredient.source(), recipe, vHRECIPE.ingredient, ingredient.value() + ); + } + } + + private void addInstructions(HTMLDocument fragment, BNode recipe) { + mapFieldWithProperty(fragment, recipe, Microformats2Prefixes.EMBEDDED_PROPERTY_PREFIX+recipeFields[3] /* "instructions": h-recipe marks it up as an embedded (e-) property */, + vHRECIPE.instructions); + } + + private void addYield(HTMLDocument fragment, BNode recipe) { + mapFieldWithProperty(fragment, recipe, Microformats2Prefixes.PROPERTY_PREFIX+recipeFields[2] /* "yield": plain-text (p-) property */, vHRECIPE.yield); + } + + private void addDurations(HTMLDocument fragment, BNode recipe) { + final HTMLDocument.TextField[] durations = fragment.getPluralTextField( + Microformats2Prefixes.TIME_PROPERTY_PREFIX + recipeFields[4]); + for(HTMLDocument.TextField duration : durations) { + Node attribute=duration.source().getAttributes().getNamedItem("datetime"); + if (attribute==null){ + conditionallyAddStringProperty( + duration.source(), + recipe, vHRECIPE.duration, duration.value() + ); + }else{ + conditionallyAddStringProperty( + duration.source(), + recipe, vHRECIPE.duration, attribute.getNodeValue() + ); + + } + + } + } + + private void addPhoto(HTMLDocument fragment, BNode recipe) throws ExtractionException { + final HTMLDocument.TextField[] photos = fragment.getPluralUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX+recipeFields[5]); + for(HTMLDocument.TextField photo : photos) { + addURIProperty(recipe, vHRECIPE.photo, fragment.resolveURI(photo.value())); + } + } + + private void addSummary(HTMLDocument fragment, BNode recipe) { + mapFieldWithProperty(fragment, recipe, Microformats2Prefixes.PROPERTY_PREFIX+recipeFields[6], vHRECIPE.summary); + } + + private void addAuthors(HTMLDocument fragment, BNode recipe) { + final HTMLDocument.TextField[] authors = fragment. 
+ getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + recipeFields[7]); + for(HTMLDocument.TextField author : authors) { + conditionallyAddStringProperty( + author.source(), + recipe, vHRECIPE.author, author.value() + ); + } + } + + private void addPublished(HTMLDocument fragment, BNode recipe) { + final HTMLDocument.TextField[] durations = fragment.getPluralTextField( + Microformats2Prefixes.TIME_PROPERTY_PREFIX + recipeFields[8]); + for(HTMLDocument.TextField duration : durations) { + Node attribute=duration.source().getAttributes().getNamedItem("datetime"); + if (attribute==null){ + conditionallyAddStringProperty( + duration.source(), + recipe, vHRECIPE.published, duration.value() + ); + }else{ + conditionallyAddStringProperty( + duration.source(), + recipe, vHRECIPE.published, attribute.getNodeValue() + ); + } + } + } + + private void addNutritions(HTMLDocument fragment, BNode recipe) { + final HTMLDocument.TextField[] nutritions = fragment.getPluralTextField + (Microformats2Prefixes.PROPERTY_PREFIX+recipeFields[9]); + for(HTMLDocument.TextField nutrition : nutritions) { + conditionallyAddStringProperty( + nutrition.source(), recipe, vHRECIPE.nutrition, nutrition.value() + ); + } + } +} diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HRecipeExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HRecipeExtractorFactory.java new file mode 100644 index 000000000..2f61f514a --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HRecipeExtractorFactory.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import java.util.Arrays; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.PopularPrefixes; +import org.apache.any23.rdf.Prefixes; + +/** + * @author Nisala Nirmana + * + */ +public class HRecipeExtractorFactory extends SimpleExtractorFactory implements + ExtractorFactory { + + public static final String NAME = "html-mf2-h-recipe"; + + public static final Prefixes PREFIXES = PopularPrefixes.createSubset("rdf", "hrecipe"); + + private static final ExtractorDescription descriptionInstance = new HRecipeExtractorFactory(); + + public HRecipeExtractorFactory() { + super( + HRecipeExtractorFactory.NAME, + HRecipeExtractorFactory.PREFIXES, + Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), + "example-mf2-h-recipe.html"); + } + + @Override + public HRecipeExtractor createExtractor() { + return new HRecipeExtractor(); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } +} diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HItemExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HItemExtractorTest.java new file mode 100644 index 000000000..81638902e --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HItemExtractorTest.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.html.AbstractExtractorTestCase; +import org.junit.Test; +import org.openrdf.repository.RepositoryException; +import org.openrdf.rio.RDFHandlerException; + +public class HItemExtractorTest extends AbstractExtractorTestCase { + + protected ExtractorFactory getExtractorFactory() { + return new HItemExtractorFactory(); + } + + @Test + public void testModelNotEmpty() throws RepositoryException, RDFHandlerException { + assertExtract("/microformats2/h-item/h-item-test.html"); + assertModelNotEmpty(); + assertStatementsSize(null, null, null, 4); + } +} \ No newline at end of file diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HRecipeExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HRecipeExtractorTest.java new file mode 100644 index 000000000..883a630ad --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HRecipeExtractorTest.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.html.AbstractExtractorTestCase; +import org.junit.Test; +import org.openrdf.repository.RepositoryException; +import org.openrdf.rio.RDFHandlerException; + +public class HRecipeExtractorTest extends AbstractExtractorTestCase { + + protected ExtractorFactory getExtractorFactory() { + return new HRecipeExtractorFactory(); + } + + @Test + public void testModelNotEmpty() throws RepositoryException, RDFHandlerException { + assertExtract("/microformats2/h-recipe/h-recipe-test.html"); + assertModelNotEmpty(); + assertStatementsSize(null, null, null, 15); + } + +} diff --git a/test-resources/src/test/resources/microformats2/h-item/h-item-test.html b/test-resources/src/test/resources/microformats2/h-item/h-item-test.html new file mode 100644 index 000000000..dc2b2c794 --- /dev/null +++ b/test-resources/src/test/resources/microformats2/h-item/h-item-test.html @@ -0,0 +1,27 @@ + + + + + +
+ food + + Online Supermarket +
+ + diff --git a/test-resources/src/test/resources/microformats2/h-recipe/h-recipe-test.html b/test-resources/src/test/resources/microformats2/h-recipe/h-recipe-test.html new file mode 100644 index 000000000..20ea47bbe --- /dev/null +++ b/test-resources/src/test/resources/microformats2/h-recipe/h-recipe-test.html @@ -0,0 +1,71 @@ + + + + http://microformats.org/wiki/hrecipe Example 1 + + +
+ Yorkshire Puddings + Makes6 good sized Yorkshire puddings,the way my mum taught me + + +
+

Ingredients

+
    +
  • 1 egg
  • +
  • 75g plain flour
  • +
  • 70ml milk
  • +
  • 60ml water
  • +
  • Pinch of salt
  • +
+
+ +

Time

+
    +
  • Cook25 mins
  • +
+ + +

Instructions

+
+
    +
  1. Pre-heat oven to 230C or gas mark 8. Pour the vegetable oil evenly into 2 x 4-hole + Yorkshire pudding tins and place in the oven to heat through.
  2. + +
  3. To make the batter, add all the flour into a bowl and beat in the eggs until smooth. + Gradually add the milk and water while beating the mixture. It should be smooth and + without lumps. Finally add a pinch of salt.
  4. + +
  5. Make sure the oil is piping hot before pouring the batter evenly into the tins. + Place in the oven for 20-25 minutes until pudding have risen and look golden brown
  6. +
+
+ +

Nutrition

+
    +
  • Calories: 125
  • +
  • Fat: 3.2g
  • +
  • Cholesterol: 77mg
  • +
+ + Published on by + ` Glenn Jones + +
+ + From cc0dfbe8127a00fa712c7d2df6785a73c290feae Mon Sep 17 00:00:00 2001 From: Nisala Date: Mon, 20 Jul 2015 01:12:27 +0530 Subject: [PATCH 04/10] adding HEvent and HProduct Extractors --- .../java/org/apache/any23/vocab/HEvent.java | 57 +++++ .../java/org/apache/any23/vocab/HProduct.java | 58 ++++++ .../html/microformats2/HEventExtractor.java | 195 ++++++++++++++++++ .../microformats2/HEventExtractorFactory.java | 57 +++++ .../html/microformats2/HProductExtractor.java | 153 ++++++++++++++ .../HProductExtractorFactory.java | 56 +++++ .../apache/any23/prefixes/prefixes.properties | 2 + .../microformats2/HEventExtractorTest.java | 37 ++++ .../microformats2/HProductExtractorTest.java | 37 ++++ .../microformats2/h-event/h-event-test.html | 36 ++++ .../h-product/h-product-test.html | 36 ++++ 11 files changed, 724 insertions(+) create mode 100644 api/src/main/java/org/apache/any23/vocab/HEvent.java create mode 100644 api/src/main/java/org/apache/any23/vocab/HProduct.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractorFactory.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HProductExtractor.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HProductExtractorFactory.java create mode 100644 core/src/test/java/org/apache/any23/extractor/html/microformats2/HEventExtractorTest.java create mode 100644 core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java create mode 100644 test-resources/src/test/resources/microformats2/h-event/h-event-test.html create mode 100644 test-resources/src/test/resources/microformats2/h-product/h-product-test.html diff --git a/api/src/main/java/org/apache/any23/vocab/HEvent.java b/api/src/main/java/org/apache/any23/vocab/HEvent.java new file mode 100644 index 000000000..b936c3eba 
--- /dev/null +++ b/api/src/main/java/org/apache/any23/vocab/HEvent.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.vocab; + +import org.openrdf.model.URI; + +/** + * Vocabulary to map the h-event microformat. 
+ * + * @author Nisala Nirmana + */ +public class HEvent extends Vocabulary { + public static final String NS = SINDICE.NS + "hevent/"; + + private static HEvent instance; + + public static HEvent getInstance() { + if(instance == null) { + instance = new HEvent(); + } + return instance; + } + + public URI event = createClass(NS, "Event"); + + + public URI name = createProperty(NS, "name"); + public URI summary = createProperty(NS, "summary"); + public URI start = createProperty(NS, "start"); + public URI end = createProperty(NS, "end"); + public URI duration = createProperty(NS, "duration"); + public URI description = createProperty(NS, "description"); + public URI url = createProperty(NS, "url"); + public URI category = createProperty(NS, "category"); + public URI location = createProperty(NS, "location"); + public URI attendee = createProperty(NS, "attendee"); + + + private HEvent() { + super(NS); + } +} diff --git a/api/src/main/java/org/apache/any23/vocab/HProduct.java b/api/src/main/java/org/apache/any23/vocab/HProduct.java new file mode 100644 index 000000000..9630db315 --- /dev/null +++ b/api/src/main/java/org/apache/any23/vocab/HProduct.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.vocab; + +import org.openrdf.model.URI; + +/** + * Vocabulary to map the h-product microformat. + * + * @author Nisala Nirmana + */ + +public class HProduct extends Vocabulary { + public static final String NS = SINDICE.NS + "hproduct/"; + + private static HProduct instance; + + public static HProduct getInstance() { + if(instance == null) { + instance = new HProduct(); + } + return instance; + } + + public URI product = createClass(NS, "Product"); + + + public URI name = createProperty(NS, "name"); + public URI photo = createProperty(NS, "photo"); + public URI brand = createProperty(NS, "brand"); + public URI category = createProperty(NS, "category"); + public URI description = createProperty(NS, "description"); + public URI url = createProperty(NS, "url"); + public URI identifier = createProperty(NS, "identifier"); + public URI price = createProperty(NS, "price"); + public URI review = createProperty(NS, "review"); + + + private HProduct() { + super(NS); + } + +} diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java new file mode 100644 index 000000000..8ce70a6e6 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.TagSoupExtractionResult; +import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; +import org.apache.any23.vocab.HEvent; +import org.openrdf.model.BNode; +import org.openrdf.model.URI; +import org.openrdf.model.vocabulary.RDF; +import org.w3c.dom.Node; +import org.apache.any23.extractor.html.HTMLDocument; + +import static org.apache.any23.extractor.html.HTMLDocument.TextField; + + +/** + * Extractor for the hCalendar + * microformat. + * + * @author Nisala Nirmana + */
+public class HEventExtractor extends EntityBasedMicroformatExtractor {
+
+    private static final HEvent vEvent = HEvent.getInstance();
+
+    /**
+     * Names of the h-event fields; the add* methods select entries by index.
+     * The table is never modified, so it is shared and constant
+     * ({@code static final}) rather than re-created for every extractor instance.
+     */
+    private static final String[] eventFields = {
+            "name",
+            "summary",
+            "start",
+            "end",
+            "duration",
+            "description",
+            "url",
+            "category",
+            "location", //toDO
+            "attendee" //toDO
+    };
+
+
+    @Override
+    public ExtractorDescription getDescription() {
+        return HEventExtractorFactory.getDescriptionInstance();
+    }
+
+    @Override
+    protected String getBaseClassName() {
+        return Microformats2Prefixes.CLASS_PREFIX+"event";
+    }
+
+    @Override
+    protected void resetExtractor() {
+        // Empty.
+    }
+
+    /**
+     * Types the blank node of {@code node} as an event and adds one statement
+     * group per supported field. No statement is emitted for "attendee"
+     * (index 9 of {@link #eventFields}).
+     *
+     * @param node root node of the event markup.
+     * @param out extraction result handed in by the caller; not used directly here.
+     * @return always {@code true}.
+     * @throws ExtractionException if a URL field cannot be added.
+     */
+    @Override
+    protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
+        final BNode event = getBlankNodeFor(node);
+        conditionallyAddResourceProperty(event, RDF.TYPE, vEvent.event);
+        final HTMLDocument fragment = new HTMLDocument(node);
+        addName(fragment, event);
+        addSummary(fragment, event);
+        addStart(fragment, event);
+        addEnd(fragment, event);
+        addDuration(fragment, event);
+        addDescription(fragment, event);
+        addURLs(fragment, event);
+        addCategories(fragment, event);
+        addLocation(fragment, event);
+
+        return true;
+    }
+
+    /** Adds the singular text field {@code fieldClass} to {@code event} as {@code property}. */
+    private void mapFieldWithProperty(HTMLDocument fragment, BNode event, String fieldClass,
+                                      URI property) {
+        HTMLDocument.TextField field = fragment.getSingularTextField(fieldClass);
+        conditionallyAddStringProperty(
+                field.source(), event, property, field.value()
+        );
+    }
+
+    /**
+     * Adds a date/time field to {@code event} as {@code property}. Nothing is
+     * added when the field has no source node. The value is the source node's
+     * {@code datetime} attribute when present, otherwise the field text.
+     */
+    private void addDateTimeProperty(HTMLDocument fragment, BNode event, String fieldName,
+                                     URI property) {
+        final TextField field = fragment.getSingularTextField(
+                Microformats2Prefixes.TIME_PROPERTY_PREFIX + fieldName);
+        final Node source = field.source();
+        if (source == null) {
+            return;
+        }
+        final Node datetime = source.getAttributes().getNamedItem("datetime");
+        final String value = datetime == null ? field.value() : datetime.getNodeValue();
+        conditionallyAddStringProperty(source, event, property, value);
+    }
+
+    private void addName(HTMLDocument fragment, BNode event) {
+        mapFieldWithProperty(fragment, event, Microformats2Prefixes.PROPERTY_PREFIX +
+                eventFields[0], vEvent.name);
+    }
+
+    private void addSummary(HTMLDocument fragment, BNode event) {
+        mapFieldWithProperty(fragment, event, Microformats2Prefixes.PROPERTY_PREFIX +
+                eventFields[1], vEvent.summary);
+    }
+
+    private void addStart(HTMLDocument fragment, BNode event) {
+        addDateTimeProperty(fragment, event, eventFields[2], vEvent.start);
+    }
+
+    private void addEnd(HTMLDocument fragment, BNode event) {
+        addDateTimeProperty(fragment, event, eventFields[3], vEvent.end);
+    }
+
+    private void addDuration(HTMLDocument fragment, BNode event) {
+        addDateTimeProperty(fragment, event, eventFields[4], vEvent.duration);
+    }
+
+    private void addDescription(HTMLDocument fragment, BNode event) {
+        mapFieldWithProperty(fragment, event, Microformats2Prefixes.PROPERTY_PREFIX +
+                eventFields[5], vEvent.description);
+    }
+
+    /** Adds every URL field value, resolved against the fragment, as a URI statement. */
+    private void addURLs(HTMLDocument fragment, BNode event) throws ExtractionException {
+        final HTMLDocument.TextField[] urls = fragment.getPluralUrlField
+                (Microformats2Prefixes.URL_PROPERTY_PREFIX + eventFields[6]);
+        for(HTMLDocument.TextField url : urls) {
+            addURIProperty(event, vEvent.url, fragment.resolveURI(url.value()));
+        }
+    }
+
+    /** Adds one string statement per category field found in the fragment. */
+    private void addCategories(HTMLDocument fragment, BNode event) {
+        final HTMLDocument.TextField[] categories = fragment.getPluralTextField
+                (Microformats2Prefixes.PROPERTY_PREFIX + eventFields[7]);
+        for(HTMLDocument.TextField category : categories) {
+            conditionallyAddStringProperty(
+                    category.source(), event, vEvent.category, category.value()
+            );
+        }
+    }
+
+    private void addLocation(HTMLDocument fragment, BNode event) {
+        mapFieldWithProperty(fragment, event, Microformats2Prefixes.PROPERTY_PREFIX +
+                eventFields[8], vEvent.location);
+    }
+
+}
diff --git 
a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractorFactory.java new file mode 100644 index 000000000..602b04430 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractorFactory.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.extractor.html.microformats2; + +import java.util.Arrays; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.PopularPrefixes; +import org.apache.any23.rdf.Prefixes; + +/** + * @author Peter Ansell p_ansell@yahoo.com + * + */
+public class HEventExtractorFactory extends SimpleExtractorFactory implements
+        ExtractorFactory {
+
+    /** Name passed to the superclass constructor for this extractor. */
+    public static final String NAME = "html-mf2-h-event";
+
+    /** Prefix subset ("rdf", "hevent") passed to the superclass constructor. */
+    public static final Prefixes PREFIXES = PopularPrefixes.createSubset("rdf", "hevent");
+
+    // Keep this below NAME and PREFIXES: static initializers run in textual
+    // order and the constructor invoked here reads PREFIXES.
+    private static final ExtractorDescription descriptionInstance = new HEventExtractorFactory();
+
+    /**
+     * Builds the factory with its name, prefixes, supported MIME types and
+     * example input file.
+     */
+    public HEventExtractorFactory() {
+        super(
+                NAME,
+                PREFIXES,
+                Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
+                "example-mf2-h-event.html");
+    }
+
+    /**
+     * @return the shared description instance created when this class is initialized.
+     */
+    public static ExtractorDescription getDescriptionInstance() {
+        return descriptionInstance;
+    }
+
+    /**
+     * @return a new {@link HEventExtractor}.
+     */
+    @Override
+    public HEventExtractor createExtractor() {
+        return new HEventExtractor();
+    }
+}
diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HProductExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HProductExtractor.java new file mode 100644 index 000000000..0e93935b2 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HProductExtractor.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; +import org.apache.any23.extractor.html.HTMLDocument; +import org.apache.any23.vocab.HProduct; +import org.openrdf.model.BNode; +import org.openrdf.model.URI; +import org.openrdf.model.vocabulary.RDF; +import org.w3c.dom.Node; + +/** + * Extractor for the h-product + * microformat. + * + * @author Nisala Nirmana + */
+public class HProductExtractor extends EntityBasedMicroformatExtractor {
+
+    private static final HProduct vProduct = HProduct.getInstance();
+
+    /**
+     * Names of the h-product fields; the add* methods select entries by index.
+     * "brand" (index 2) and "review" (index 7) are listed but no statement is
+     * emitted for them.
+     */
+    private static final String[] productFields = {
+            "name",
+            "photo",
+            "brand", //toDo
+            "category",
+            "description",
+            "url",
+            "identifier",
+            "review", //toDo
+            "price"
+    };
+
+    @Override
+    public ExtractorDescription getDescription() {
+        return HProductExtractorFactory.getDescriptionInstance();
+    }
+
+    @Override
+    protected String getBaseClassName() {
+        return Microformats2Prefixes.CLASS_PREFIX+"product";
+    }
+
+    @Override
+    protected void resetExtractor() {
+        // Empty.
+    }
+
+    /**
+     * Types the blank node of {@code node} as a product and adds the
+     * statements for each supported field.
+     *
+     * @param node root node of the product markup.
+     * @param out extraction result handed in by the caller; not used directly here.
+     * @return always {@code true}.
+     * @throws ExtractionException if a URL-valued field cannot be added.
+     */
+    @Override
+    protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
+        final BNode product = getBlankNodeFor(node);
+        conditionallyAddResourceProperty(product, RDF.TYPE, vProduct.product);
+        final HTMLDocument doc = new HTMLDocument(node);
+        addName(doc, product);
+        addPhoto(doc, product);
+        addCategories(doc, product);
+        addDescription(doc, product);
+        addURLs(doc, product);
+        addIdentifiers(doc, product);
+        addPrice(doc, product);
+        return true;
+    }
+
+    /** Adds the singular text field {@code fieldClass} to {@code product} as {@code property}. */
+    private void mapFieldWithProperty(HTMLDocument fragment, BNode product, String fieldClass,
+                                      URI property) {
+        final HTMLDocument.TextField field = fragment.getSingularTextField(fieldClass);
+        conditionallyAddStringProperty(field.source(), product, property, field.value());
+    }
+
+    /**
+     * Adds every value of the plural URL field named by {@code productFields[fieldIndex]},
+     * resolved against the fragment, to {@code product} as {@code property}.
+     */
+    private void addUrlField(HTMLDocument fragment, BNode product, int fieldIndex, URI property)
+            throws ExtractionException {
+        final HTMLDocument.TextField[] links = fragment.getPluralUrlField(
+                Microformats2Prefixes.URL_PROPERTY_PREFIX + productFields[fieldIndex]);
+        for (HTMLDocument.TextField link : links) {
+            addURIProperty(product, property, fragment.resolveURI(link.value()));
+        }
+    }
+
+    private void addName(HTMLDocument fragment, BNode product) {
+        final String fieldClass = Microformats2Prefixes.PROPERTY_PREFIX + productFields[0];
+        mapFieldWithProperty(fragment, product, fieldClass, vProduct.name);
+    }
+
+    private void addPhoto(HTMLDocument fragment, BNode product) throws ExtractionException {
+        addUrlField(fragment, product, 1, vProduct.photo);
+    }
+
+    /** Adds one string statement per category field found in the fragment. */
+    private void addCategories(HTMLDocument fragment, BNode product) {
+        final String fieldClass = Microformats2Prefixes.PROPERTY_PREFIX + productFields[3];
+        for (HTMLDocument.TextField entry : fragment.getPluralTextField(fieldClass)) {
+            conditionallyAddStringProperty(
+                    entry.source(), product, vProduct.category, entry.value());
+        }
+    }
+
+    private void addDescription(HTMLDocument fragment, BNode product) {
+        final String fieldClass =
+                Microformats2Prefixes.EMBEDDED_PROPERTY_PREFIX + productFields[4];
+        mapFieldWithProperty(fragment, product, fieldClass, vProduct.description);
+    }
+
+    private void addURLs(HTMLDocument fragment, BNode product) throws ExtractionException {
+        addUrlField(fragment, product, 5, vProduct.url);
+    }
+
+    private void addIdentifiers(HTMLDocument fragment, BNode product) throws ExtractionException {
+        addUrlField(fragment, product, 6, vProduct.identifier);
+    }
+
+    /**
+     * Adds the price. Nothing is added when the field has no source node. The
+     * value is the source node's {@code value} attribute when present,
+     * otherwise the field text.
+     */
+    private void addPrice(HTMLDocument fragment, BNode product) {
+        final HTMLDocument.TextField price = fragment.getSingularTextField(
+                Microformats2Prefixes.PROPERTY_PREFIX + productFields[8]);
+        final Node source = price.source();
+        if (source == null) {
+            return;
+        }
+        final Node valueAttribute = source.getAttributes().getNamedItem("value");
+        final String amount =
+                valueAttribute != null ? valueAttribute.getNodeValue() : price.value();
+        conditionallyAddStringProperty(source, product, vProduct.price, amount);
+    }
+}
diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HProductExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HProductExtractorFactory.java new file mode 100644 index 000000000..f4b65d944 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HProductExtractorFactory.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.PopularPrefixes; +import org.apache.any23.rdf.Prefixes; +import java.util.Arrays; + +/** + * @author Nisala Nirmana + * + */
+public class HProductExtractorFactory extends SimpleExtractorFactory implements
+        ExtractorFactory {
+
+    /** Name passed to the superclass constructor for this extractor. */
+    public static final String NAME = "html-mf2-h-product";
+
+    /** Prefix subset ("rdf", "hproduct") passed to the superclass constructor. */
+    public static final Prefixes PREFIXES = PopularPrefixes.createSubset("rdf", "hproduct");
+
+    // Keep this below NAME and PREFIXES: static initializers run in textual
+    // order and the constructor invoked here reads PREFIXES.
+    private static final ExtractorDescription descriptionInstance = new HProductExtractorFactory();
+
+    /**
+     * Builds the factory with its name, prefixes, supported MIME types and
+     * example input file.
+     */
+    public HProductExtractorFactory() {
+        super(
+                NAME,
+                PREFIXES,
+                Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
+                "example-mf2-h-product.html");
+    }
+
+    /**
+     * @return the shared description instance created when this class is initialized.
+     */
+    public static ExtractorDescription getDescriptionInstance() {
+        return descriptionInstance;
+    }
+
+    /**
+     * @return a new {@link HProductExtractor}.
+     */
+    @Override
+    public HProductExtractor createExtractor() {
+        return new HProductExtractor();
+    }
+}
diff --git a/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties b/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties index 58516ec55..34e397523 100644 --- 
a/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties +++ b/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties @@ -32,6 +32,8 @@ ex=http://example.com/ns# wo=http://purl.org/ontology/wo/ skos=http://www.w3.org/2004/02/skos/core# hrecipe=http://sindice.com/hrecipe/ +hevent=http://sindice.com/hevent/ +hproduct=http://sindice.com/hproduct/ sindice=http://vocab.sindice.net/ og=http://opengraphprotocol.org/schema/ fb=http://www.facebook.com/2008/fbml# diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEventExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEventExtractorTest.java new file mode 100644 index 000000000..6c1390980 --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEventExtractorTest.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.html.AbstractExtractorTestCase; +import org.junit.Test; +import org.openrdf.repository.RepositoryException; +import org.openrdf.rio.RDFHandlerException; +
+public class HEventExtractorTest extends AbstractExtractorTestCase {
+
+    /** Test page fed to the extractor. */
+    private static final String TEST_PAGE = "/microformats2/h-event/h-event-test.html";
+
+    /** Total number of statements the test page is expected to yield. */
+    private static final int EXPECTED_STATEMENTS = 9;
+
+    /**
+     * @return the factory of the extractor under test.
+     */
+    protected ExtractorFactory getExtractorFactory() {
+        return new HEventExtractorFactory();
+    }
+
+    /**
+     * Extracts the test page and checks that the model is non-empty and holds
+     * the expected number of statements.
+     */
+    @Test
+    public void testModelNotEmpty() throws RepositoryException, RDFHandlerException {
+        assertExtract(TEST_PAGE);
+        assertModelNotEmpty();
+        assertStatementsSize(null, null, null, EXPECTED_STATEMENTS);
+    }
+}
diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java new file mode 100644 index 000000000..3b46a7a02 --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.html.AbstractExtractorTestCase; +import org.junit.Test; +import org.openrdf.repository.RepositoryException; +import org.openrdf.rio.RDFHandlerException; +
+public class HProductExtractorTest extends AbstractExtractorTestCase {
+
+    /** Test page fed to the extractor. */
+    private static final String TEST_PAGE = "/microformats2/h-product/h-product-test.html";
+
+    /** Total number of statements the test page is expected to yield. */
+    private static final int EXPECTED_STATEMENTS = 11;
+
+    /**
+     * @return the factory of the extractor under test.
+     */
+    protected ExtractorFactory getExtractorFactory() {
+        return new HProductExtractorFactory();
+    }
+
+    /**
+     * Extracts the test page and checks that the model is non-empty and holds
+     * the expected number of statements.
+     */
+    @Test
+    public void testModelNotEmpty() throws RepositoryException, RDFHandlerException {
+        assertExtract(TEST_PAGE);
+        assertModelNotEmpty();
+        assertStatementsSize(null, null, null, EXPECTED_STATEMENTS);
+    }
+}
\ No newline at end of file diff --git a/test-resources/src/test/resources/microformats2/h-event/h-event-test.html b/test-resources/src/test/resources/microformats2/h-event/h-event-test.html new file mode 100644 index 000000000..b8af9de29 --- /dev/null +++ b/test-resources/src/test/resources/microformats2/h-event/h-event-test.html @@ -0,0 +1,36 @@ + + + + + + + +
+

Microformats Meetup

+ Official event web site +

From + + to + at Some bar in SF

+

Get together and discuss all things microformats-related.

+

This technical meetup is hosted in aid of discussion related to new draft specification of microformats 2

+
+ + + + diff --git a/test-resources/src/test/resources/microformats2/h-product/h-product-test.html b/test-resources/src/test/resources/microformats2/h-product/h-product-test.html new file mode 100644 index 000000000..08ead4f71 --- /dev/null +++ b/test-resources/src/test/resources/microformats2/h-product/h-product-test.html @@ -0,0 +1,36 @@ + + + + + + + +
+

Microformats For Dummies

+ +
+

Want to get started using microformats, but intimidated by hyphens and mediawiki? This book + contains everything you need to know!

+
+

Yours today for only $20.00 + from ACME Publishing inc.

+
+ + + + From 817029a862a2beeec06c30a4194963c3efb331d3 Mon Sep 17 00:00:00 2001 From: Nisala Date: Mon, 20 Jul 2015 01:19:21 +0530 Subject: [PATCH 05/10] correction to comments --- api/src/main/java/org/apache/any23/vocab/HProduct.java | 2 +- .../any23/extractor/html/microformats2/HEventExtractor.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/api/src/main/java/org/apache/any23/vocab/HProduct.java b/api/src/main/java/org/apache/any23/vocab/HProduct.java index 9630db315..bbbaf3258 100644 --- a/api/src/main/java/org/apache/any23/vocab/HProduct.java +++ b/api/src/main/java/org/apache/any23/vocab/HProduct.java @@ -20,7 +20,7 @@ import org.openrdf.model.URI; /** - * Vocabulary to map the h-item microformat. + * Vocabulary to map the h-product microformat. * * @author Nisala Nirmana */ diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java index 8ce70a6e6..ce67d8648 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java @@ -33,7 +33,7 @@ /** - * Extractor for the hCalendar + * Extractor for the h-event * microformat. 
* * @author Nisala Nirmana From 0008c7c770c8626b766da24522d9e12a8dc97215 Mon Sep 17 00:00:00 2001 From: Nisala Date: Mon, 20 Jul 2015 01:22:59 +0530 Subject: [PATCH 06/10] author correction --- .../extractor/html/microformats2/HEventExtractorFactory.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractorFactory.java index 602b04430..ef62f50b1 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractorFactory.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractorFactory.java @@ -26,7 +26,7 @@ import org.apache.any23.rdf.Prefixes; /** - * @author Peter Ansell p_ansell@yahoo.com + * @author Nisala Nirmana * */ public class HEventExtractorFactory extends SimpleExtractorFactory implements From 417b71a757ecb444a98cebeb25f48faa1c27524f Mon Sep 17 00:00:00 2001 From: Nisala Date: Sun, 23 Aug 2015 21:39:34 +0530 Subject: [PATCH 07/10] adding HEntry and HResume extractors --- .../java/org/apache/any23/vocab/HEntry.java | 60 +++++ .../java/org/apache/any23/vocab/HItem.java | 17 ++ .../java/org/apache/any23/vocab/HResume.java | 54 ++++ .../extractor/html/MicroformatExtractor.java | 5 + .../html/microformats2/HEntryExtractor.java | 234 ++++++++++++++++++ .../microformats2/HEntryExtractorFactory.java | 60 +++++ .../html/microformats2/HEventExtractor.java | 17 ++ .../microformats2/HItemExtractorFactory.java | 2 +- .../html/microformats2/HResumeExtractor.java | 162 ++++++++++++ .../HResumeExtractorFactory.java | 57 +++++ .../microformats2/Microformats2Prefixes.java | 1 + .../apache/any23/prefixes/prefixes.properties | 2 + .../microformats2/HEntryExtractorTest.java | 37 +++ .../microformats2/HProductExtractorTest.java | 2 +- .../microformats2/HResumeExtractorTest.java | 37 +++ .../any23/vocab/RDFSchemaUtilsTest.java | 4 +- 
.../microformats2/h-entry/h-entry-test.html | 53 ++++ .../microformats2/h-resume/h-resume-test.html | 49 ++++ 18 files changed, 849 insertions(+), 4 deletions(-) create mode 100644 api/src/main/java/org/apache/any23/vocab/HEntry.java create mode 100644 api/src/main/java/org/apache/any23/vocab/HResume.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorFactory.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorFactory.java create mode 100644 core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java create mode 100644 core/src/test/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorTest.java create mode 100644 test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html create mode 100644 test-resources/src/test/resources/microformats2/h-resume/h-resume-test.html diff --git a/api/src/main/java/org/apache/any23/vocab/HEntry.java b/api/src/main/java/org/apache/any23/vocab/HEntry.java new file mode 100644 index 000000000..e63907bfa --- /dev/null +++ b/api/src/main/java/org/apache/any23/vocab/HEntry.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.vocab; + +import org.openrdf.model.URI; + +/** + * Vocabulary to map the h-entry microformat. + * + * @author Nisala Nirmana + */
+public class HEntry extends Vocabulary {
+
+    /** Namespace under which every h-entry term is created. */
+    public static final String NS = SINDICE.NS + "hentry/";
+
+    /** Shared instance, created on the first call to {@link #getInstance()}. */
+    private static HEntry instance;
+
+    /**
+     * Returns the shared vocabulary instance, creating it on first use.
+     * The lazy creation is not synchronized.
+     *
+     * @return the shared {@code HEntry} instance.
+     */
+    public static HEntry getInstance() {
+        if (instance == null) {
+            instance = new HEntry();
+        }
+        return instance;
+    }
+
+    // Every term is assigned exactly once, so the fields are final: callers can
+    // read them but cannot rebind a term on the shared instance. VCard declares
+    // its terms the same way.
+
+    /** Terms registered through {@code createClass}. */
+    public final URI Entry = createClass(NS, "Entry");
+    // NOTE(review): "author" and "location" are lower-cased like the property
+    // terms below but are registered as classes - confirm this is intended.
+    public final URI author = createClass(NS, "author");
+    public final URI location = createClass(NS, "location");
+
+    /** Terms registered through {@code createProperty}. */
+    public final URI name = createProperty(NS, "name");
+    public final URI summary = createProperty(NS, "summary");
+    public final URI content = createProperty(NS, "content");
+    public final URI published = createProperty(NS, "published");
+    public final URI updated = createProperty(NS, "updated");
+    public final URI category = createProperty(NS, "category");
+    public final URI url = createProperty(NS, "url");
+    public final URI uid = createProperty(NS, "uid");
+    public final URI syndication = createProperty(NS, "syndication");
+    public final URI in_reply_to = createProperty(NS, "in-reply-to");
+
+    /** Not instantiable from outside; use {@link #getInstance()}. */
+    private HEntry() {
+        super(NS);
+    }
+
+}
diff --git a/api/src/main/java/org/apache/any23/vocab/HItem.java b/api/src/main/java/org/apache/any23/vocab/HItem.java index db54e65ae..01bc5a215 100644 --- a/api/src/main/java/org/apache/any23/vocab/HItem.java +++ b/api/src/main/java/org/apache/any23/vocab/HItem.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.any23.vocab; import org.openrdf.model.URI; diff --git a/api/src/main/java/org/apache/any23/vocab/HResume.java b/api/src/main/java/org/apache/any23/vocab/HResume.java new file mode 100644 index 000000000..1a50157f5 --- /dev/null +++ b/api/src/main/java/org/apache/any23/vocab/HResume.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.vocab; + +import org.openrdf.model.URI; + +/** + * @author Nisala Nirmana + * + */
+public class HResume extends Vocabulary {
+
+    /** Namespace under which every h-resume term is created. */
+    public static final String NS = SINDICE.NS + "hresume/";
+
+    /** Shared instance, created on the first call to {@link #getInstance()}. */
+    private static HResume instance;
+
+    /**
+     * Returns the shared vocabulary instance, creating it on first use.
+     * The lazy creation is not synchronized.
+     *
+     * @return the shared {@code HResume} instance.
+     */
+    public static HResume getInstance() {
+        if (instance == null) {
+            instance = new HResume();
+        }
+        return instance;
+    }
+
+    // Every term is assigned exactly once, so the fields are final: callers can
+    // read them but cannot rebind a term on the shared instance. VCard declares
+    // its terms the same way.
+
+    /** Terms registered through {@code createClass}. */
+    public final URI Resume = createClass(NS, "Resume");
+    // NOTE(review): the four terms below are lower-cased like the property
+    // terms but are registered as classes - confirm this is intended.
+    public final URI education = createClass(NS, "education");
+    public final URI experience = createClass(NS, "experience");
+    public final URI contact = createClass(NS, "contact");
+    public final URI affiliation = createClass(NS, "affiliation");
+
+    /** Terms registered through {@code createProperty}. */
+    public final URI name = createProperty(NS, "name");
+    public final URI summary = createProperty(NS, "summary");
+    public final URI skill = createProperty(NS, "skill");
+
+    /** Not instantiable from outside; use {@link #getInstance()}. */
+    private HResume() {
+        super(NS);
+    }
+}
diff --git a/core/src/main/java/org/apache/any23/extractor/html/MicroformatExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/MicroformatExtractor.java index 51ee9107c..4de6e216b 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/MicroformatExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/MicroformatExtractor.java @@ -113,6 +113,10 @@ protected ExtractionResult getCurrentExtractionResult() { return out; } + protected void setCurrentExtractionResult(ExtractionResult out) { + this.out = out; + } + protected ExtractionResult openSubResult(ExtractionContext context) { return out.openSubResult(context); } @@ -265,4 +269,5 @@ public static boolean includes( return false; } + } \ No newline at end of file diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java new file mode 100644 index 000000000..8c0c50f79 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java @@ -0,0 +1,234 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; +import org.apache.any23.extractor.html.HTMLDocument; +import org.apache.any23.vocab.HEntry; +import org.apache.any23.vocab.VCard; +import org.openrdf.model.BNode; +import org.openrdf.model.URI; +import org.openrdf.model.vocabulary.RDF; +import org.w3c.dom.Node; +import org.openrdf.model.Resource; + +import java.util.List; + +/** + * Extractor for the h-entry + * microformat. 
+ * + * @author Nisala Nirmana + */ +public class HEntryExtractor extends EntityBasedMicroformatExtractor { + + private static final HEntry vEntry = HEntry.getInstance(); + private static final VCard vVCARD = VCard.getInstance(); + + private static final String[] entryFields = { + "name", + "summary", + "content", + "published", + "updated", + "category", + "url", + "uid", + "syndication", + "in-reply-to", + "author", //toDo HCard + "location", + + }; + + private static final String[] geoFields = { + "latitude", + "longitude", + "altitude" + }; + + @Override + public ExtractorDescription getDescription() { + return HEntryExtractorFactory.getDescriptionInstance(); + } + + @Override + protected String getBaseClassName() { + return Microformats2Prefixes.CLASS_PREFIX+"entry"; + } + + @Override + protected void resetExtractor() { + // Empty. + } + + @Override + protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException { + final BNode entry = getBlankNodeFor(node); + conditionallyAddResourceProperty(entry, RDF.TYPE, vEntry.Entry); + final HTMLDocument fragment = new HTMLDocument(node); + addName(fragment, entry); + addSummary(fragment, entry); + addContent(fragment, entry); + addPublished(fragment, entry); + addUpdated(fragment, entry); + addCategories(fragment, entry); + addURLs(fragment, entry); + addUID(fragment, entry); + addSyndications(fragment, entry); + addInReplyTo(fragment, entry); + addLocations(fragment,entry); + return true; + } + + private void mapFieldWithProperty(HTMLDocument fragment, BNode entry, String fieldClass, + URI property) { + HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass); + conditionallyAddStringProperty( + title.source(), entry, property, title.value() + ); + } + + private void addName(HTMLDocument fragment, BNode entry) { + mapFieldWithProperty(fragment, entry, Microformats2Prefixes.PROPERTY_PREFIX + + entryFields[0], vEntry.name); + } + + private void addSummary(HTMLDocument 
fragment, BNode entry) { + mapFieldWithProperty(fragment, entry, Microformats2Prefixes.PROPERTY_PREFIX + entryFields[1], + vEntry.summary); + } + + private void addContent(HTMLDocument fragment, BNode entry) { + mapFieldWithProperty(fragment, entry, Microformats2Prefixes.EMBEDDED_PROPERTY_PREFIX + entryFields[2], + vEntry.content); + } + + private void addPublished(HTMLDocument fragment, BNode entry) { + final HTMLDocument.TextField[] durations = fragment.getPluralTextField( + Microformats2Prefixes.TIME_PROPERTY_PREFIX + entryFields[3]); + for(HTMLDocument.TextField duration : durations) { + Node attribute=duration.source().getAttributes().getNamedItem("datetime"); + if (attribute==null){ + conditionallyAddStringProperty( + duration.source(), + entry, vEntry.published, duration.value() + ); + }else{ + conditionallyAddStringProperty( + duration.source(), + entry, vEntry.published, attribute.getNodeValue() + ); + } + } + } + + private void addUpdated(HTMLDocument fragment, BNode entry) { + final HTMLDocument.TextField[] durations = fragment.getPluralTextField( + Microformats2Prefixes.TIME_PROPERTY_PREFIX + entryFields[4]); + for(HTMLDocument.TextField duration : durations) { + Node attribute=duration.source().getAttributes().getNamedItem("datetime"); + if (attribute==null){ + conditionallyAddStringProperty( + duration.source(), + entry, vEntry.updated, duration.value() + ); + }else{ + conditionallyAddStringProperty( + duration.source(), + entry, vEntry.updated, attribute.getNodeValue() + ); + } + } + } + + private void addCategories(HTMLDocument fragment, BNode entry) { + final HTMLDocument.TextField[] categories = fragment.getPluralTextField + (Microformats2Prefixes.PROPERTY_PREFIX + entryFields[5]); + for (HTMLDocument.TextField category : categories) { + conditionallyAddStringProperty( + category.source(), entry, vEntry.category, category.value() + ); + } + } + + private void addURLs(HTMLDocument fragment, BNode entry) throws ExtractionException { + final 
HTMLDocument.TextField[] urls = fragment.getPluralUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[6]); + for(HTMLDocument.TextField url : urls) { + addURIProperty(entry, vEntry.url, fragment.resolveURI(url.value())); + } + } + + private void addUID(HTMLDocument fragment, BNode entry) throws ExtractionException { + final HTMLDocument.TextField uid = fragment.getSingularTextField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[7]); + if(uid.source()==null) + return; + addURIProperty(entry, vEntry.uid, fragment.resolveURI(uid.value())); + } + + private void addSyndications(HTMLDocument fragment, BNode entry) throws ExtractionException { + final HTMLDocument.TextField[] syndications = fragment.getPluralUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[8]); + for(HTMLDocument.TextField syndication : syndications) { + addURIProperty(entry, vEntry.syndication, fragment.resolveURI(syndication.value())); + } + } + + private void addInReplyTo(HTMLDocument fragment, BNode entry) throws ExtractionException { + final HTMLDocument.TextField inReplyTo = fragment.getSingularTextField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[9]); + if(inReplyTo.source()==null) + return; + addURIProperty(entry, vEntry.in_reply_to, fragment.resolveURI(inReplyTo.value())); + } + + private void addLocations(HTMLDocument doc, Resource entry) throws ExtractionException { + List nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + entryFields[11] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "geo"); + if (nodes.isEmpty()) + return; + for (Node node : nodes) { + BNode location = valueFactory.createBNode(); + addURIProperty(location, RDF.TYPE, vEntry.location); + HTMLDocument fragment = new HTMLDocument(node); + for (String field : geoFields) { + HTMLDocument.TextField[] values = fragment.getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX+field); + for (HTMLDocument.TextField val : 
values) { + Node attribute=val.source().getAttributes().getNamedItem("title"); + if (attribute==null){ + conditionallyAddStringProperty( + val.source(), + location, vVCARD.getProperty(field), val.value() + ); + }else{ + conditionallyAddStringProperty( + val.source(), + location, vVCARD.getProperty(field), attribute.getNodeValue() + ); + } + } + } + } + } +} diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorFactory.java new file mode 100644 index 000000000..e2d45560a --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorFactory.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.PopularPrefixes; +import org.apache.any23.rdf.Prefixes; + +import java.util.Arrays; + +/** + * Extractor for the h-entry + * microformat. 
+ * + * @author Nisala Nirmana + */ +public class HEntryExtractorFactory extends SimpleExtractorFactory implements + ExtractorFactory { + + public static final String NAME = "html-mf2-h-entry"; + + public static final Prefixes PREFIXES = PopularPrefixes.createSubset("rdf", "hentry"); + + private static final ExtractorDescription descriptionInstance = new HEntryExtractorFactory(); + + public HEntryExtractorFactory() { + super( + HEntryExtractorFactory.NAME, + HEntryExtractorFactory.PREFIXES, + Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), + "example-mf2-h-entry.html"); + } + + @Override + public HEntryExtractor createExtractor() { + return new HEntryExtractor(); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } +} diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java index ce67d8648..ea907161d 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java @@ -24,6 +24,7 @@ import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; import org.apache.any23.vocab.HEvent; import org.openrdf.model.BNode; +import org.openrdf.model.Resource; import org.openrdf.model.URI; import org.openrdf.model.vocabulary.RDF; import org.w3c.dom.Node; @@ -89,6 +90,22 @@ protected boolean extractEntity(Node node, ExtractionResult out) throws Extracti return true; } + public Resource extractEntityAsEmbeddedProperty(HTMLDocument fragment, BNode event, + ExtractionResult out) + throws ExtractionException { + this.setCurrentExtractionResult(out); + addName(fragment, event); + addSummary(fragment, event); + addStart(fragment, event); + addEnd(fragment, event); + addDuration(fragment, event); + addDescription(fragment, event); + addURLs(fragment, event); 
+ addCategories(fragment, event); + addLocation(fragment, event); + return event; + } + private void mapFieldWithProperty(HTMLDocument fragment, BNode recipe, String fieldClass, URI property) { HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass); diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java index 842368675..14f20bdad 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java @@ -17,7 +17,7 @@ public class HItemExtractorFactory extends SimpleExtractorFactoryhResume + * microformat. + * + * @author Nisala Nirmana + */ +public class HResumeExtractor extends EntityBasedMicroformatExtractor { + + private static final HResume vResume = HResume.getInstance(); + + private static final String[] resumeFields = { + "name", + "summary", + "contact",//toDo Hcard + "education", + "experience", + "skill", + "affiliation"//toDo Hcard + }; + + @Override + public ExtractorDescription getDescription() { + return HResumeExtractorFactory.getDescriptionInstance(); + } + + @Override + public String getBaseClassName() { + return Microformats2Prefixes.CLASS_PREFIX + "resume"; + } + + @Override + protected void resetExtractor() { + // Empty. 
+ } + + @Override + protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException { + if (null == node) return false; + BNode person = getBlankNodeFor(node); + out.writeTriple(person, RDF.TYPE, vResume.Resume); + final HTMLDocument fragment = new HTMLDocument(node); + + addName(fragment, person); + addSummary(fragment, person); + addSkills(fragment, person); + + addExperiences(fragment, person); + addEducations(fragment, person); + + + final TagSoupExtractionResult tser = (TagSoupExtractionResult) out; + tser.addResourceRoot( + DomUtils.getXPathListForNode(node), + person, + this.getClass() + ); + + return true; + } + + private void addName(HTMLDocument doc, Resource person) { + HTMLDocument.TextField name = doc.getSingularTextField( + Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[0]); + conditionallyAddStringProperty( + name.source(), + person, + vResume.name, + name.value() + ); + } + + private void addSummary(HTMLDocument doc, Resource person) { + HTMLDocument.TextField summary = doc.getSingularTextField( + Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[1]); + conditionallyAddStringProperty( + summary.source(), + person, + vResume.summary, + summary.value() + ); + } + + private void addSkills(HTMLDocument doc, Resource person) { + final HTMLDocument.TextField[] skills = doc.getPluralTextField( + Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[5]); + for (HTMLDocument.TextField skill : skills) { + conditionallyAddStringProperty( + skill.source(), + person, + vResume.skill, + skill.value() + ); + } + + } + + private void addExperiences(HTMLDocument doc, Resource person) throws ExtractionException { + List nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[4] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "event"); + if (nodes.isEmpty()) + return; + HEventExtractorFactory factory = new HEventExtractorFactory(); + HEventExtractor extractor = 
factory.createExtractor(); + for (Node node : nodes) { + BNode event = valueFactory.createBNode(); + addURIProperty(event, RDF.TYPE, vResume.experience); + extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), event, + getCurrentExtractionResult()); + } + } + + private void addEducations(HTMLDocument doc, Resource person) throws ExtractionException { + List nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[3] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "event"); + if (nodes.isEmpty()) + return; + HEventExtractorFactory factory = new HEventExtractorFactory(); + HEventExtractor extractor = factory.createExtractor(); + for (Node node : nodes) { + BNode event = valueFactory.createBNode(); + addURIProperty(event, RDF.TYPE, vResume.education); + extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), event, + getCurrentExtractionResult()); + } + } +} diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorFactory.java new file mode 100644 index 000000000..a8120ebee --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorFactory.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import java.util.Arrays; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.PopularPrefixes; +import org.apache.any23.rdf.Prefixes; + +/** + * @author Nisala Nirmana + * + */ +public class HResumeExtractorFactory extends SimpleExtractorFactory implements + ExtractorFactory { + + public static final String NAME = "html-mf2-h-resume"; + + public static final Prefixes PREFIXES = PopularPrefixes.createSubset("rdf", "doac", "foaf"); + + private static final ExtractorDescription descriptionInstance = new HResumeExtractorFactory(); + + public HResumeExtractorFactory() { + super( + HResumeExtractorFactory.NAME, + HResumeExtractorFactory.PREFIXES, + Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), + "example-mf2-h-resume.html"); + } + + @Override + public HResumeExtractor createExtractor() { + return new HResumeExtractor(); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } +} diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java index 18ac1b151..d6b33495f 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java +++ 
b/core/src/main/java/org/apache/any23/extractor/html/microformats2/Microformats2Prefixes.java @@ -23,4 +23,5 @@ public class Microformats2Prefixes { public static final String URL_PROPERTY_PREFIX = "u-"; public static final String EMBEDDED_PROPERTY_PREFIX = "e-"; public static final String TIME_PROPERTY_PREFIX = "dt-"; + public static final String SPACE_SEPARATOR = " "; } \ No newline at end of file diff --git a/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties b/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties index 34e397523..c7eaf545e 100644 --- a/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties +++ b/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties @@ -34,6 +34,8 @@ skos=http://www.w3.org/2004/02/skos/core# hrecipe=http://sindice.com/hrecipe/ hevent=http://sindice.com/hevent/ hproduct=http://sindice.com/hproduct/ +hitem=http://sindice.com/hitem/ +hentry=http://sindice.com/hentry/ sindice=http://vocab.sindice.net/ og=http://opengraphprotocol.org/schema/ fb=http://www.facebook.com/2008/fbml# diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java new file mode 100644 index 000000000..cc2974d19 --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.html.AbstractExtractorTestCase; +import org.junit.Test; +import org.openrdf.repository.RepositoryException; +import org.openrdf.rio.RDFHandlerException; + +public class HEntryExtractorTest extends AbstractExtractorTestCase { + protected ExtractorFactory getExtractorFactory() { + return new HEntryExtractorFactory(); + } + + @Test + public void testModelNotEmpty() throws RepositoryException, RDFHandlerException { + assertExtract("/microformats2/h-entry/h-entry-test.html"); + assertModelNotEmpty(); + assertStatementsSize(null, null, null, 10); + } +} \ No newline at end of file diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java index 3b46a7a02..49c1755cf 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HProductExtractorTest.java @@ -32,6 +32,6 @@ protected ExtractorFactory getExtractorFactory() { public void testModelNotEmpty() throws RepositoryException, RDFHandlerException { assertExtract("/microformats2/h-product/h-product-test.html"); assertModelNotEmpty(); - assertStatementsSize(null, null, null, 11); + assertStatementsSize(null, null, null, 6); } } \ No newline at end of file diff --git 
a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorTest.java new file mode 100644 index 000000000..dd2f5d186 --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HResumeExtractorTest.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.html.AbstractExtractorTestCase; +import org.junit.Test; +import org.openrdf.repository.RepositoryException; +import org.openrdf.rio.RDFHandlerException; + +public class HResumeExtractorTest extends AbstractExtractorTestCase { + protected ExtractorFactory getExtractorFactory() { + return new HResumeExtractorFactory(); + } + + @Test + public void testModelNotEmpty() throws RepositoryException, RDFHandlerException { + assertExtract("/microformats2/h-resume/h-resume-test.html"); + assertModelNotEmpty(); + assertStatementsSize(null, null, null, 12); + } +} \ No newline at end of file diff --git a/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java b/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java index b4f8b7a5c..c58e2a106 100644 --- a/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java +++ b/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java @@ -43,7 +43,7 @@ public class RDFSchemaUtilsTest { */ @Test public void testSerializeVocabulariesNTriples() { - serializeVocabularies(RDFFormat.NTRIPLES, 1920); + serializeVocabularies(RDFFormat.NTRIPLES, 2012);//1920 } /** @@ -53,7 +53,7 @@ public void testSerializeVocabulariesNTriples() { */ @Test public void testSerializeVocabulariesRDFXML() { - serializeVocabularies(RDFFormat.RDFXML, 4992); // Effective lines + separators. + serializeVocabularies(RDFFormat.RDFXML, 5252); // Effective lines + separators. //4992 } private void serializeVocabularies(RDFFormat format, int expectedLines) { diff --git a/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html b/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html new file mode 100644 index 000000000..f3c8cf791 --- /dev/null +++ b/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html @@ -0,0 +1,53 @@ + + + + + +
+

microformats.org at 7

+ +

Published + +

+ +
+

Last week the microformats.org community + celebrated its 7th birthday at a gathering hosted by Mozilla in + San Francisco and recognized accomplishments, challenges, and + opportunities.

+ +

The microformats tagline “humans first, machines second” + forms the basis of many of our + principles, and + in that regard, we’d like to recognize a few people and + thank them for their years of volunteer service

+
+ +

Updated + +

+ +
+

Location + N 37° 24.491, + W 122° 08.313 +

+
+
+ + + + diff --git a/test-resources/src/test/resources/microformats2/h-resume/h-resume-test.html b/test-resources/src/test/resources/microformats2/h-resume/h-resume-test.html new file mode 100644 index 000000000..15dd83537 --- /dev/null +++ b/test-resources/src/test/resources/microformats2/h-resume/h-resume-test.html @@ -0,0 +1,49 @@ + + + + + +
+

Tim Berners-Lee

+ +

Invented the World Wide Web.


+ +
+ Education : + + +
+ +
+

Experience : + Present + +

+
+ +
+ Skills: +
    +
  • information systems
  • +
  • advocacy
  • +
  • leadership
  • +
      +
+ +
+ + + From cf48a5bf88b40bc327108a4daa857e14d914d654 Mon Sep 17 00:00:00 2001 From: Nisala Date: Wed, 26 Aug 2015 23:41:22 +0530 Subject: [PATCH 08/10] add HCard extractor and completed all the toDos related to hcard dependencies --- .../java/org/apache/any23/vocab/HCard.java | 86 ++++ .../html/microformats2/HCardExtractor.java | 450 ++++++++++++++++++ .../microformats2/HCardExtractorFactory.java | 57 +++ .../html/microformats2/HEntryExtractor.java | 20 +- .../html/microformats2/HEventExtractor.java | 64 ++- .../html/microformats2/HProductExtractor.java | 21 +- .../html/microformats2/HResumeExtractor.java | 37 +- .../apache/any23/prefixes/prefixes.properties | 1 + .../html/microformats2/HAdrExtractorTest.java | 2 +- .../microformats2/HCardExtractorTest.java | 37 ++ .../microformats2/HEntryExtractorTest.java | 2 +- .../microformats2/HEventExtractorTest.java | 2 +- .../any23/vocab/RDFSchemaUtilsTest.java | 4 +- .../microformats2/h-card/h-card-test.html | 45 ++ .../microformats2/h-entry/h-entry-test.html | 21 + 15 files changed, 829 insertions(+), 20 deletions(-) create mode 100644 api/src/main/java/org/apache/any23/vocab/HCard.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HCardExtractor.java create mode 100644 core/src/main/java/org/apache/any23/extractor/html/microformats2/HCardExtractorFactory.java create mode 100644 core/src/test/java/org/apache/any23/extractor/html/microformats2/HCardExtractorTest.java create mode 100644 test-resources/src/test/resources/microformats2/h-card/h-card-test.html diff --git a/api/src/main/java/org/apache/any23/vocab/HCard.java b/api/src/main/java/org/apache/any23/vocab/HCard.java new file mode 100644 index 000000000..b22e58c02 --- /dev/null +++ b/api/src/main/java/org/apache/any23/vocab/HCard.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.vocab; + +import org.openrdf.model.URI; + +/** + * Vocabulary to map the h-card microformat. + * + * @author Nisala Nirmana + */ +public class HCard extends Vocabulary { + public static final String NS = SINDICE.NS + "hcard/"; /* namespace passed to every createClass and createProperty call below */ + + private static HCard instance; + + public static HCard getInstance() { /* creates the shared instance on first call; the null check is not synchronized */ + if(instance == null) { + instance = new HCard(); + } + return instance; + } + + public final URI Card = createClass(NS, "Card"); /* vocabulary terms are final so callers cannot reassign them, matching the final fields of VCard */ + public final URI Address = createClass(NS, "Address"); + public final URI Geo = createClass(NS, "Geo"); + + + public final URI name = createProperty(NS, "name"); + public final URI honorific_prefix = createProperty(NS, "honorific-prefix"); + public final URI given_name = createProperty(NS, "given-name"); + public final URI additional_name = createProperty(NS, "additional-name"); + public final URI family_name = createProperty(NS, "family-name"); + public final URI sort_string = createProperty(NS, "sort-string"); + public final URI honorific_suffix = createProperty(NS, "honorific-suffix"); + public final URI nickname = createProperty(NS, "nickname"); + public final URI email = createProperty(NS, "email"); + public final URI logo = createProperty(NS, "logo"); + public final URI photo = createProperty(NS, "photo"); + public final URI url = createProperty(NS, "url"); + public final URI uid = createProperty(NS, "uid"); +
public final URI category = createProperty(NS, "category"); + public final URI tel = createProperty(NS, "tel"); + public final URI note = createProperty(NS, "note"); + public final URI bday = createProperty(NS, "bday"); + public final URI key = createProperty(NS, "key"); + public final URI org = createProperty(NS, "org"); + public final URI job_title = createProperty(NS, "job-title"); + public final URI role = createProperty(NS, "role"); + public final URI impp = createProperty(NS, "impp"); + public final URI sex = createProperty(NS, "sex"); + public final URI gender_identity = createProperty(NS, "gender-identity"); + public final URI anniversary = createProperty(NS, "anniversary"); + public final URI geo = createProperty(NS, "geo"); + public final URI adr = createProperty(NS, "adr"); + + public final URI street_address = createProperty(NS, "street-address"); /* address sub-properties */ + public final URI extended_address = createProperty(NS, "extended-address"); + public final URI locality = createProperty(NS, "locality"); + public final URI region = createProperty(NS, "region"); + public final URI postal_code = createProperty(NS, "postal-code"); + public final URI country_name = createProperty(NS, "country-name"); + + public final URI latitude = createProperty(NS, "latitude"); /* geo sub-properties */ + public final URI longitude = createProperty(NS, "longitude"); + public final URI altitude = createProperty(NS, "altitude"); + + private HCard() { /* private: instances are obtained through getInstance */ + super(NS); + } +} +diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HCardExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HCardExtractor.java new file mode 100644 index 000000000..ebdd77b96 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HCardExtractor.java @@ -0,0 +1,450 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.TagSoupExtractionResult; +import org.apache.any23.extractor.html.HTMLDocument; +import org.apache.any23.vocab.HCard; +import org.apache.any23.vocab.VCard; +import org.openrdf.model.BNode; +import org.openrdf.model.Resource; +import org.openrdf.model.URI; +import org.openrdf.model.vocabulary.RDF; +import org.w3c.dom.Node; +import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; +import org.apache.any23.extractor.html.DomUtils; + +import java.util.List; + + +/** + * Extractor for the h-Card + * microformat. 
+ * + * @author Nisala Nirmana + */ +public class HCardExtractor extends EntityBasedMicroformatExtractor { + + private static final HCard vCARD = HCard.getInstance(); + + private static final String[] cardFields = { /* unprefixed h-card property names; the add* methods pick entries by index, so the order must not change */ + "name", + "honorific-prefix", + "given-name", + "additional-name", + "family-name", + "sort-string", + "honorific-suffix", + "nickname", + "email", + "logo", + "photo", + "url", + "uid", + "category", + "tel", + "note", + "bday", + "key", + "org", + "job-title", + "role", + "impp", + "sex", + "gender-identity", + "anniversary", + "adr", + "geo" + }; + + private static final String[] addressFields = { /* sub-properties read from each nested address node in addAdr */ + "street-address", + "extended-address", + "locality", + "region", + "postal-code", + "country-name", + "geo" + }; + + private static final String[] geoFields = { /* sub-properties read from each nested geo node in addGeo */ + "latitude", + "longitude", + "altitude" + }; + + + + @Override + public ExtractorDescription getDescription() { + return HCardExtractorFactory.getDescriptionInstance(); + } + + @Override + protected String getBaseClassName() { /* root class name is the microformats2 class prefix followed by card */ + return Microformats2Prefixes.CLASS_PREFIX+"card"; + } + + @Override + protected void resetExtractor() { + //empty + } + + @Override + protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException { /* types the blank node for this DOM node as vCARD.Card, maps every h-card field found in the node fragment, then registers the node as a resource root */ + final BNode card = getBlankNodeFor(node); + conditionallyAddResourceProperty(card, RDF.TYPE, vCARD.Card); + final HTMLDocument fragment = new HTMLDocument(node); + addName(fragment, card); + addHonorificPrefix(fragment, card); + addGivenName(fragment, card); + addAdditionalName(fragment, card); + addFamilyName(fragment, card); + addSortString(fragment, card); + addHonorificSuffix(fragment, card); + addNickname(fragment, card); + addEmails(fragment, card); + addLogo(fragment, card); + addPhoto(fragment, card); + addURLs(fragment, card); + addUID(fragment, card); + addCategories(fragment, card); + addTelephones(fragment, card); + addNotes(fragment, card); + addBday(fragment, card); + addKey(fragment, card); + addOrg(fragment, card); +
addJobTitle(fragment, card); + addRole(fragment, card); + addImpp(fragment, card); + addSex(fragment, card); + addGenderIdentity(fragment, card); + addAnniversary(fragment, card); + addGeo(fragment, card); + addAdr(fragment, card); + final TagSoupExtractionResult tser = (TagSoupExtractionResult) out; /* downcast: a ClassCastException is thrown if out is any other ExtractionResult implementation */ + tser.addResourceRoot( DomUtils.getXPathListForNode(node), card, this.getClass() ); + return true; + } + + public Resource extractEntityAsEmbeddedProperty(HTMLDocument fragment, BNode card, + ExtractionResult out) + throws ExtractionException { /* maps the same fields as extractEntity onto the caller-supplied node and returns that node; no Card type triple and no resource root are added here */ + this.setCurrentExtractionResult(out); + addName(fragment, card); + addHonorificPrefix(fragment, card); + addGivenName(fragment, card); + addAdditionalName(fragment, card); + addFamilyName(fragment, card); + addSortString(fragment, card); + addHonorificSuffix(fragment, card); + addNickname(fragment, card); + addEmails(fragment, card); + addLogo(fragment, card); + addPhoto(fragment, card); + addURLs(fragment, card); + addUID(fragment, card); + addCategories(fragment, card); + addTelephones(fragment, card); + addNotes(fragment, card); + addBday(fragment, card); + addKey(fragment, card); + addOrg(fragment, card); + addJobTitle(fragment, card); + addRole(fragment, card); + addImpp(fragment, card); + addSex(fragment, card); + addGenderIdentity(fragment, card); + addAnniversary(fragment, card); + addGeo(fragment, card); + addAdr(fragment, card); + return card; + } + + + + private void mapFieldWithProperty(HTMLDocument fragment, BNode card, String fieldClass, + URI property) { /* reads one text field by class name and passes its source node and value to conditionallyAddStringProperty */ + HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass); + conditionallyAddStringProperty( + title.source(), card, property, title.value() + ); + } + + private void addName(HTMLDocument fragment, BNode card) { /* cardFields[0] is "name" */ + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[0], vCARD.name); + } + + private void addHonorificPrefix(HTMLDocument fragment, BNode card) { + mapFieldWithProperty(fragment, card,
Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[1], vCARD.honorific_prefix); + } + + private void addGivenName(HTMLDocument fragment, BNode card) { /* cardFields[2] is "given-name" */ + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[2], vCARD.given_name); + } + + private void addAdditionalName(HTMLDocument fragment, BNode card) { /* cardFields[3] is "additional-name" */ + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[3], vCARD.additional_name); + } + + private void addFamilyName(HTMLDocument fragment, BNode card) { /* cardFields[4] is "family-name" */ + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[4], vCARD.family_name); + } + + private void addSortString(HTMLDocument fragment, BNode card) { /* cardFields[5] is "sort-string" */ + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[5], vCARD.sort_string); + } + + private void addHonorificSuffix(HTMLDocument fragment, BNode card) { /* cardFields[6] is "honorific-suffix" */ + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[6], vCARD.honorific_suffix); + } + + private void addNickname(HTMLDocument fragment, BNode card) { /* cardFields[7] is "nickname" */ + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[7], vCARD.nickname); + } + + private void addEmails(HTMLDocument fragment, BNode card) throws ExtractionException { /* cardFields[8] is "email"; adds one URI property per URL field value, each resolved through the fragment */ + final HTMLDocument.TextField[] emails = fragment.getPluralUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[8]); + for(HTMLDocument.TextField email : emails) { + addURIProperty(card, vCARD.email, fragment.resolveURI(email.value())); + + } + } + + private void addLogo(HTMLDocument fragment, BNode card) throws ExtractionException { /* cardFields[9] is "logo"; nothing is added when the field has no source node */ + final HTMLDocument.TextField logo = fragment.getSingularUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[9]); + if(logo.source()==null) + return; + addURIProperty(card, vCARD.logo, fragment.resolveURI(logo.value())); + } + + private void addPhoto(HTMLDocument fragment, BNode card) throws ExtractionException { +
final HTMLDocument.TextField photo = fragment.getSingularUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[10]); /* cardFields[10] is "photo" */ + if(photo.source()==null) + return; + addURIProperty(card, vCARD.photo, fragment.resolveURI(photo.value())); + } + + private void addURLs(HTMLDocument fragment, BNode card) throws ExtractionException { /* cardFields[11] is "url"; adds one URI property per URL field value */ + final HTMLDocument.TextField[] urls = fragment.getPluralUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[11]); + for(HTMLDocument.TextField url : urls) { + addURIProperty(card, vCARD.url, fragment.resolveURI(url.value())); + + } + } + + private void addUID(HTMLDocument fragment, BNode card) throws ExtractionException { /* cardFields[12] is "uid"; nothing is added when the field has no source node */ + final HTMLDocument.TextField uid = fragment.getSingularUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[12]); + if(uid.source()==null) + return; + addURIProperty(card, vCARD.uid, fragment.resolveURI(uid.value())); + } + + + private void addCategories(HTMLDocument fragment, BNode entry) { /* cardFields[13] is "category"; adds one string property per text field value */ + final HTMLDocument.TextField[] categories = fragment.getPluralTextField + (Microformats2Prefixes.PROPERTY_PREFIX + cardFields[13]); + for (HTMLDocument.TextField category : categories) { + conditionallyAddStringProperty( + category.source(), entry, vCARD.category, category.value() + ); + } + } + + private void addTelephones(HTMLDocument fragment, BNode card) { /* cardFields[14] is "tel"; the element attribute named value is used when present, otherwise the field text */ + final HTMLDocument.TextField[] telephones = fragment.getPluralTextField + (Microformats2Prefixes.PROPERTY_PREFIX + cardFields[14]); + for (HTMLDocument.TextField tel : telephones) { + Node attribute=tel.source().getAttributes().getNamedItem("value"); + if (attribute==null){ + conditionallyAddStringProperty( + tel.source(), card, vCARD.tel, tel.value() + ); + }else{ + conditionallyAddStringProperty( + tel.source(), card, vCARD.tel, attribute.getNodeValue() + ); + } + } + } + + private void addNotes(HTMLDocument fragment, BNode entry) { + final HTMLDocument.TextField[] categories = fragment.getPluralTextField +
(Microformats2Prefixes.PROPERTY_PREFIX + cardFields[15]); /* cardFields[15] is "note"; the local names say category but the values are notes */ + for (HTMLDocument.TextField category : categories) { + conditionallyAddStringProperty( + category.source(), entry, vCARD.note, category.value() + ); + } + } + + private void addBday(HTMLDocument fragment, BNode card) { /* cardFields[16] is "bday"; the datetime attribute is used when present, otherwise the field text */ + final HTMLDocument.TextField bday = fragment.getSingularTextField( + Microformats2Prefixes.TIME_PROPERTY_PREFIX + cardFields[16]); + if (bday.source() == null) + return; + + Node attribute = bday.source().getAttributes().getNamedItem("datetime"); + if (attribute == null) { + conditionallyAddStringProperty( + bday.source(), + card, vCARD.bday, bday.value() + ); + } else { + conditionallyAddStringProperty( + bday.source(), + card, vCARD.bday, attribute.getNodeValue() + ); + + } + } + + private void addKey(HTMLDocument fragment, BNode card) throws ExtractionException { /* cardFields[17] is "key"; read as a URL field, like logo, photo and uid, because the class carries the URL property prefix and the value is resolved as a URI */ + final HTMLDocument.TextField key = fragment.getSingularUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[17]); + if(key.source()==null) + return; + addURIProperty(card, vCARD.key, fragment.resolveURI(key.value())); + } + + private void addOrg(HTMLDocument fragment, BNode card) { /* cardFields[18] is "org" */ + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[18], vCARD.org); + } + + private void addJobTitle(HTMLDocument fragment, BNode card) { /* cardFields[19] is "job-title" */ + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[19], vCARD.job_title); + } + + private void addRole(HTMLDocument fragment, BNode card) { /* cardFields[20] is "role" */ + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[20], vCARD.role); + } + + private void addImpp(HTMLDocument fragment, BNode card) throws ExtractionException { /* cardFields[21] is "impp"; read as a URL field for the same reason as addKey */ + final HTMLDocument.TextField impp = fragment.getSingularUrlField + (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[21]); + if(impp.source()==null) + return; + addURIProperty(card, vCARD.impp, fragment.resolveURI(impp.value())); + } + + private void addSex(HTMLDocument
fragment, BNode card) { + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[22], vCARD.sex); + } + + private void addGenderIdentity(HTMLDocument fragment, BNode card) { /* cardFields[23] is "gender-identity" */ + mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + + cardFields[23], vCARD.gender_identity); + } + + + private void addAnniversary(HTMLDocument fragment, BNode card) { /* cardFields[24] is "anniversary"; the datetime attribute is used when present, otherwise the field text; the value is stored under vCARD.anniversary, not vCARD.bday */ + final HTMLDocument.TextField anniversary = fragment.getSingularTextField( + Microformats2Prefixes.TIME_PROPERTY_PREFIX + cardFields[24]); + if (anniversary.source() == null) + return; + + Node attribute = anniversary.source().getAttributes().getNamedItem("datetime"); + if (attribute == null) { + conditionallyAddStringProperty( + anniversary.source(), + card, vCARD.anniversary, anniversary.value() + ); + } else { + conditionallyAddStringProperty( + anniversary.source(), + card, vCARD.anniversary, attribute.getNodeValue() + ); + + } + } + + private void addAdr(HTMLDocument doc, Resource card) throws ExtractionException { /* cardFields[25] is "adr"; builds one Address node per nested address element. NOTE(review): the Address node is typed and filled but never attached to card here, confirm whether a linking triple is intended */ + List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + cardFields[25] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + cardFields[25]); + if (nodes.isEmpty()) + return; + for (Node node : nodes) { + BNode location = valueFactory.createBNode(); + addURIProperty(location, RDF.TYPE, vCARD.Address); + HTMLDocument fragment = new HTMLDocument(node); + for (String field : addressFields) { + HTMLDocument.TextField[] values = fragment.getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX+field); + for (HTMLDocument.TextField val : values) { + if(!field.equals("geo")) { + conditionallyAddStringProperty( + val.source(), + location, vCARD.getProperty(field), val.value() + ); + }else { /* a geo value inside the address is delegated to addGeo, once per matched value */ + addGeo(new HTMLDocument(node),card); + } + } + } + } + } + + private void addGeo(HTMLDocument doc, Resource card) throws ExtractionException { + List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + cardFields[26]
+ + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + cardFields[26]); /* cardFields[26] is "geo" */ + if (nodes.isEmpty()) + return; + for (Node node : nodes) { + BNode location = valueFactory.createBNode(); /* NOTE(review): card is not referenced anywhere in this method, so the Geo node is left unattached, confirm whether a linking triple is intended */ + addURIProperty(location, RDF.TYPE, vCARD.Geo); + HTMLDocument fragment = new HTMLDocument(node); + for (String field : geoFields) { + HTMLDocument.TextField[] values = fragment.getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX+field); + for (HTMLDocument.TextField val : values) { + Node attribute=val.source().getAttributes().getNamedItem("title"); /* the title attribute is used when present, otherwise the field text */ + if (attribute==null){ + conditionallyAddStringProperty( + val.source(), + location, vCARD.getProperty(field), val.value() + ); + }else{ + conditionallyAddStringProperty( + val.source(), + location, vCARD.getProperty(field), attribute.getNodeValue() + ); + } + } + } + } + } + +} diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HCardExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HCardExtractorFactory.java new file mode 100644 index 000000000..5a7d63e9b --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HCardExtractorFactory.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import java.util.Arrays; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.PopularPrefixes; +import org.apache.any23.rdf.Prefixes; + +/** + * Factory for HCardExtractor; a factory instance also serves as the extractor description. + * @author Nisala Nirmana + */ +public class HCardExtractorFactory extends SimpleExtractorFactory<HCardExtractor> implements + ExtractorFactory<HCardExtractor> { + + public static final String NAME = "html-mf2-h-card"; + + public static final Prefixes PREFIXES = PopularPrefixes.createSubset("rdf", "hcard"); + + private static final ExtractorDescription descriptionInstance = new HCardExtractorFactory(); /* shared description returned by getDescriptionInstance */ + + public HCardExtractorFactory() { + super( + HCardExtractorFactory.NAME, + HCardExtractorFactory.PREFIXES, + Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), + "example-mf2-h-card.html"); /* NOTE(review): the file list of this patch does not add example-mf2-h-card.html, confirm the resource exists */ + } + + @Override + public HCardExtractor createExtractor() { /* returns a new extractor on every call */ + return new HCardExtractor(); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } +} diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java index 8c0c50f79..3a85b5b5b 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEntryExtractor.java @@ -55,7 +55,7 @@ public class HEntryExtractor extends EntityBasedMicroformatExtractor { "uid", "syndication", "in-reply-to", - "author", //toDo HCard + "author", "location", }; @@ -96,10 +96,26 @@ protected boolean extractEntity(Node node, ExtractionResult out) throws Extracti addUID(fragment, entry); addSyndications(fragment, entry);
addInReplyTo(fragment, entry); - addLocations(fragment,entry); + addLocations(fragment, entry); + addAuthors(fragment, entry); return true; } + private void addAuthors(HTMLDocument doc, Resource entry) throws ExtractionException { /* finds nested h-card author elements and maps each onto a fresh blank node through HCardExtractor. NOTE(review): entry is not referenced, so no triple links the entry to the author node, and RDF.TYPE is set to vEntry.author, confirm both are intended */ + List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + entryFields[10] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card"); + if (nodes.isEmpty()) + return; + HCardExtractorFactory factory = new HCardExtractorFactory(); + HCardExtractor extractor = factory.createExtractor(); /* one extractor is reused for every matched node */ + for (Node node : nodes) { + BNode author = valueFactory.createBNode(); + addURIProperty(author, RDF.TYPE, vEntry.author); + extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), author, + getCurrentExtractionResult()); + } + } + private void mapFieldWithProperty(HTMLDocument fragment, BNode entry, String fieldClass, URI property) { HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass); diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java index ea907161d..3f4d817de 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java @@ -23,6 +23,7 @@ import org.apache.any23.extractor.TagSoupExtractionResult; import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor; import org.apache.any23.vocab.HEvent; +import org.apache.any23.vocab.VCard; import org.openrdf.model.BNode; import org.openrdf.model.Resource; import org.openrdf.model.URI; @@ -30,6 +31,8 @@ import org.w3c.dom.Node; import org.apache.any23.extractor.html.HTMLDocument; +import java.util.List; + import static org.apache.any23.extractor.html.HTMLDocument.TextField; @@ -42,6 +45,7 @@ public class HEventExtractor extends EntityBasedMicroformatExtractor {
private static final HEvent vEvent = HEvent.getInstance(); + private static final VCard vVCARD = VCard.getInstance(); private String[] eventFields = { "name", @@ -52,8 +56,14 @@ public class HEventExtractor extends EntityBasedMicroformatExtractor { "description", "url", "category", - "location", //toDO - "attendee" //toDO + "location", + "attendee" + }; + + private static final String[] geoFields = { + "latitude", + "longitude", + "altitude" }; @@ -85,7 +95,7 @@ protected boolean extractEntity(Node node, ExtractionResult out) throws Extracti addDescription(fragment, event); addURLs(fragment, event); addCategories(fragment, event); - addLocation(fragment, event); + addLocations(fragment, event); addAttendees(fragment, event); /* attendees are extracted for top-level events too, as in extractEntityAsEmbeddedProperty; the call shares the added addLocations line so the hunk line counts stay valid */ return true; } @@ -102,10 +112,26 @@ public Resource extractEntityAsEmbeddedProperty(HTMLDocument fragment, BNode eve addDescription(fragment, event); addURLs(fragment, event); addCategories(fragment, event); - addLocation(fragment, event); + addLocations(fragment, event); + addAttendees(fragment,event); return event; } + private void addAttendees(HTMLDocument doc, Resource entry) throws ExtractionException { /* finds nested h-card attendee elements and maps each onto a fresh blank node through HCardExtractor. NOTE(review): entry is not referenced, so no triple links the event to the attendee node, confirm whether that is intended */ + List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + eventFields[9] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card"); + if (nodes.isEmpty()) + return; + HCardExtractorFactory factory = new HCardExtractorFactory(); + HCardExtractor extractor = factory.createExtractor(); /* one extractor is reused for every matched node */ + for (Node node : nodes) { + BNode attendee = valueFactory.createBNode(); + addURIProperty(attendee, RDF.TYPE, vEvent.attendee); + extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), attendee, + getCurrentExtractionResult()); + } + } + private void mapFieldWithProperty(HTMLDocument fragment, BNode recipe, String fieldClass, URI property) { HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass); @@ -204,9 +230,33 @@ private void addCategories(HTMLDocument fragment, BNode event) { } } - private void
addLocation(HTMLDocument fragment, BNode event) { - mapFieldWithProperty(fragment, event, Microformats2Prefixes.PROPERTY_PREFIX + - eventFields[8], vEvent.location); + private void addLocations(HTMLDocument doc, Resource entry) throws ExtractionException { /* builds one blank node per nested geo location element and copies its latitude, longitude and altitude values. NOTE(review): entry is not referenced, so the location node is left unattached, confirm whether a linking triple is intended */ + List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + eventFields[8] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "geo"); + if (nodes.isEmpty()) + return; + for (Node node : nodes) { + BNode location = valueFactory.createBNode(); + addURIProperty(location, RDF.TYPE, vEvent.location); + HTMLDocument fragment = new HTMLDocument(node); + for (String field : geoFields) { + HTMLDocument.TextField[] values = fragment.getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX+field); + for (HTMLDocument.TextField val : values) { + Node attribute=val.source().getAttributes().getNamedItem("title"); /* the title attribute is used when present, otherwise the field text */ + if (attribute==null){ + conditionallyAddStringProperty( + val.source(), + location, vVCARD.getProperty(field), val.value() + ); + }else{ + conditionallyAddStringProperty( + val.source(), + location, vVCARD.getProperty(field), attribute.getNodeValue() + ); + } + } + } + } } } diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HProductExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HProductExtractor.java index 0e93935b2..0673a1d0b 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HProductExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HProductExtractor.java @@ -24,10 +24,13 @@ import org.apache.any23.extractor.html.HTMLDocument; import org.apache.any23.vocab.HProduct; import org.openrdf.model.BNode; +import org.openrdf.model.Resource; import org.openrdf.model.URI; import org.openrdf.model.vocabulary.RDF; import org.w3c.dom.Node; +import java.util.List; + /** * Extractor for the h-product * microformat.
@@ -41,7 +44,7 @@ public class HProductExtractor extends EntityBasedMicroformatExtractor { private static final String[] productFields = { "name", "photo", - "brand", //toDo + "brand", "category", "description", "url", @@ -77,6 +80,7 @@ protected boolean extractEntity(Node node, ExtractionResult out) throws Extracti addURLs(fragment, product); addIdentifiers(fragment, product); addPrice(fragment, product); + addBrand(fragment,product); return true; } @@ -150,4 +154,19 @@ private void addPrice(HTMLDocument fragment, BNode product) { ); } } + + private void addBrand(HTMLDocument doc, Resource product) throws ExtractionException { + List nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + productFields[2] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card"); + if (nodes.isEmpty()) + return; + HCardExtractorFactory factory = new HCardExtractorFactory(); + HCardExtractor extractor = factory.createExtractor(); + for (Node node : nodes) { + BNode brand = valueFactory.createBNode(); + addURIProperty(brand, RDF.TYPE, vProduct.brand); + extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), brand, + getCurrentExtractionResult()); + } + } } diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java index 44b463dba..202621992 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java @@ -21,10 +21,7 @@ import org.apache.any23.extractor.ExtractionResult; import org.apache.any23.extractor.ExtractorDescription; import org.apache.any23.extractor.TagSoupExtractionResult; -import org.apache.any23.vocab.DOAC; -import org.apache.any23.vocab.FOAF; import org.apache.any23.vocab.HResume; -import org.apache.commons.lang.UnhandledException; import 
org.openrdf.model.BNode; import org.openrdf.model.Resource; import org.openrdf.model.vocabulary.RDF; @@ -47,11 +44,11 @@ public class HResumeExtractor extends EntityBasedMicroformatExtractor { private static final String[] resumeFields = { "name", "summary", - "contact",//toDo Hcard + "contact", "education", "experience", "skill", - "affiliation"//toDo Hcard + "affiliation" }; @Override @@ -94,6 +91,36 @@ protected boolean extractEntity(Node node, ExtractionResult out) throws Extracti return true; } + private void addContacts(HTMLDocument doc, Resource entry) throws ExtractionException { + List nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[2] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card"); + if (nodes.isEmpty()) + return; + HCardExtractorFactory factory = new HCardExtractorFactory(); + HCardExtractor extractor = factory.createExtractor(); + for (Node node : nodes) { + BNode contact = valueFactory.createBNode(); + addURIProperty(contact, RDF.TYPE, vResume.contact); + extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), contact, + getCurrentExtractionResult()); + } + } + + private void addAffiliations(HTMLDocument doc, Resource entry) throws ExtractionException { + List nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[6] + + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card"); + if (nodes.isEmpty()) + return; + HCardExtractorFactory factory = new HCardExtractorFactory(); + HCardExtractor extractor = factory.createExtractor(); + for (Node node : nodes) { + BNode affiliation = valueFactory.createBNode(); + addURIProperty(affiliation, RDF.TYPE, vResume.affiliation); + extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), affiliation, + getCurrentExtractionResult()); + } + } + private void addName(HTMLDocument doc, Resource person) { HTMLDocument.TextField name = doc.getSingularTextField( 
Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[0]); diff --git a/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties b/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties index c7eaf545e..2f9183da0 100644 --- a/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties +++ b/core/src/main/resources/org/apache/any23/prefixes/prefixes.properties @@ -33,6 +33,7 @@ wo=http://purl.org/ontology/wo/ skos=http://www.w3.org/2004/02/skos/core# hrecipe=http://sindice.com/hrecipe/ hevent=http://sindice.com/hevent/ +hcard=http://sindice.com/hcard/ hproduct=http://sindice.com/hproduct/ hitem=http://sindice.com/hitem/ hentry=http://sindice.com/hentry/ diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java index 69abb5561..e857105a7 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HAdrExtractorTest.java @@ -34,4 +34,4 @@ public void testModelNotEmpty() throws RepositoryException , RDFHandlerException assertModelNotEmpty(); assertStatementsSize(null, null, null, 11); } -} +} \ No newline at end of file diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HCardExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HCardExtractorTest.java new file mode 100644 index 000000000..9c9dc0686 --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HCardExtractorTest.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.html.microformats2; + +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.html.AbstractExtractorTestCase; +import org.junit.Test; +import org.openrdf.repository.RepositoryException; +import org.openrdf.rio.RDFHandlerException; + +public class HCardExtractorTest extends AbstractExtractorTestCase { + protected ExtractorFactory getExtractorFactory() { + return new HCardExtractorFactory(); + } + + @Test + public void testModelNotEmpty() throws RepositoryException , RDFHandlerException { + assertExtract("/microformats2/h-card/h-card-test.html"); + assertModelNotEmpty(); + assertStatementsSize(null, null, null, 9); + } +} \ No newline at end of file diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java index cc2974d19..96f3a6e8f 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEntryExtractorTest.java @@ -32,6 +32,6 @@ protected ExtractorFactory getExtractorFactory() { public void testModelNotEmpty() throws RepositoryException, RDFHandlerException { assertExtract("/microformats2/h-entry/h-entry-test.html"); assertModelNotEmpty(); - 
assertStatementsSize(null, null, null, 10); + assertStatementsSize(null, null, null, 20); } } \ No newline at end of file diff --git a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEventExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEventExtractorTest.java index 6c1390980..70b212e8d 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEventExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/microformats2/HEventExtractorTest.java @@ -32,6 +32,6 @@ protected ExtractorFactory getExtractorFactory() { public void testModelNotEmpty() throws RepositoryException, RDFHandlerException { assertExtract("/microformats2/h-event/h-event-test.html"); assertModelNotEmpty(); - assertStatementsSize(null, null, null, 9); + assertStatementsSize(null, null, null, 8); } } diff --git a/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java b/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java index c58e2a106..64fb4b7bf 100644 --- a/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java +++ b/core/src/test/java/org/apache/any23/vocab/RDFSchemaUtilsTest.java @@ -43,7 +43,7 @@ public class RDFSchemaUtilsTest { */ @Test public void testSerializeVocabulariesNTriples() { - serializeVocabularies(RDFFormat.NTRIPLES, 2012);//1920 + serializeVocabularies(RDFFormat.NTRIPLES, 2090); } /** @@ -53,7 +53,7 @@ public void testSerializeVocabulariesNTriples() { */ @Test public void testSerializeVocabulariesRDFXML() { - serializeVocabularies(RDFFormat.RDFXML, 5252); // Effective lines + separators. //4992 + serializeVocabularies(RDFFormat.RDFXML, 5453); // Effective lines + separators. 
} private void serializeVocabularies(RDFFormat format, int expectedLines) { diff --git a/test-resources/src/test/resources/microformats2/h-card/h-card-test.html b/test-resources/src/test/resources/microformats2/h-card/h-card-test.html new file mode 100644 index 000000000..f5ffb56a0 --- /dev/null +++ b/test-resources/src/test/resources/microformats2/h-card/h-card-test.html @@ -0,0 +1,45 @@ + + + + + + + +
+ + + + +

Joe Bloggs

+ +

+ +

Professional Profile

+ + + +

+ 17 Austerstræti + Reykjavík + Iceland +

+ + +
+ + + + diff --git a/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html b/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html index f3c8cf791..adc2a1a2c 100644 --- a/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html +++ b/test-resources/src/test/resources/microformats2/h-entry/h-entry-test.html @@ -23,6 +23,27 @@

March 25th, 2012

+
+ + + + +

Joe Bloggs

+ +

+ +

Professional Profile

+ + + +

+ 17 Austerstræti + Reykjavík + Iceland +

+ + +

Last week the microformats.org community celebrated its 7th birthday at a gathering hosted by Mozilla in From 6ad6d87585a566765b34a6fc5c226d54f55d56eb Mon Sep 17 00:00:00 2001 From: Nisala Date: Thu, 27 Aug 2015 01:06:57 +0530 Subject: [PATCH 09/10] non commited HCard embedded properties --- .../any23/extractor/html/microformats2/HEventExtractor.java | 2 +- .../any23/extractor/html/microformats2/HResumeExtractor.java | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java index 3f4d817de..67a476f83 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HEventExtractor.java @@ -96,7 +96,7 @@ protected boolean extractEntity(Node node, ExtractionResult out) throws Extracti addURLs(fragment, event); addCategories(fragment, event); addLocations(fragment, event); - + addAttendees(fragment,event); return true; } diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java index 202621992..06f4f3c06 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HResumeExtractor.java @@ -80,6 +80,8 @@ protected boolean extractEntity(Node node, ExtractionResult out) throws Extracti addExperiences(fragment, person); addEducations(fragment, person); + addAffiliations(fragment, person); + addContacts(fragment,person); final TagSoupExtractionResult tser = (TagSoupExtractionResult) out; tser.addResourceRoot( From 47571dda07a90c658c0fe202f3f3133b8c69dec9 Mon Sep 17 00:00:00 2001 From: Nisala Date: Thu, 27 Aug 2015 09:44:48 +0530 Subject: [PATCH 10/10] 
adding license headers for HItem extractors --- .../html/microformats2/HItemExtractor.java | 17 +++++++++++++++++ .../microformats2/HItemExtractorFactory.java | 17 +++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractor.java index 19ed75724..4478dc005 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractor.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.any23.extractor.html.microformats2; import org.apache.any23.extractor.ExtractionException; diff --git a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java index 14f20bdad..4064955c4 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java +++ b/core/src/main/java/org/apache/any23/extractor/html/microformats2/HItemExtractorFactory.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.any23.extractor.html.microformats2; import java.util.Arrays;