Browse files

NUTCH-439 - Top Level Domains Indexing / Scoring. Contributed by Enis.

git-svn-id: https://svn.apache.org/repos/asf/lucene/nutch/trunk@568053 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information...
1 parent 5a30e60 commit 2865e0e16f3ff8d2f817a727098bf7c85ba4baa2 Tacettin Guney committed Aug 21, 2007
View
3 CHANGES.txt
@@ -117,6 +117,9 @@ Unreleased changes (1.0-dev)
39. NUTCH-536 - Reduce number of warnings in nutch core. (dogacan)
+40. NUTCH-439 - Top Level Domains Indexing / Scoring. Also adds
+ domain-related utilities. (Enis Soztutar via dogacan)
+
Release 0.9 - 2007-04-02
1. Changed log4j confiquration to log to stdout on commandline
View
4,354 conf/domain-suffixes.xml
4,354 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
View
130 conf/domain-suffixes.xsd
@@ -0,0 +1,130 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!--
+ Document : domain-suffixes.xsd
+ Author : Enis Soztutar - enis.soz.nutch@gmail.com
+ Description: This document is the schema for valid domain-suffixes
+ definitions. For successful parsing of domain-suffixes xml files,
+ the xml file should be validated with this xsd.
+ See : org.apache.nutch.util.domain.DomainSuffixesReader.java
+-->
+
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
+ targetNamespace="http://lucene.apache.org/nutch"
+ xmlns="http://lucene.apache.org/nutch"
+ elementFormDefault="qualified">
+
+ <xs:element name="domains">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element name="tlds">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element name="itlds">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element name="tld" maxOccurs="unbounded"
+ type="gtld" />
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+
+ <xs:element name="gtlds">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element name="tld" maxOccurs="unbounded"
+ type="gtld" />
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+
+ <xs:element name="cctlds">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element name="tld" maxOccurs="unbounded"
+ type="cctld" />
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+
+ <xs:element name="suffixes">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element name="suffix" maxOccurs="unbounded"
+ type="sldType" />
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+
+ <xs:complexType name="gtld">
+ <xs:sequence>
+ <xs:element name="status" minOccurs="0">
+ <xs:simpleType>
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="INFRASTRUCTURE" />
+ <xs:enumeration value="SPONSORED" />
+ <xs:enumeration value="UNSPONSORED" />
+ <xs:enumeration value="STARTUP" />
+ <xs:enumeration value="PROPOSED" />
+ <xs:enumeration value="DELETED" />
+ <xs:enumeration value="PSEUDO_DOMAIN" />
+ </xs:restriction>
+ </xs:simpleType>
+ </xs:element>
+ <xs:element name="boost" type="xs:float" minOccurs="0" />
+ <xs:element name="description" type="xs:string" minOccurs="0" />
+ </xs:sequence>
+ <xs:attribute name="domain" type="xs:string" />
+ </xs:complexType>
+
+ <xs:complexType name="cctld">
+ <xs:sequence>
+ <xs:element name="country" type="xs:string" />
+ <xs:element name="status" type="statusType" minOccurs="0" />
+ <xs:element name="boost" type="xs:float" minOccurs="0" />
+ <xs:element name="description" type="xs:string" minOccurs="0" />
+ </xs:sequence>
+ <xs:attribute name="domain" type="xs:string" />
+ </xs:complexType>
+
+ <xs:complexType name="sldType">
+ <xs:sequence>
+ <xs:element name="status" type="statusType" minOccurs="0" />
+ <xs:element name="boost" type="xs:float" minOccurs="0" />
+ <xs:element name="description" type="xs:string" minOccurs="0" />
+ </xs:sequence>
+ <xs:attribute name="domain" type="xs:string" />
+ </xs:complexType>
+
+ <xs:simpleType name="statusType">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="IN_USE" />
+ <xs:enumeration value="NOT_IN_USE" />
+ <xs:enumeration value="DELETED" />
+ </xs:restriction>
+ </xs:simpleType>
+
+</xs:schema>
View
160 src/java/org/apache/nutch/util/URLUtil.java
@@ -0,0 +1,160 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.regex.Pattern;
+
+import org.apache.nutch.util.domain.DomainSuffix;
+import org.apache.nutch.util.domain.DomainSuffixes;
+
+/** Utility class for URL analysis. */
+public class URLUtil {
+
+  /** Matches a dotted-quad IPv4 literal, e.g. <code>140.211.11.130</code>. */
+  private static final Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})");
+
+  /** Returns the hostname of the url with one trailing "." removed.
+   * Hostnames may be reported with the trailing root dot (as in
+   * <code>http://www.example.com./</code>); it is dropped here so that
+   * every suffix lookup in this class sees the same host.
+   */
+  private static String getHostWithoutTrailingDot(URL url) {
+    String host = url.getHost();
+    if (host.endsWith(".")) {
+      host = host.substring(0, host.length() - 1);
+    }
+    return host;
+  }
+
+  /** Returns the domain name of the url. The domain name of a url is
+   * the substring of the url's hostname, w/o subdomain names. As an
+   * example <br><code>
+   * getDomainName(new URL("http://lucene.apache.org/"))
+   * </code><br>
+   * will return <br><code> apache.org</code>
+   * <br>
+   * An IPv4 host is returned unchanged. If no part of the hostname is a
+   * registered domain suffix, the last segment of the hostname is returned.
+   * @param url the url to analyze
+   * @return the domain name of the url's host
+   */
+  public static String getDomainName(URL url) {
+    DomainSuffixes tlds = DomainSuffixes.getInstance();
+    String host = getHostWithoutTrailingDot(url);
+    if (IP_PATTERN.matcher(host).matches()) {
+      return host;
+    }
+
+    // Drop the left-most segment until the remainder after the first dot
+    // is a registered suffix; the candidate at that point is the domain name.
+    int index = 0;
+    String candidate = host;
+    while (index >= 0) {
+      index = candidate.indexOf('.');
+      // when there is no dot left (index == -1) this is the candidate itself
+      String subCandidate = candidate.substring(index + 1);
+      if (tlds.isDomainSuffix(subCandidate)) {
+        return candidate;
+      }
+      candidate = subCandidate;
+    }
+    return candidate;
+  }
+
+  /** Returns the domain name of the url. The domain name of a url is
+   * the substring of the url's hostname, w/o subdomain names. As an
+   * example <br><code>
+   * getDomainName("http://lucene.apache.org/")
+   * </code><br>
+   * will return <br><code> apache.org</code>
+   * @param url the url to analyze
+   * @return the domain name of the url's host
+   * @throws MalformedURLException if the string cannot be parsed as a URL
+   */
+  public static String getDomainName(String url) throws MalformedURLException {
+    return getDomainName(new URL(url));
+  }
+
+  /** Returns whether the given urls have the same domain name.
+   * As an example, <br>
+   * <code> isSameDomainName(new URL("http://lucene.apache.org")
+   * , new URL("http://people.apache.org/"))
+   * <br> will return true. </code>
+   * The comparison ignores case.
+   *
+   * @return true if the domain names are equal
+   */
+  public static boolean isSameDomainName(URL url1, URL url2) {
+    return getDomainName(url1).equalsIgnoreCase(getDomainName(url2));
+  }
+
+  /** Returns whether the given urls have the same domain name.
+   * As an example, <br>
+   * <code> isSameDomainName("http://lucene.apache.org"
+   * ,"http://people.apache.org/")
+   * <br> will return true. </code>
+   * @return true if the domain names are equal
+   * @throws MalformedURLException if either string cannot be parsed as a URL
+   */
+  public static boolean isSameDomainName(String url1, String url2)
+    throws MalformedURLException {
+    return isSameDomainName(new URL(url1), new URL(url2));
+  }
+
+  /** Returns the {@link DomainSuffix} corresponding to the
+   * last public part of the hostname. A trailing "." on the hostname is
+   * ignored, consistently with {@link #getDomainName(URL)}.
+   * @param url the url to analyze
+   * @return the longest registered suffix of the hostname, or
+   * <code>null</code> if the host is an IPv4 address or no part of the
+   * hostname is a registered suffix
+   */
+  public static DomainSuffix getDomainSuffix(URL url) {
+    DomainSuffixes tlds = DomainSuffixes.getInstance();
+    String host = getHostWithoutTrailingDot(url);
+    if (IP_PATTERN.matcher(host).matches()) {
+      return null;
+    }
+
+    // Same walk as getDomainName: longest remainder first, so that
+    // e.g. "co.uk" is preferred over "uk" when both are registered.
+    int index = 0;
+    String candidate = host;
+    while (index >= 0) {
+      index = candidate.indexOf('.');
+      String subCandidate = candidate.substring(index + 1);
+      DomainSuffix d = tlds.get(subCandidate);
+      if (d != null) {
+        return d;
+      }
+      candidate = subCandidate;
+    }
+    return null;
+  }
+
+  /** Returns the {@link DomainSuffix} corresponding to the
+   * last public part of the hostname
+   * @return the matching suffix, or <code>null</code> if there is none
+   * @throws MalformedURLException if the string cannot be parsed as a URL
+   */
+  public static DomainSuffix getDomainSuffix(String url) throws MalformedURLException {
+    return getDomainSuffix(new URL(url));
+  }
+
+  /** Partitions of the hostname of the url by "."
+   * @return the segments of the hostname, or a single-element array
+   * holding the whole host if it is an IPv4 address
+   */
+  public static String[] getHostSegments(URL url) {
+    String host = url.getHost();
+    //return whole hostname, if it is an ipv4
+    //TODO : handle ipv6
+    if (IP_PATTERN.matcher(host).matches()) {
+      return new String[] {host};
+    }
+    return host.split("\\.");
+  }
+
+  /** Partitions of the hostname of the url by "."
+   * @throws MalformedURLException if the string cannot be parsed as a URL
+   */
+  public static String[] getHostSegments(String url) throws MalformedURLException {
+    return getHostSegments(new URL(url));
+  }
+
+  /** For testing : prints the domain name of the url given as the only argument */
+  public static void main(String[] args){
+
+    if (args.length != 1) {
+      System.err.println("Usage : URLUtil <url>");
+      return ;
+    }
+
+    String url = args[0];
+    try {
+      System.out.println(URLUtil.getDomainName(new URL(url)));
+    }
+    catch (MalformedURLException ex) {
+      ex.printStackTrace();
+    }
+  }
+}
View
79 src/java/org/apache/nutch/util/domain/DomainSuffix.java
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+/**
+ * This class represents the last part of the host name,
+ * which is operated by authoritives, not individuals. This information
+ * is needed to find the domain name of a host. The domain name of a host
+ * is defined to be the last part before the domain suffix, w/o subdomain
+ * names. As an example the domain name of <br><code> http://lucene.apache.org/
+ * </code><br> is <code> apache.org</code>
+ * <br>
+ * This class holds three fields,
+ * <strong>domain</strong> field represents the suffix (such as "co.uk")
+ * <strong>boost</strong> is a float for boosting score of url's with this suffix
+ * <strong>status</strong> field represents domain's status
+ *
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ * @see TopLevelDomain
+ * @see domain-suffixes.xml
+ */
+public class DomainSuffix {
+
+ /**
+ * Enumeration of the status of the tld. Please see domain-suffixes.xml.
+ */
+ public enum Status { INFRASTRUCTURE, SPONSORED, UNSPONSORED
+ , STARTUP, PROPOSED, DELETED, PSEUDO_DOMAIN, DEPRECATED, IN_USE, NOT_IN_USE, REJECTED
+ };
+
+ private String domain;
+ private Status status;
+ private float boost;
+
+ public static final float DEFAULT_BOOST = 1.0f;
+ public static final Status DEFAULT_STATUS = Status.IN_USE;
+
+ public DomainSuffix(String domain, Status status, float boost) {
+ this.domain = domain;
+ this.status = status;
+ this.boost = boost;
+ }
+
+ public DomainSuffix(String domain) {
+ this(domain, DEFAULT_STATUS, DEFAULT_BOOST);
+ }
+
+ public String getDomain() {
+ return domain;
+ }
+
+ public Status getStatus() {
+ return status;
+ }
+
+ public float getBoost() {
+ return boost;
+ }
+
+ @Override
+ public String toString() {
+ return domain;
+ }
+}
View
81 src/java/org/apache/nutch/util/domain/DomainSuffixes.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+import java.io.InputStream;
+import java.util.HashMap;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.util.StringUtils;
+
+/**
+ * Storage class for <code>DomainSuffix</code> objects, keyed by the
+ * suffix string.
+ * Note: this class is singleton, the definitions are read once from the
+ * classpath resource <code>domain-suffixes.xml</code>.
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+public class DomainSuffixes {
+  private static final Log LOG = LogFactory.getLog(DomainSuffixes.class);
+
+  /** suffix string (such as "co.uk") to its definition */
+  private final HashMap<String, DomainSuffix> domains = new HashMap<String, DomainSuffix>();
+
+  private static DomainSuffixes instance;
+
+  /** private ctor, loads the definitions. Failures are logged and leave
+   * the instance with whatever entries were read up to that point.
+   */
+  private DomainSuffixes() {
+    String file = "domain-suffixes.xml";
+    InputStream input = this.getClass().getClassLoader().getResourceAsStream(file);
+    if (input == null) {
+      // getResourceAsStream signals a missing resource with null
+      LOG.warn("Could not find " + file + " in the classpath, no domain suffixes are loaded");
+      return;
+    }
+    try {
+      new DomainSuffixesReader().read(this, input);
+    }
+    catch (Exception ex) {
+      LOG.warn(StringUtils.stringifyException(ex));
+    }
+    finally {
+      // the reader does not close the stream, so release it here
+      try {
+        input.close();
+      }
+      catch (java.io.IOException ex) {
+        LOG.warn(StringUtils.stringifyException(ex));
+      }
+    }
+  }
+
+  /**
+   * Singleton instance, lazy instantiation. Synchronized so that
+   * concurrent first callers share one fully loaded instance.
+   * @return the shared instance
+   */
+  public static synchronized DomainSuffixes getInstance() {
+    if (instance == null) {
+      instance = new DomainSuffixes();
+    }
+    return instance;
+  }
+
+  /** Registers the suffix under its domain string, replacing any
+   * previous entry with the same string.
+   */
+  void addDomainSuffix(DomainSuffix tld) {
+    domains.put(tld.getDomain(), tld);
+  }
+
+  /** return whether the extension is a registered domain entry */
+  public boolean isDomainSuffix(String extension) {
+    return domains.containsKey(extension);
+  }
+
+  /**
+   * Return the {@link DomainSuffix} object for the extension, if
+   * extension is a top level domain returned object will be an
+   * instance of {@link TopLevelDomain}
+   * @param extension of the domain
+   * @return the registered entry, or <code>null</code> if there is none
+   */
+  public DomainSuffix get(String extension) {
+    return domains.get(extension);
+  }
+
+}
View
159 src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java
@@ -0,0 +1,159 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.nutch.util.domain.DomainSuffix.Status;
+import org.apache.nutch.util.domain.TopLevelDomain.Type;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+/**
+ * For parsing xml files containing domain suffix definitions.
+ * Parsed xml files should validate against
+ * <code>domain-suffixes.xsd</code>
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+class DomainSuffixesReader {
+
+  private static final Log LOG = LogFactory.getLog(DomainSuffixesReader.class);
+
+  /** Parses the xml from the stream and adds every definition found to
+   * <code>tldEntries</code>. The stream is not closed.
+   * @param tldEntries storage to add the parsed entries to
+   * @param input stream over the xml document
+   * @throws IOException if the document cannot be parsed, or does not
+   * have the expected structure
+   */
+  void read(DomainSuffixes tldEntries, InputStream input) throws IOException{
+    try {
+
+      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+      factory.setIgnoringComments(true);
+      DocumentBuilder builder = factory.newDocumentBuilder();
+      Document document = builder.parse(new InputSource(input));
+
+      Element root = document.getDocumentElement();
+
+      if (root == null || !root.getTagName().equals("domains")) {
+        throw new IOException("xml file is not valid");
+      }
+
+      // resolve all sections first, so that a structurally broken file
+      // is rejected with an IOException before anything is added
+      Element tlds = getRequiredElement(root, "tlds");
+      Element suffixes = getRequiredElement(root, "suffixes");
+      Element itlds = getRequiredElement(tlds, "itlds");
+      Element gtlds = getRequiredElement(tlds, "gtlds");
+      Element cctlds = getRequiredElement(tlds, "cctlds");
+
+      //read tlds
+      readITLDs(tldEntries, itlds);
+      readGTLDs(tldEntries, gtlds);
+      readCCTLDs(tldEntries, cctlds);
+
+      readSuffixes(tldEntries, suffixes);
+    }
+    catch (ParserConfigurationException ex) {
+      LOG.warn(StringUtils.stringifyException(ex));
+      throw toIOException(ex);
+    }
+    catch (SAXException ex) {
+      LOG.warn(StringUtils.stringifyException(ex));
+      throw toIOException(ex);
+    }
+  }
+
+  /** Wraps the exception in an IOException with the same message, keeping it as the cause. */
+  private static IOException toIOException(Exception ex) {
+    IOException ioe = new IOException(ex.getMessage());
+    ioe.initCause(ex);
+    return ioe;
+  }
+
+  /** Returns the first descendant of <code>parent</code> with the given tag name.
+   * @throws IOException if there is no such element
+   */
+  private static Element getRequiredElement(Element parent, String tagName) throws IOException {
+    NodeList list = parent.getElementsByTagName(tagName);
+    if (list == null || list.getLength() == 0) {
+      throw new IOException("xml file is not valid, missing element : " + tagName);
+    }
+    return (Element)list.item(0);
+  }
+
+  /** Returns the trimmed text of the first child of the first node in the
+   * list, or <code>null</code> if that node has no child or no text.
+   */
+  private static String readText(NodeList list) {
+    org.w3c.dom.Node text = list.item(0).getFirstChild();
+    if (text == null || text.getNodeValue() == null) {
+      return null;
+    }
+    return text.getNodeValue().trim();
+  }
+
+  /** Adds every <code>tld</code> under the element as an infrastructure tld */
+  void readITLDs(DomainSuffixes tldEntries, Element el) {
+    NodeList children = el.getElementsByTagName("tld");
+    for(int i=0;i<children.getLength();i++) {
+      tldEntries.addDomainSuffix(readGTLD((Element)children.item(i), Type.INFRASTRUCTURE));
+    }
+  }
+
+  /** Adds every <code>tld</code> under the element as a generic tld */
+  void readGTLDs(DomainSuffixes tldEntries, Element el) {
+    NodeList children = el.getElementsByTagName("tld");
+    for(int i=0;i<children.getLength();i++) {
+      tldEntries.addDomainSuffix(readGTLD((Element)children.item(i), Type.GENERIC));
+    }
+  }
+
+  /** Adds every <code>tld</code> under the element as a country code tld
+   * @throws IOException if a tld has no country element
+   */
+  void readCCTLDs(DomainSuffixes tldEntries, Element el) throws IOException {
+    NodeList children = el.getElementsByTagName("tld");
+    for(int i=0;i<children.getLength();i++) {
+      tldEntries.addDomainSuffix(readCCTLD((Element)children.item(i)));
+    }
+  }
+
+  /** Builds a tld of the given type from the element */
+  TopLevelDomain readGTLD(Element el, Type type) {
+    String domain = el.getAttribute("domain");
+    Status status = readStatus(el);
+    float boost = readBoost(el);
+    return new TopLevelDomain(domain, type, status, boost);
+  }
+
+  /** Builds a country code tld from the element
+   * @throws IOException if the element has no country element
+   */
+  TopLevelDomain readCCTLD(Element el) throws IOException {
+    String domain = el.getAttribute("domain");
+    Status status = readStatus(el);
+    float boost = readBoost(el);
+    String countryName = readCountryName(el);
+    return new TopLevelDomain(domain, status, boost, countryName);
+  }
+
+  /** read optional field status, the default is used if the field is
+   * missing or empty */
+  Status readStatus(Element el) {
+    NodeList list = el.getElementsByTagName("status");
+    if(list == null || list.getLength() == 0)
+      return DomainSuffix.DEFAULT_STATUS;
+    String value = readText(list);
+    if(value == null || value.length() == 0)
+      return DomainSuffix.DEFAULT_STATUS;
+    return Status.valueOf(value);
+  }
+
+  /** read optional field boost, the default is used if the field is
+   * missing or empty */
+  float readBoost(Element el) {
+    NodeList list = el.getElementsByTagName("boost");
+    if(list == null || list.getLength() == 0)
+      return DomainSuffix.DEFAULT_BOOST;
+    String value = readText(list);
+    if(value == null || value.length() == 0)
+      return DomainSuffix.DEFAULT_BOOST;
+    return Float.parseFloat(value);
+  }
+
+  /** read field countryname
+   * @return the text of the country element, <code>null</code> if the
+   * element is empty
+   * @throws IOException if there is no country element
+   */
+  String readCountryName(Element el) throws IOException {
+    NodeList list = el.getElementsByTagName("country");
+    if(list == null || list.getLength() == 0)
+      throw new IOException("Country name should be given");
+    // the value of an element node is always null, the name is in its text child
+    return readText(list);
+  }
+
+  /** Adds every <code>suffix</code> under the element */
+  void readSuffixes(DomainSuffixes tldEntries, Element el) {
+    NodeList children = el.getElementsByTagName("suffix");
+    for(int i=0;i<children.getLength();i++) {
+      tldEntries.addDomainSuffix(readSuffix((Element)children.item(i)));
+    }
+  }
+
+  /** Builds a domain suffix from the element */
+  DomainSuffix readSuffix(Element el) {
+    String domain = el.getAttribute("domain");
+    Status status = readStatus(el);
+    float boost = readBoost(el);
+    return new DomainSuffix(domain, status, boost);
+  }
+
+}
View
58 src/java/org/apache/nutch/util/domain/TopLevelDomain.java
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util.domain;
+
+/**
+ * (From wikipedia) A top-level domain (TLD) is the last part of an
+ * Internet domain name; that is, the letters which follow the final
+ * dot of any domain name. For example, in the domain name
+ * <code>www.website.com</code>, the top-level domain is <code>com</code>.
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ * @see http://www.iana.org/
+ * @see http://en.wikipedia.org/wiki/Top-level_domain
+ */
+public class TopLevelDomain extends DomainSuffix {
+
+ public enum Type { INFRASTRUCTURE, GENERIC, COUNTRY };
+
+ private Type type;
+ private String countryName = null;
+
+ public TopLevelDomain(String domain, Type type, Status status, float boost){
+ super(domain, status, boost);
+ this.type = type;
+ }
+
+ public TopLevelDomain(String domain, Status status, float boost, String countryName){
+ super(domain, status, boost);
+ this.type = Type.COUNTRY;
+ this.countryName = countryName;
+ }
+
+ public Type getType() {
+ return type;
+ }
+
+ /** Returns the country name if TLD is Country Code TLD
+ * @return country name or null
+ */
+ public String getCountryName(){
+ return countryName;
+ }
+
+}
View
16 src/java/org/apache/nutch/util/domain/package.html
@@ -0,0 +1,16 @@
+<html>
+<body>
+<h2> org.apache.nutch.util.domain</h2>
+
+<p>This package contains classes for domain analysis.</p>
+
+For more information, please refer to the following URLs:
+<ul>
+<li><a href="http://en.wikipedia.org/wiki/DNS">http://en.wikipedia.org/wiki/DNS</a></li>
+<li><a href="http://en.wikipedia.org/wiki/Top-level_domain">http://en.wikipedia.org/wiki/Top-level_domain</a></li>
+<li><a href="http://wiki.mozilla.org/TLD_List">http://wiki.mozilla.org/TLD_List</a></li>
+<li><a href="http://publicsuffix.org/">http://publicsuffix.org/</a></li>
+</ul>
+
+</body>
+</html>
View
2 src/plugin/build.xml
@@ -68,6 +68,7 @@
<ant dir="summary-basic" target="deploy"/>
<ant dir="subcollection" target="deploy"/>
<ant dir="summary-lucene" target="deploy"/>
+ <ant dir="tld" target="deploy"/>
<ant dir="urlfilter-automaton" target="deploy"/>
<ant dir="urlfilter-prefix" target="deploy"/>
<ant dir="urlfilter-regex" target="deploy"/>
@@ -158,6 +159,7 @@
<ant dir="subcollection" target="clean"/>
<ant dir="summary-basic" target="clean"/>
<ant dir="summary-lucene" target="clean"/>
+ <ant dir="tld" target="clean"/>
<ant dir="urlfilter-automaton" target="clean"/>
<ant dir="urlfilter-prefix" target="clean"/>
<ant dir="urlfilter-regex" target="clean"/>
View
22 src/plugin/tld/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="tld" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
View
51 src/plugin/tld/plugin.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="tld"
+ name="Top Level Domain Plugin"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+
+ <runtime>
+ <library name="tld.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.indexer.tld"
+ name="Top Level Domain Indexing Filter"
+ point="org.apache.nutch.indexer.IndexingFilter">
+ <implementation id="TLDIndexingFilter"
+ class="org.apache.nutch.indexer.tld.TLDIndexingFilter"/>
+ </extension>
+
+ <extension id="org.apache.nutch.scoring.tld"
+ name="Top Level Domain Scoring Filter"
+ point="org.apache.nutch.scoring.ScoringFilter">
+
+ <implementation id="org.apache.nutch.scoring.tld.TLDScoringFilter"
+ class="org.apache.nutch.scoring.tld.TLDScoringFilter" />
+ </extension>
+
+
+</plugin>
View
69 src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.tld;
+
+import java.net.URL;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.domain.DomainSuffix;
+
+/**
+ * Adds the domain suffix of the url to the index, in the stored (not
+ * indexed) field <code>tld</code>.
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+public class TLDIndexingFilter implements IndexingFilter {
+  public static final Log LOG = LogFactory.getLog(TLDIndexingFilter.class);
+
+  private Configuration conf;
+
+  /** Adds the <code>tld</code> field to the document if the url has a
+   * registered domain suffix. Problems are logged, never thrown, so
+   * that the document is indexed in any case.
+   * @return the document given, with the field possibly added
+   */
+  public Document filter(Document doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks)
+    throws IndexingException {
+
+    try {
+      URL url = new URL(urlText.toString());
+      DomainSuffix d = URLUtil.getDomainSuffix(url);
+
+      // no suffix is found for ip addresses and unregistered suffixes,
+      // such documents are left w/o the field
+      if (d != null) {
+        // store, no index
+        doc.add(new Field("tld", d.getDomain(), Field.Store.YES, Field.Index.NO));
+      }
+
+    }catch (Exception ex) {
+      LOG.warn(ex);
+    }
+
+    return doc;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+}
View
5 src/plugin/tld/src/java/org/apache/nutch/indexer/tld/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Top Level Domain Indexing plugin.</p><p></p>
+</body>
+</html>
View
113 src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
@@ -0,0 +1,113 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.scoring.tld;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Collection;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.document.Document;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.util.domain.DomainSuffix;
+import org.apache.nutch.util.domain.DomainSuffixes;
+
+
+/**
+ * Scoring filter to boost tlds. Only the indexer score is changed, all
+ * other scoring hooks leave their input as it is.
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+public class TLDScoringFilter implements ScoringFilter {
+
+  private Configuration conf;
+  private DomainSuffixes tldEntries;
+
+  public TLDScoringFilter() {
+    this.tldEntries = DomainSuffixes.getInstance();
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /** Multiplies the initial score with the boost of every registered
+   * suffix found in the <code>tld</code> field values of the document.
+   * @return the boosted score
+   */
+  public float indexerScore(Text url, Document doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+
+    float boost = 1.0f;
+    String[] values = doc.getValues("tld");
+
+    if (values != null) {
+      for (int i = 0; i < values.length; i++) {
+        DomainSuffix entry = tldEntries.get(values[i]);
+        if (entry == null) {
+          // not a registered suffix, no boost
+          continue;
+        }
+        boost *= entry.getBoost();
+      }
+    }
+    return initScore * boost;
+  }
+
+  /** No-op, returns <code>adjust</code> as given. */
+  public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl,
+      ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount,
+      int validCount) throws ScoringFilterException {
+    return adjust;
+  }
+
+  /** No-op, returns <code>adjust</code> as given. */
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
+      Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust,
+      int allCount) throws ScoringFilterException {
+    return adjust;
+  }
+
+  /** No-op, returns <code>initSort</code> as given. */
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+      throws ScoringFilterException {
+    return initSort;
+  }
+
+  /** No-op. */
+  public void initialScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+  }
+
+  /** No-op. */
+  public void injectedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+  }
+
+  /** No-op. */
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
+      throws ScoringFilterException {
+  }
+
+  /** No-op. */
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException {
+  }
+
+  /** No-op. */
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List inlinked) throws ScoringFilterException {
+  }
+
+}
View
5 src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Top Level Domain Scoring plugin.</p><p></p>
+</body>
+</html>
View
163 src/test/org/apache/nutch/util/TestURLUtil.java
@@ -0,0 +1,163 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.net.URL;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for {@link URLUtil}: domain name extraction, domain suffix
+ * lookup and host segmentation.
+ */
+public class TestURLUtil extends TestCase {
+
+  @Override
+  protected void setUp() throws Exception {
+    super.setUp();
+  }
+
+  /** Checks the result of URLUtil.getDomainName for the URL given as text. */
+  private static void assertDomainName(String expected, String spec)
+      throws Exception {
+    assertEquals(expected, URLUtil.getDomainName(new URL(spec)));
+  }
+
+  /** Checks the domain of the suffix URLUtil.getDomainSuffix reports. */
+  private static void assertSuffix(String expected, String spec)
+      throws Exception {
+    assertEquals(expected, URLUtil.getDomainSuffix(new URL(spec)).getDomain());
+  }
+
+  /** Checks that URLUtil.getDomainSuffix reports no suffix (null). */
+  private static void assertNoSuffix(String spec) throws Exception {
+    assertNull(URLUtil.getDomainSuffix(new URL(spec)));
+  }
+
+  /** Splits the host of the URL given as text via URLUtil.getHostSegments. */
+  private static String[] hostSegments(String spec) throws Exception {
+    return URLUtil.getHostSegments(new URL(spec));
+  }
+
+  /**
+   * Compares the leading elements of <code>actual</code> with
+   * <code>expected</code>, one by one and in order; the array length itself
+   * is not checked here.
+   */
+  private static void assertSegmentsStartWith(String[] actual,
+      String... expected) {
+    for (int i = 0; i < expected.length; i++) {
+      assertEquals(expected[i], actual[i]);
+    }
+  }
+
+  public void testGetDomainName() throws Exception {
+    assertDomainName("apache.org", "http://lucene.apache.org/nutch");
+    assertDomainName("wikipedia.org", "http://en.wikipedia.org/wiki/Java_coffee");
+    assertDomainName("140.211.11.130",
+        "http://140.211.11.130/foundation/contributing.html");
+    assertDomainName("example.co.uk", "http://www.example.co.uk:8080/index.html");
+    assertDomainName("com", "http://com");
+    assertDomainName("uk.com", "http://www.example.co.uk.com");
+
+    // "nn" is not a tld
+    assertDomainName("nn", "http://example.com.nn");
+
+    assertDomainName("", "http://");
+    assertDomainName("xyz", "http://www.edu.tr.xyz");
+    assertDomainName("example.c.se", "http://www.example.c.se");
+
+    // plc.co.im is listed as a domain suffix
+    assertDomainName("example.plc.co.im", "http://www.example.plc.co.im");
+
+    // 2000.hu is listed as a domain suffix
+    assertDomainName("example.2000.hu", "http://www.example.2000.hu");
+
+    // test non-ascii
+    assertDomainName("example.商業.tw", "http://www.example.商業.tw");
+  }
+
+  public void testGetDomainSuffix() throws Exception {
+    assertSuffix("org", "http://lucene.apache.org/nutch");
+    assertNoSuffix("http://140.211.11.130/foundation/contributing.html");
+    assertSuffix("co.uk", "http://www.example.co.uk:8080/index.html");
+    assertSuffix("com", "http://com");
+    assertSuffix("com", "http://www.example.co.uk.com");
+
+    // "nn" is not a tld
+    assertNoSuffix("http://example.com.nn");
+
+    assertNoSuffix("http://");
+    assertNoSuffix("http://www.edu.tr.xyz");
+    assertSuffix("edu.tr", "http://subdomain.example.edu.tr");
+    assertSuffix("presse.fr", "http://subdomain.example.presse.fr");
+    assertSuffix("tr", "http://subdomain.example.presse.tr");
+
+    // plc.co.im is listed as a domain suffix
+    assertSuffix("plc.co.im", "http://www.example.plc.co.im");
+
+    // 2000.hu is listed as a domain suffix
+    assertSuffix("2000.hu", "http://www.example.2000.hu");
+
+    // test non-ascii
+    assertSuffix("商業.tw", "http://www.example.商業.tw");
+  }
+
+  public void testGetHostSegments() throws Exception {
+    String[] parts;
+
+    parts = hostSegments("http://subdomain.example.edu.tr");
+    assertSegmentsStartWith(parts, "subdomain", "example", "edu", "tr");
+
+    // an empty host yields a single empty segment
+    parts = hostSegments("http://");
+    assertEquals(1, parts.length);
+    assertEquals("", parts[0]);
+
+    // an IP address is kept as one segment
+    parts = hostSegments("http://140.211.11.130/foundation/contributing.html");
+    assertEquals(1, parts.length);
+    assertEquals("140.211.11.130", parts[0]);
+
+    // test non-ascii
+    parts = hostSegments("http://www.example.商業.tw");
+    assertSegmentsStartWith(parts, "www", "example", "商業", "tw");
+  }
+
+}

0 comments on commit 2865e0e

Please sign in to comment.