Skip to content

Commit

Permalink
NUTCH-2956 index-geoip: dependency upgrades and improvements
Browse files Browse the repository at this point in the history
- upgrade to geoip2 3.0.1
- exclude transitive dependencies (Jackson) provided as Nutch core deps
- read also GeoLite2-*.mmdb files
- review index field names in plugin and Nutch Solr schema:
  - fix typos in field names
  - remove unused fields from schema
  • Loading branch information
sebastian-nagel committed Aug 9, 2022
1 parent 01ab00b commit 8fc4f17
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 50 deletions.
3 changes: 2 additions & 1 deletion conf/nutch-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2112,7 +2112,8 @@ Add scoring-metadata to the list of active plugins
'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the
Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb,
GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the classpath and
available at runtime.
available at runtime. Alternatively, the GeoLite2 IP databases (GeoLite2-*.mmdb)
can also be used.
</description>
</property>

Expand Down
11 changes: 5 additions & 6 deletions src/plugin/index-geoip/ivy.xml
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,11 @@
</publications>

<dependencies>
<dependency org="com.maxmind.geoip2" name="geoip2" rev="2.12.0" >
<!-- Exclude due to classpath issues -->
<exclude org="org.apache.httpcomponents" name="httpclient" />
<exclude org="org.apache.httpcomponents" name="httpcore" />
<exclude org="commons-codec" name="commons-codec" />
<exclude org="commons-logging" name="commons-logging" />
<dependency org="com.maxmind.geoip2" name="geoip2" rev="3.0.1">
<!-- Exclude libs provided in Nutch core -->
<exclude org="com.fasterxml.jackson.core" name="jackson-annotations" />
<exclude org="com.fasterxml.jackson.core" name="jackson-databind" />
<exclude org="com.fasterxml.jackson.core" name="jackson-core" />
</dependency>
</dependencies>

Expand Down
7 changes: 2 additions & 5 deletions src/plugin/index-geoip/plugin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,8 @@
<library name="index-geoip.jar">
<export name="*"/>
</library>
<library name="geoip2-2.12.0.jar"/>
<library name="jackson-annotations-2.9.5.jar"/>
<library name="jackson-core-2.9.5.jar"/>
<library name="jackson-databind-2.9.5.jar"/>
<library name="maxmind-db-1.2.2.jar"/>
<library name="geoip2-3.0.1.jar"/>
<library name="maxmind-db-2.0.0.jar"/>
</runtime>

<requires>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,17 @@
package org.apache.nutch.indexer.geoip;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.InetAddress;
import java.net.UnknownHostException;

import org.apache.nutch.indexer.NutchDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.maxmind.geoip2.DatabaseReader;
import com.maxmind.geoip2.WebServiceClient;
import com.maxmind.geoip2.exception.AddressNotFoundException;
import com.maxmind.geoip2.exception.GeoIp2Exception;
import com.maxmind.geoip2.model.InsightsResponse;
import com.maxmind.geoip2.model.CityResponse;
Expand Down Expand Up @@ -54,28 +58,17 @@
*/
public class GeoIPDocumentCreator {

/**
* Add field to document but only if value isn't null
* @param doc the {@link NutchDocument} to augment
* @param name the name of the target field
* @param value the String value to associate with the target field
*/
public static void addIfNotNull(NutchDocument doc, String name,
String value) {
if (value != null) {
doc.add(name, value);
}
}
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());

/**
* Add field to document but only if value isn't null
* @param doc the {@link NutchDocument} to augment
* @param name the name of the target field
* @param value the {@link java.lang.Integer} value to
* associate with the target field
* @param value the value to associate with the target field (nothing is added if it is null)
*/
public static void addIfNotNull(NutchDocument doc, String name,
Integer value) {
Object value) {
if (value != null) {
doc.add(name, value);
}
Expand All @@ -87,7 +80,6 @@ public static NutchDocument createDocFromInsightsService(String serverIp,
addIfNotNull(doc, "ip", serverIp);
InsightsResponse response = client
.insights(InetAddress.getByName(serverIp));
// CityResponse response = client.city(InetAddress.getByName(serverIp));

City city = response.getCity();
addIfNotNull(doc, "cityName", city.getName()); // 'Minneapolis'
Expand All @@ -103,7 +95,7 @@ public static NutchDocument createDocFromInsightsService(String serverIp,
addIfNotNull(doc, "countryIsoCode", country.getIsoCode()); // 'US'
addIfNotNull(doc, "countryName", country.getName()); // 'United States'
addIfNotNull(doc, "countryConfidence", country.getConfidence()); // 99
addIfNotNull(doc, "countryGeoName", country.getGeoNameId());
addIfNotNull(doc, "countryGeoNameId", country.getGeoNameId());

Location location = response.getLocation();
addIfNotNull(doc, "latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
Expand All @@ -121,7 +113,7 @@ public static NutchDocument createDocFromInsightsService(String serverIp,

Subdivision subdivision = response.getMostSpecificSubdivision();
addIfNotNull(doc, "subDivName", subdivision.getName()); // 'Minnesota'
addIfNotNull(doc, "subDivIdoCode", subdivision.getIsoCode()); // 'MN'
addIfNotNull(doc, "subDivIsoCode", subdivision.getIsoCode()); // 'MN'
addIfNotNull(doc, "subDivConfidence", subdivision.getConfidence()); // 90
addIfNotNull(doc, "subDivGeoNameId", subdivision.getGeoNameId());

Expand Down Expand Up @@ -169,7 +161,13 @@ public static NutchDocument createDocFromIspDb(String serverIp,
public static NutchDocument createDocFromDomainDb(String serverIp,
NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
IOException, GeoIp2Exception {
DomainResponse response = reader.domain(InetAddress.getByName(serverIp));
DomainResponse response;
try {
response = reader.domain(InetAddress.getByName(serverIp));
} catch (AddressNotFoundException e) {
LOG.debug("IP address not found: {}", serverIp);
return doc;
}
addIfNotNull(doc, "ip", serverIp);
addIfNotNull(doc, "domain", response.getDomain());
return doc;
Expand All @@ -189,7 +187,14 @@ public static NutchDocument createDocFromCityDb(String serverIp,
NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
IOException, GeoIp2Exception {
addIfNotNull(doc, "ip", serverIp);
CityResponse response = reader.city(InetAddress.getByName(serverIp));

CityResponse response;
try {
response = reader.city(InetAddress.getByName(serverIp));
} catch (AddressNotFoundException e) {
LOG.debug("IP address not found: {}", serverIp);
return doc;
}

City city = response.getCity();
addIfNotNull(doc, "cityName", city.getName()); // 'Minneapolis'
Expand All @@ -206,7 +211,7 @@ public static NutchDocument createDocFromCityDb(String serverIp,
addIfNotNull(doc, "countryIsoCode", country.getIsoCode()); // 'US'
addIfNotNull(doc, "countryName", country.getName()); // 'United States'
addIfNotNull(doc, "countryConfidence", country.getConfidence()); // 99
addIfNotNull(doc, "countryGeoName", country.getGeoNameId());
addIfNotNull(doc, "countryGeoNameId", country.getGeoNameId());

Location location = response.getLocation();
addIfNotNull(doc, "latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
Expand All @@ -224,7 +229,7 @@ public static NutchDocument createDocFromCityDb(String serverIp,

Subdivision subdivision = response.getMostSpecificSubdivision();
addIfNotNull(doc, "subDivName", subdivision.getName()); // 'Minnesota'
addIfNotNull(doc, "subDivIdoCode", subdivision.getIsoCode()); // 'MN'
addIfNotNull(doc, "subDivIsoCode", subdivision.getIsoCode()); // 'MN'
addIfNotNull(doc, "subDivConfidence", subdivision.getConfidence()); // 90
addIfNotNull(doc, "subDivGeoNameId", subdivision.getGeoNameId());
return doc;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@
* 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the
* Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb,
* GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath
* and available at runtime. This can be achieved by adding it to $NUTCH_HOME/conf
* and available at runtime. This can be achieved by adding it to `$NUTCH_HOME/conf`.
* Alternatively, the GeoLite2 IP databases (GeoLite2-*.mmdb) can also be used.
* </description>
* </property>
*
Expand Down Expand Up @@ -152,24 +153,29 @@ public void setConf(Configuration conf) {
conf.getInt("index.geoip.userid", 12345),
conf.get("index.geoip.licensekey")).build();
} else {
String db = null;
String dbSuffix = null;
if (usage.equalsIgnoreCase("cityDatabase")) {
db = "GeoIP2-City.mmdb";
dbSuffix = "-City.mmdb";
} else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
db = "GeoIP2-Connection-Type.mmdb";
dbSuffix = "-Connection-Type.mmdb";
} else if (usage.equalsIgnoreCase("domainDatabase")) {
db = "GeoIP2-Domain.mmdb";
dbSuffix = "-Domain.mmdb";
} else if (usage.equalsIgnoreCase("ispDatabase")) {
db = "GeoIP2-ISP.mmdb";
dbSuffix = "-ISP.mmdb";
}
URL dbFileUrl = conf.getResource(db);
if (dbFileUrl == null) {
LOG.error("GeoDb file {} not found on classpath", db);
} else {
try {
buildDb(new File(dbFileUrl.getFile()));
} catch (Exception e) {
LOG.error("Failed to read geoDb file {}: ", db, e);
String[] dbPrefixes = {"GeoIP2", "GeoLite2"};
for (String dbPrefix : dbPrefixes) {
String db = dbPrefix + dbSuffix;
URL dbFileUrl = conf.getResource(db);
if (dbFileUrl == null) {
LOG.error("GeoDb file {} not found on classpath", db);
} else {
try {
LOG.info("Reading GeoDb file {}", db);
buildDb(new File(dbFileUrl.getFile()));
} catch (Exception e) {
LOG.error("Failed to read geoDb file {}: ", db, e);
}
}
}
}
Expand Down
3 changes: 1 addition & 2 deletions src/plugin/indexer-solr/schema.xml
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@
<field name="cityGeoNameId" type="int" stored="true" indexed="true" />
<field name="continentCode" type="string" stored="true" indexed="true" />
<field name="continentGeoNameId" type="int" stored="true" indexed="true" />
<field name="contentName" type="string" stored="true" indexed="true" />
<field name="continentName" type="string" stored="true" indexed="true" />
<field name="countryIsoCode" type="string" stored="true" indexed="true"/>
<field name="countryName" type="string" stored="true" indexed="true" />
<field name="countryConfidence" type="int" stored="true" indexed="true"/>
Expand All @@ -379,7 +379,6 @@
<field name="org" type="string" stored="true" indexed="true" />
<field name="userType" type="string" stored="true" indexed="true" />
<field name="isAnonProxy" type="boolean" stored="true" indexed="true" />
<field name="isSatelitteProv" type="boolean" stored="true" indexed="true" />
<field name="connType" type="string" stored="true" indexed="true" />
<field name="location" type="location" stored="true" indexed="true" />

Expand Down

0 comments on commit 8fc4f17

Please sign in to comment.