Permalink
Browse files

first pass at GBIF crawler

  • Loading branch information...
jhpoelen committed Aug 24, 2018
1 parent 979435d commit e21aee143c7c77e1d87577a214ffaf759f183e44
@@ -0,0 +1,3 @@
# IDE project settings directory
.idea/*
# Maven build output directory
target/*
# IDE module descriptor files
*.iml
@@ -0,0 +1,6 @@
#!/bin/bash
# Deploys a tagged build.
#
# When TRAVIS_TAG is set, runs a Maven deploy (tests skipped) using the
# credentials in .travis.maven.settings.xml, then copies the self-contained
# jar to ./elton.jar. Does nothing for untagged builds.
if [ -n "$TRAVIS_TAG" ]; then
  mvn -s .travis.maven.settings.xml -DskipTests clean deploy
  # Keep only digits and dots from the tag, e.g. "v0.1.2" becomes "0.1.2".
  VERSION=${TRAVIS_TAG//[^0-9.]/}
  # The assembly jar is named after the pom's artifactId ("preston"), not
  # "elton"; the destination name is kept as-is because the Travis release
  # configuration uploads elton.jar.
  cp "target/preston-${VERSION}-jar-with-dependencies.jar" elton.jar
fi
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>

<!--
  Maven settings used by the Travis deploy script.
  Supplies credentials for the server with id "globi-release"; the values are
  read from the S3_USER and S3_PASSWORD environment variables.
-->
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 http://maven.apache.org/xsd/settings-1.0.0.xsd">
    <servers>
        <server>
            <id>globi-release</id>
            <username>${env.S3_USER}</username>
            <password>${env.S3_PASSWORD}</password>
        </server>
    </servers>
</settings>
@@ -0,0 +1,5 @@
#!/bin/bash
# Sets the Maven project version from the git tag of a tagged build.
#
# When TRAVIS_TAG is set, strips everything except digits and dots from it
# (e.g. "v0.1.2" becomes "0.1.2") and applies the result with versions:set.
# Does nothing for untagged builds.
if [ -n "$TRAVIS_TAG" ]; then
  VERSION=${TRAVIS_TAG//[^0-9.]/}
  # A tag without any digits or dots would leave an empty version; fail
  # early with a clear message instead of handing Maven an empty value.
  if [ -z "$VERSION" ]; then
    echo "tag [$TRAVIS_TAG] contains no version number; cannot set project version" >&2
    exit 1
  fi
  mvn versions:set -DnewVersion="$VERSION"
fi
@@ -0,0 +1,22 @@
# Travis CI build settings: Java project built with Oracle JDK 8.
language: java
jdk:
- oraclejdk8
# The install step only prints the Maven version.
install: mvn --version
# Tagged builds: set the project version before the build and deploy after a
# successful one; both scripts do nothing when no tag is present.
before_script: './.travis.version_if_tag.sh'
after_success: './.travis.deploy_if_tag.sh'
# Keep the local Maven repository between builds.
cache:
  directories:
  - '$HOME/.m2'
env:
global:
- secure: LxBf+LzoRJTu50JcvegltG2M6DelYnN/sJRXwiSVRzXs2vcmrokb5I/LW2ORw5dzf3j5LtHnBeiNdcgD5xh2dqDkBncpQK1yVvPHa75oeagdWYUMk6KQBERzkHmuepiEy52sJPD88fgq1Es0QHqd54tZH4vbs0SS3QgQwupxrlNfRkCOMp1M9y4oa9hrMpKeRG572HP/BEsXp5in5++r1MOWy6fKp6mS0PXlM4xbHrXx4hPgcJ9KVVjV4tWT64eOt0goWpPwQvh5M1aguX5THs+xAfZRUDl8iA1RrSEjAkdOAN+qJ0l2r/8XvRVg35dfKaIg5NOiJ+S6tXPsvTxu5I1K2vB39FzLNFwkAinE7FRmy/XxvVDw0QosAoeCG2O5n6+RlXXCSi6LJ+NH06B0MxE6eUN+BUgi28aG/mjtEbwUrYiHf4nh0jsm7k7efG4UBGZXqPNc1p1Lj4IEMWUOpscJXP77jZPLz9vYIdEz9jxd830SfcTptNZOg3UmwsqRJCGJOGaUGvhDl/RtL9j4HhJDzUmT/dbZ4E9hMaAkfmZMz+KtrdVk/rZRW2+cXdcMIg0+dvIBAz1Gh7N8uiEjJ4U7AlltR+szdj/ZbAKq5eT4BKB6KXW4apiR5BqsLAgDfHpbq1Ot+S/SzPOx+SiEQjMsBe34l8vdO9HAhujmSvM=
- secure: b8wGdKqlOKUGuLjeHGKk7D5mgYHlqwOJAWHlgdsGpCpvz82YRSsYIoOLD8cNPiJ7Jt0I2f9cMY6q8DwDwYML48CGYlXy3UYvUzkMBRR6ZPbp9lJCiMscWGrLwNLhUF27uS+/ZlRzEyslhBLJH9EdCbfPDy+t9XSFR6q2b5kR+aNcyyVirU/0DMzIB7gNzillo1ZTZEP5KYu43Z5B8cVwD+tGKTrPweaNyhQkt24V/lnhgmp4AqRQ/YdtpqBG8NN2Zmi4bRZHhzgu4pBUETzPqtdPC0u5829scYjxv2BWLPbfffxbByUG5jXwZwwaPE35Ccw1NMZRfyaBv6GlOBYtHvXBb+0Azeo4GYUdTBsgI7xMClGJPoRKn9WIKDukFf35ZUdJwPjzfEQjNml7OMzzDdTzVLO4Uq6h035+gyB0AZ1p0/7EeEZl06ZU2FjUTtFQV+AOFH8twGUIFkzm84NOfwFhdzPIGGzKBf0O7UiybuggKds51Ts87qE6m1aekG2AWspTFPsfHfVmxM7tDQ9U+ZoR+bHIkKj4OHmXRx4Xfg3cUBsKI8CSu9t3QQXdfSW+EzdQ4a7n9PKXYXZRCG+14tgVGwf4IaE4AXqCC5352LwIUA2AtKOX4CCgUWrGSCn3oJMm7lBQOe08Au5JfbqgQZSFiL72GmmOKGyXYhwkEuE=
deploy:
provider: releases
api_key:
secure: u+jlC0amA3p1CDy1788SsOiwSHUafkc5WXtHXnvuR4ZOJxJsjZ/OIZFIF2/uH4XN+xYqRDO5SX98YOYlbU2FAyY97m/pEXr+uc+nK1l+0nuN/4NTkeAIouHm0bLq8WUUbfs3n1p92/opL08iieFuZKOiBrbwH7S/0+9J20ZIF/JCgnOJuGt5lX8YjyHJsLQq+RrKC949O7AAiv/8FvTfxtyZ8GjE9p2ChQ5pnVTzkPDef1TW1jmRc3baSZmQDnWgOPPSplx0qVO4GxzUiK+Ky9XsVt9ACUaqkldd8YUkQVKjtkPFzZE/PR7s2Ax6plaW7uGl4NEx7233wvi1+c5dPkMqL5Q7GjlcJxtLgiVia0tz8wj9lAzVvk4niHkrlIV9/kMRdi1bbuJ+exigd2kkyEYFhvMY3csnrftYp1ZnbSnw77+qw7cv6E/rS7L9zLOQ7HGs1lsAEfPtQHo0ET5WTG/UIjQSO6KWentTzC/OEdRwaOQvFTM3ZW+3UihcgwoL/BjWtQa43WTHmCGM5Jboqqtr8NG7WbzJhlw6uIe0BbmtKIcoqbfb4BIALQlHSNIsBfYsySu1eKpGYN4QHyBxJG1+Xs0xAKR8zTtbNdIxwoKTAsfhQ6U4LS6Kc76JWkgljaId66YBYB1IN42s2AI95fbP6HEEB/73z5su0LN45uM=
# Release upload settings: publish the jar staged by the deploy script, keep
# build output in place (skip_cleanup), and only act on tagged builds of the
# named repository.
# NOTE(review): the jar and repository are named "elton" while the pom's
# artifactId is "preston" - confirm these names are intended.
file: elton.jar
skip_cleanup: true
on:
  repo: globalbioticinteractions/elton
  tags: true
120 pom.xml
@@ -0,0 +1,120 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Project coordinates -->
    <groupId>org.globalbioticinteractions</groupId>
    <artifactId>preston</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>Preston commandline tool</name>
    <description>Discover, cache and check occurrence data</description>

    <dependencies>
        <!-- HTTP access -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.3.4</version>
        </dependency>
        <!-- General utilities -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.6</version>
        </dependency>
        <!-- JSON parsing -->
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.9.6</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>
        <!-- Command line argument parsing -->
        <dependency>
            <groupId>com.beust</groupId>
            <artifactId>jcommander</artifactId>
            <version>1.72</version>
        </dependency>
        <!-- Test only -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <extensions>
            <!-- Presumably provides the s3:// transport used by the release repository below; confirm -->
            <extension>
                <groupId>net.adamcin.org.kuali.maven.wagons</groupId>
                <artifactId>maven-s3-wagon</artifactId>
                <version>1.2.2</version>
            </extension>
        </extensions>
        <plugins>
            <!-- Used by the CI version script (versions:set) -->
            <plugin>
                <groupId>org.codehaus.mojo</groupId>
                <artifactId>versions-maven-plugin</artifactId>
                <version>2.2</version>
            </plugin>
            <!-- Compile for Java 8 -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <!-- Build a runnable jar that bundles all dependencies -->
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.4</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <manifest>
                            <mainClass>org.globalbioticinteractions.preston.Preston</mainClass>
                        </manifest>
                        <!-- Implementation-Version is what Preston.getVersion() reads at runtime -->
                        <manifestEntries>
                            <Implementation-Version>${project.version}</Implementation-Version>
                            <Project-Version>${project.version}</Project-Version>
                        </manifestEntries>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-uber-jar</id>
                        <!-- ensure that this doesn't run on travis -->
                        <!-- NOTE(review): this execution is bound to the package phase, which the
                             CI deploy (clean deploy) also runs; confirm the comment above still holds -->
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

    <!-- Where released dependencies are resolved from (releases only, no snapshots) -->
    <repositories>
        <repository>
            <id>globi-release-http</id>
            <name>Release Repository</name>
            <url>https://depot.globalbioticinteractions.org/release</url>
            <snapshots>
                <enabled>false</enabled>
            </snapshots>
            <releases>
                <enabled>true</enabled>
            </releases>
        </repository>
    </repositories>

    <!-- Where "mvn deploy" publishes releases; credentials are looked up by this id -->
    <distributionManagement>
        <repository>
            <id>globi-release</id>
            <name>AWS Release Repository</name>
            <url>s3://globi/release</url>
        </repository>
    </distributionManagement>

</project>
@@ -0,0 +1,7 @@
package org.globalbioticinteractions.preston;

import java.io.IOException;

/**
 * A source of datasets that reports what it finds to a {@link DatasetListener}.
 */
public interface Crawler {

    /**
     * Runs the crawl and hands each dataset found to the given listener.
     *
     * @param listener receives the datasets found during the crawl
     * @throws IOException if the crawl fails while reading its source
     */
    void crawl(DatasetListener listener) throws IOException;

}
@@ -0,0 +1,97 @@
package org.globalbioticinteractions.preston;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpResponse;
import org.apache.http.StatusLine;
import org.apache.http.client.HttpResponseException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;

/**
 * Crawls the GBIF dataset registry API page by page and reports every dataset
 * endpoint of a supported type to a {@link DatasetListener}.
 */
public class CrawlerGBIF implements Crawler {

    /** Maps GBIF endpoint type names to the dataset types this crawler reports. */
    public static final Map<String, DatasetType> TYPE_MAP = createTypeMap();

    private static final String DATASET_API = "https://api.gbif.org/v1/dataset";
    private static final int PAGE_SIZE = 50;
    private static final int TIMEOUT_MS = 300000;

    /** Builds the endpoint type lookup without creating an anonymous HashMap subclass. */
    private static Map<String, DatasetType> createTypeMap() {
        Map<String, DatasetType> typeMap = new HashMap<String, DatasetType>();
        typeMap.put("DWC_ARCHIVE", DatasetType.DARWIN_CORE_ARCHIVE);
        typeMap.put("EML", DatasetType.EML);
        return typeMap;
    }

    /**
     * Parses one page of the dataset listing and reports its supported endpoints.
     *
     * @param resourceAsStream JSON of a single result page
     * @param listener         receives one {@link Dataset} per supported endpoint
     * @return true if no further pages should be requested: the input was empty,
     *         "endOfRecords" is absent, or "endOfRecords" is true
     * @throws IOException              if the JSON cannot be read
     * @throws IllegalArgumentException if a supported endpoint carries a malformed
     *                                  URL or the dataset key is not a valid UUID
     */
    public static boolean parse(InputStream resourceAsStream, DatasetListener listener) throws IOException {
        JsonNode page = new ObjectMapper().readTree(resourceAsStream);
        if (page == null) {
            // Empty input yields no tree at all: nothing to report, nothing more to fetch.
            return true;
        }
        if (page.has("results")) {
            for (JsonNode result : page.get("results")) {
                if (result.has("key") && result.has("endpoints")) {
                    reportEndpoints(result, listener);
                }
            }
        }
        return !page.has("endOfRecords") || page.get("endOfRecords").asBoolean(true);
    }

    /** Reports each endpoint of one result entry whose type is listed in {@link #TYPE_MAP}. */
    private static void reportEndpoints(JsonNode result, DatasetListener listener) {
        String uuid = result.get("key").asText();
        for (JsonNode endpoint : result.get("endpoints")) {
            if (endpoint.has("url") && endpoint.has("type")) {
                DatasetType datasetType = TYPE_MAP.get(endpoint.get("type").asText());
                if (datasetType != null) {
                    // Only build the URI for endpoints that are reported, so a malformed
                    // URL on an ignored endpoint type cannot abort the crawl.
                    URI url = URI.create(endpoint.get("url").asText());
                    listener.onDataset(new Dataset(UUID.fromString(uuid), url, datasetType));
                }
            }
        }
    }

    /**
     * Requests pages of {@value #PAGE_SIZE} datasets until a page signals the end of the records.
     *
     * @param listener receives the datasets found
     * @throws IOException if a page cannot be retrieved or parsed
     */
    @Override
    public void crawl(DatasetListener listener) throws IOException {
        int offset = 0;
        boolean endOfRecords = false;
        while (!endOfRecords) {
            endOfRecords = crawlPage(listener, offset, PAGE_SIZE);
            offset += PAGE_SIZE;
        }
    }

    /**
     * Retrieves and parses a single page of the dataset listing.
     *
     * @param listener receives the datasets found on the page
     * @param offset   index of the first record to request
     * @param limit    maximum number of records to request
     * @return true if this was the last page
     * @throws IOException if the request fails, the server answers with a status of
     *                     300 or above, or the response has no body
     */
    protected boolean crawlPage(DatasetListener listener, int offset, int limit) throws IOException {
        RequestConfig config = RequestConfig.custom()
                .setSocketTimeout(TIMEOUT_MS)
                .setConnectTimeout(TIMEOUT_MS)
                .build();
        String userAgent = "globalbioticinteractions/" + Preston.getVersion() + " (https://globalbioticinteractions.org; mailto:info@globalbioticinteractions.org)";

        HttpGet get = new HttpGet(DATASET_API + "?offset=" + offset + "&limit=" + limit);
        get.setHeader("Accept", "application/json;charset=UTF-8");
        get.setHeader("Content-Type", "application/json;charset=UTF-8");
        get.setHeader(HttpHeaders.ACCEPT_ENCODING, "gzip");

        // try-with-resources closes the response before the client, also when parsing fails.
        try (CloseableHttpClient client = HttpClientBuilder.create()
                .setRetryHandler(new DefaultHttpRequestRetryHandler(3, true))
                .setUserAgent(userAgent)
                .setDefaultRequestConfig(config)
                .build();
             CloseableHttpResponse response = client.execute(get)) {

            StatusLine statusLine = response.getStatusLine();
            HttpEntity entity = response.getEntity();
            if (statusLine.getStatusCode() >= 300) {
                EntityUtils.consume(entity);
                throw new HttpResponseException(statusLine.getStatusCode(), statusLine.getReasonPhrase());
            }
            if (entity == null) {
                // A response without a body has no entity; report it instead of failing with a NullPointerException.
                throw new IOException("no content received from [" + get.getURI() + "]");
            }
            try (InputStream content = entity.getContent()) {
                return parse(content, listener);
            }
        }
    }
}
@@ -0,0 +1,28 @@
package org.globalbioticinteractions.preston;

import java.net.URI;
import java.util.UUID;

/**
 * Immutable description of a dataset: its identifier, the location it can be
 * retrieved from and the kind of resource found at that location.
 */
public class Dataset {

    private final UUID uuid;
    private final URI url;
    private final DatasetType type;

    /**
     * @param uuid identifier of the dataset
     * @param url  location of the dataset resource
     * @param type kind of resource found at the location
     */
    public Dataset(UUID uuid, URI url, DatasetType type) {
        this.uuid = uuid;
        this.url = url;
        this.type = type;
    }

    /** @return identifier of the dataset */
    public UUID getUuid() {
        return this.uuid;
    }

    /** @return location of the dataset resource */
    public URI getUrl() {
        return this.url;
    }

    /** @return kind of resource found at the location */
    public DatasetType getType() {
        return this.type;
    }

}
@@ -0,0 +1,5 @@
package org.globalbioticinteractions.preston;

/**
 * Callback that is notified of datasets as they are found.
 */
public interface DatasetListener {

    /**
     * Called once for every dataset found.
     *
     * @param dataset the dataset that was found
     */
    void onDataset(Dataset dataset);

}
@@ -0,0 +1,6 @@
package org.globalbioticinteractions.preston;

/**
 * Kinds of dataset resources that can be reported.
 */
public enum DatasetType {

    /** Reported for GBIF endpoints of type "DWC_ARCHIVE". */
    DARWIN_CORE_ARCHIVE,

    /** Reported for GBIF endpoints of type "EML". */
    EML

}
@@ -0,0 +1,31 @@
package org.globalbioticinteractions.preston;

/*
Preston - a commandline tool to help discover, access and cache source occurrence data archives.
*/

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.globalbioticinteractions.preston.cmd.CmdLine;

import static java.lang.System.exit;

/**
 * Command line entry point: hands the arguments to {@link CmdLine} and turns
 * the outcome into a process exit status.
 */
public class Preston {
    private static final Log LOG = LogFactory.getLog(Preston.class);

    /**
     * Runs the command line and exits with status 0 on success, or logs the
     * failure and exits with status 1.
     *
     * @param args command line arguments
     */
    public static void main(String[] args) {
        int status;
        try {
            CmdLine.run(args);
            status = 0;
        } catch (Throwable t) {
            // Report the cause; a bare exit status leaves the user with nothing to act on.
            LOG.error("failed to run command", t);
            status = 1;
        }
        exit(status);
    }

    /**
     * @return the implementation version of this class's package, or "dev" when
     *         no version (or no package information) is available
     */
    public static String getVersion() {
        // getPackage() may return null when the class loader created no Package object.
        Package pkg = Preston.class.getPackage();
        String version = pkg == null ? null : pkg.getImplementationVersion();
        return StringUtils.isBlank(version) ? "dev" : version;
    }

}
Oops, something went wrong.

0 comments on commit e21aee1

Please sign in to comment.