Skip to content
Permalink
Browse files
TENTACLES-9: Add patch to have a retry strategy during crawl phase (pa…
…tch)

git-svn-id: https://svn.apache.org/repos/asf/creadur/tentacles/trunk@1714849 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
ottlinger committed Nov 17, 2015
1 parent c095816 commit a02d3be723e8b2b153b0e761f72e8c16316f69a3
Showing 4 changed files with 131 additions and 125 deletions.
@@ -11,3 +11,4 @@ Tentacles 0.1 SNAPSHOT
* [TENTACLES-3] - provide help text if runtime parameters are missing
* [TENTACLES-2] - use proper escaping in Velocity template files.
* [TENTACLES-1] - allow filtering of directories in LicenseFilter
* [TENTACLES-9] - adding retry during crawl (thanks to Andy Gumbrecht)
@@ -20,7 +20,7 @@
<parent>
<groupId>org.apache</groupId>
<artifactId>apache</artifactId>
<version>14</version>
<version>17</version>
</parent>
<groupId>org.apache.creadur.tentacles</groupId>
<artifactId>apache-tentacles</artifactId>
@@ -74,7 +74,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<javaVersion>1.6</javaVersion>
<httpClientVersion>4.3.5</httpClientVersion>
<httpClientVersion>4.3.6</httpClientVersion>
<apacheRatVersion>0.11</apacheRatVersion>
</properties>
<issueManagement>
@@ -16,26 +16,12 @@
*/
package org.apache.creadur.tentacles;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.Flushable;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.log4j.Logger;

import java.io.*;
import java.net.URL;
import java.util.zip.ZipInputStream;

import org.apache.log4j.Logger;

/**
* @version $Rev$ $Date$
*/
@@ -117,12 +103,12 @@ public void close(final Closeable closeable) throws IOException {
((Flushable) closeable).flush();
}
} catch (final IOException e) {
LOG.error("Error when trying to flush before closing " + closeable, e);
LOG.trace("Error when trying to flush before closing " + closeable, e);
}
try {
closeable.close();
} catch (final IOException e) {
LOG.error("Error when trying to close " + closeable, e);
LOG.trace("Error when trying to close " + closeable, e);
}
}

@@ -16,152 +16,171 @@
*/
package org.apache.creadur.tentacles;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.LinkedHashSet;
import java.util.Set;

import org.apache.http.Header;
import org.apache.http.HttpHeaders;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpHead;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.log4j.Logger;
import org.codehaus.swizzle.stream.StreamLexer;

public class NexusClient {

private static final Logger log = Logger.getLogger(NexusClient.class);
private static final String SLASH = "/";
private static final String ONE_UP = "../";
private static final String USER_AGENT_CONTENTS = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101206 Ubuntu/10.10 (maverick) Firefox/3.6.13";
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.LinkedHashSet;
import java.util.Set;

private final CloseableHttpClient client;
private final FileSystem fileSystem;
private final IOSystem ioSystem;
public class NexusClient {

public NexusClient(final Platform platform) {
private static final Logger log = Logger.getLogger(NexusClient.class);
private static final String SLASH = "/";
private static final String ONE_UP = "../";
private static final String USER_AGENT_CONTENTS = "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101206 Ubuntu/10.10 (maverick) Firefox/3.6.13";

System.setProperty("http.keepAlive", "false");
System.setProperty("http.maxConnections", "50");
private final CloseableHttpClient client;
private final FileSystem fileSystem;
private final IOSystem ioSystem;
private final int retries;

this.client = HttpClientBuilder.create().disableContentCompression()
.build();
this.fileSystem = platform.getFileSystem();
this.ioSystem = platform.getIoSystem();
}
public NexusClient(final Platform platform) {

public File download(final URI uri, final File file) throws IOException {
if (file.exists()) {
System.setProperty("http.keepAlive", "false");
System.setProperty("http.maxConnections", "50");

final long length = getContentLength(uri);
this.retries = Integer.parseInt(System.getProperty("NexusClient.retries", "5"));

if (file.length() == length) {
log.info("Exists " + uri);
return file;
} else {
log.info("Incomplete " + uri);
}
}
this.client = HttpClientBuilder.create().disableContentCompression()
.build();
this.fileSystem = platform.getFileSystem();
this.ioSystem = platform.getIoSystem();
}

log.info("Download " + uri);
public File download(final URI uri, final File file) throws IOException {
if (file.exists()) {

final CloseableHttpResponse response = get(uri);
final long length = getContentLength(uri);

InputStream content = null;
try {
content = response.getEntity().getContent();
if (file.length() == length) {
log.info("Exists " + uri);
return file;
} else {
log.info("Incomplete " + uri);
}
}

this.fileSystem.mkparent(file);
log.info("Download " + uri);

this.ioSystem.copy(content, file);
} finally {
if (content != null) {
content.close();
}
final CloseableHttpResponse response = get(uri);

response.close();
}
InputStream content = null;
try {
content = response.getEntity().getContent();

return file;
}
this.fileSystem.mkparent(file);

private Long getContentLength(final URI uri) throws IOException {
final CloseableHttpResponse head = head(uri);
final Header[] headers = head.getHeaders(HttpHeaders.CONTENT_LENGTH);
this.ioSystem.copy(content, file);
} finally {
if (content != null) {
content.close();
}

if (headers != null && headers.length >= 1) {
return Long.valueOf(headers[0].getValue());
}
response.close();
}

head.close();
return file;
}

return Long.valueOf(-1);
}
private Long getContentLength(final URI uri) throws IOException {
final CloseableHttpResponse head = head(uri);
final Header[] headers = head.getHeaders(HttpHeaders.CONTENT_LENGTH);

/**
 * Executes a GET request for the given URI with the browser-like User-Agent header set.
 *
 * @param uri the address to fetch
 * @return the response returned by the HTTP client; the caller is responsible for closing it
 * @throws IOException if the client fails to execute the request
 */
private CloseableHttpResponse get(final URI uri) throws IOException {
    final HttpGet httpGet = new HttpGet(uri);
    httpGet.setHeader(HttpHeaders.USER_AGENT, USER_AGENT_CONTENTS);
    final CloseableHttpResponse result = this.client.execute(httpGet);
    return result;
}
if (headers != null && headers.length >= 1) {
return Long.valueOf(headers[0].getValue());
}

/**
 * Executes a HEAD request for the given URI with the browser-like User-Agent header set.
 *
 * @param uri the address to query
 * @return the response returned by the HTTP client; the caller is responsible for closing it
 * @throws IOException if the client fails to execute the request
 */
private CloseableHttpResponse head(final URI uri) throws IOException {
    final HttpHead httpHead = new HttpHead(uri);
    httpHead.setHeader(HttpHeaders.USER_AGENT, USER_AGENT_CONTENTS);
    final CloseableHttpResponse result = this.client.execute(httpHead);
    return result;
}
head.close();

public Set<URI> crawl(final URI index) throws IOException {
log.info("Crawl " + index);
final Set<URI> resources = new LinkedHashSet<URI>();
return (long) -1;
}

final CloseableHttpResponse response = get(index);
/**
 * Builds a GET request for the given URI and hands it, together with the
 * configured {@code retries} value, to the request-executing overload.
 *
 * @param uri the address to fetch
 * @return the response produced by the request-executing overload
 * @throws IOException if that overload fails
 */
private CloseableHttpResponse get(final URI uri) throws IOException {
    final HttpGet httpGet = new HttpGet(uri);
    return get(httpGet, this.retries);
}

final InputStream content = response.getEntity().getContent();
final StreamLexer lexer = new StreamLexer(content);
/**
 * Builds a HEAD request for the given URI and hands it, together with the
 * configured {@code retries} value, to the request-executing overload.
 *
 * @param uri the address to query
 * @return the response produced by the request-executing overload
 * @throws IOException if that overload fails
 */
private CloseableHttpResponse head(final URI uri) throws IOException {
    final HttpHead httpHead = new HttpHead(uri);
    return get(httpHead, this.retries);
}

final Set<URI> crawl = new LinkedHashSet<URI>();
/**
 * Executes the request, retrying after a 250 ms pause each time the client
 * fails with an {@link IOException}.
 *
 * @param request the request to execute; its User-Agent header is set before execution
 * @param tries   how many further attempts are allowed after a failed one;
 *                zero or less means the first failure is rethrown immediately
 * @return the response of the first successful execution
 * @throws IOException the most recent failure once no tries remain, or an
 *                     "Interrupted" exception if the thread is interrupted while pausing
 */
private CloseableHttpResponse get(final HttpUriRequest request, int tries) throws IOException {
    request.setHeader(HttpHeaders.USER_AGENT, USER_AGENT_CONTENTS);

    // Iterative rather than recursive: the previous form passed "tries--" to the
    // recursive call, which hands over the undecremented value, so the budget
    // never shrank and a permanently failing request recursed without bound.
    while (true) {
        try {
            return this.client.execute(request);
        } catch (final IOException e) {
            if (tries <= 0) {
                throw e;
            }
            tries--;

            try {
                Thread.sleep(250);
            } catch (final InterruptedException ie) {
                // Restore the interrupt flag so callers further up can observe it.
                Thread.currentThread().interrupt();
                throw new IOException("Interrupted", ie);
            }
        }
    }
}

// <a
// href="https://repository.apache.org/content/repositories/orgapacheopenejb-094/archetype-catalog.xml">archetype-catalog.xml</a>
while (lexer.readAndMark("<a ", "/a>")) {
public Set<URI> crawl(final URI index) throws IOException {
log.info("Crawl " + index);
final Set<URI> resources = new LinkedHashSet<URI>();

try {
final String link = lexer.peek("href=\"", "\"");
final String name = lexer.peek(">", "<");
final CloseableHttpResponse response = get(index);

final URI uri = index.resolve(link);
final InputStream content = response.getEntity().getContent();
final StreamLexer lexer = new StreamLexer(content);

if (name.equals(ONE_UP)) {
continue;
}
if (link.equals(ONE_UP)) {
continue;
}
final Set<URI> crawl = new LinkedHashSet<URI>();

if (name.endsWith(SLASH)) {
crawl.add(uri);
continue;
}
// <a
// href="https://repository.apache.org/content/repositories/orgapacheopenejb-094/archetype-catalog.xml">archetype-catalog.xml</a>
while (lexer.readAndMark("<a ", "/a>")) {

resources.add(uri);
try {
final String link = lexer.peek("href=\"", "\"");
final String name = lexer.peek(">", "<");

} finally {
lexer.unmark();
}
}
final URI uri = index.resolve(link);

if (name.equals(ONE_UP)) {
continue;
}
if (link.equals(ONE_UP)) {
continue;
}

content.close();
response.close();
if (name.endsWith(SLASH)) {
crawl.add(uri);
continue;
}

resources.add(uri);

for (final URI uri : crawl) {
resources.addAll(crawl(uri));
}
} finally {
lexer.unmark();
}
}

return resources;
}
content.close();
response.close();

for (final URI uri : crawl) {
resources.addAll(crawl(uri));
}

return resources;
}
}

0 comments on commit a02d3be

Please sign in to comment.