Skip to content

Commit

Permalink
Initial implementation of Connection
Browse files Browse the repository at this point in the history
  • Loading branch information
jhy committed Aug 16, 2010
1 parent 6e5e8a2 commit f737aa1
Show file tree
Hide file tree
Showing 6 changed files with 244 additions and 126 deletions.
13 changes: 10 additions & 3 deletions src/main/java/org/jsoup/Connection.java
Expand Up @@ -5,6 +5,7 @@
import java.net.URL;
import java.util.Map;
import java.util.Collection;
import java.io.IOException;

/**
* DRAFT interface to support HTTP connections.
Expand Down Expand Up @@ -36,11 +37,11 @@ public enum Method {

public Connection cookie(String name, String value);

/**
 * Executes this connection's request and returns the result as a {@link Document}.
 * NOTE(review): presumably issues an HTTP GET and parses the response — confirm against the implementation.
 * @return the resulting document
 * @throws IOException if the request cannot be completed; exact conditions are defined by the implementation
 */
public Document get() throws IOException;

/**
 * Executes this connection's request and returns the result as a {@link Document}.
 * NOTE(review): presumably issues an HTTP POST and parses the response — confirm against the implementation.
 * @return the resulting document
 * @throws IOException if the request cannot be completed; exact conditions are defined by the implementation
 */
public Document post() throws IOException;

/**
 * Executes this connection's request and returns the {@link Response}.
 * NOTE(review): presumably uses the method configured on the request — confirm against the implementation.
 * @return the response to the executed request
 * @throws IOException if the request cannot be completed; exact conditions are defined by the implementation
 */
public Response execute() throws IOException;

public Request request();

Expand Down Expand Up @@ -97,6 +98,12 @@ public interface Request extends Base<Request> {
public interface Response extends Base<Response> {
public int statusCode();

public String statusMessage();

public String charset();

public Document parse();

public String body();

public byte[] bodyAsBytes();
Expand Down
10 changes: 9 additions & 1 deletion src/main/java/org/jsoup/Jsoup.java
Expand Up @@ -4,6 +4,8 @@
import org.jsoup.parser.Parser;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Whitelist;
import org.jsoup.helper.DataUtil;
import org.jsoup.helper.HttpConnection;

import java.io.File;
import java.io.IOException;
Expand Down Expand Up @@ -54,7 +56,13 @@ public static Document parse(String html) {
the response stream.
*/
public static Document parse(URL url, int timeoutMillis) throws IOException {
    // Fetching goes through the Connection API; the direct DataUtil.load(URL, int) call is gone.
    Connection con = HttpConnection.connect(url);
    // NOTE(review): integer division truncates, so any timeout below 1000 ms is passed on as 0.
    // Confirm the unit Connection.timeout expects and whether 0 is acceptable there.
    con.timeout(timeoutMillis / 1000);
    return con.get();
}

/**
 * Obtains a {@link Connection} for the given URL by delegating to {@code HttpConnection.connect(String)}.
 * @param url the URL to connect to
 * @return the connection returned by {@code HttpConnection.connect}
 */
public static Connection connect(String url) {
    final Connection connection = HttpConnection.connect(url);
    return connection;
}

/**
Expand Down
@@ -1,12 +1,10 @@
package org.jsoup;
package org.jsoup.helper;

import org.jsoup.helper.Validate;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
Expand All @@ -16,9 +14,9 @@
* Internal static utilities for handling data.
*
*/
class DataUtil {
public class DataUtil {
private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=([^\\s;]*)");
private static final String defaultCharset = "UTF-8"; // used if not found in header or meta charset
static final String defaultCharset = "UTF-8"; // used if not found in header or meta charset
private static final int bufferSize = 0x20000; // ~130K.

/**
Expand All @@ -28,60 +26,17 @@ class DataUtil {
* @return
* @throws IOException
*/
static Document load(File in, String charsetName, String baseUri) throws IOException {
public static Document load(File in, String charsetName, String baseUri) throws IOException {
InputStream inStream = new FileInputStream(in);
Document doc = readInputStream(inStream, charsetName, baseUri);
inStream.close();
return doc;
}

/**
 Fetches a URL over http or https and parses the response body into a Document.
 @param url URL to fetch; its protocol must be http or https
 @param timeoutMillis connect and read timeout, in milliseconds
 @return the parsed document, using the URL's external form as the base URI
 @throws IOException if the response code is not 200 OK, the content type is missing or not text/*,
 or the connection or read fails
 */
static Document load(URL url, int timeoutMillis) throws IOException {
    String protocol = url.getProtocol();
    Validate.isTrue(protocol.equals("http") || protocol.equals("https"), "Only http & https protocols supported");

    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    conn.setInstanceFollowRedirects(true);
    conn.setConnectTimeout(timeoutMillis);
    conn.setReadTimeout(timeoutMillis);
    conn.connect();

    int res = conn.getResponseCode();
    if (res != HttpURLConnection.HTTP_OK)
        throw new IOException(res + " error loading URL " + url.toString());

    String contentType = conn.getContentType();
    if (contentType == null || !contentType.startsWith("text/"))
        throw new IOException(String.format("Unhandled content type \"%s\" on URL %s. Must be text/*",
            contentType, url.toString()));

    String charSet = getCharsetFromContentType(contentType); // may be null, parseByteData deals with it
    InputStream inStream = new BufferedInputStream(conn.getInputStream());
    try {
        ByteBuffer byteData = readToByteBuffer(inStream);
        return parseByteData(byteData, charSet, url.toExternalForm());
    } finally {
        // Close even when reading or parsing throws, so the response stream is not leaked.
        inStream.close();
    }
}

// reads bytes first into a buffer, then decodes with the appropriate charset. done this way to support
// switching the chartset midstream when a meta http-equiv tag defines the charset.
private static Document readInputStream(InputStream inStream, String charsetName, String baseUri) throws IOException {
byte[] buffer = new byte[bufferSize];
ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize);
int read;
while(true) {
read = inStream.read(buffer);
if (read == -1) break;
outStream.write(buffer, 0, read);
}
ByteBuffer byteData = ByteBuffer.wrap(outStream.toByteArray());

static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri) {
String docData;
Document doc = null;
if (charsetName == null) { // determine from meta. safe parse as UTF-8
Expand All @@ -107,7 +62,20 @@ private static Document readInputStream(InputStream inStream, String charsetName
}
return doc;
}


/**
 * Drains the given stream until end-of-stream and wraps the collected bytes in a ByteBuffer.
 * The stream is not closed here; that is left to the caller.
 * @param inStream stream to read from
 * @return a buffer wrapping all bytes read
 * @throws IOException if reading from the stream fails
 */
static ByteBuffer readToByteBuffer(InputStream inStream) throws IOException {
    final byte[] chunk = new byte[bufferSize];
    final ByteArrayOutputStream collected = new ByteArrayOutputStream(bufferSize);
    // read() returns -1 at end-of-stream; copy each chunk as it arrives.
    for (int count = inStream.read(chunk); count != -1; count = inStream.read(chunk)) {
        collected.write(chunk, 0, count);
    }
    return ByteBuffer.wrap(collected.toByteArray());
}

/**
* Parse out a charset from a content type header.
* @param contentType e.g. "text/html; charset=EUC-JP"
Expand Down

0 comments on commit f737aa1

Please sign in to comment.