Permalink
Browse files

Initial implementation of Connection

  • Loading branch information...
1 parent 6e5e8a2 commit f737aa15dc3da5f8ca0a79898c456fe985452931 @jhy jhy committed Aug 16, 2010
View
13 src/main/java/org/jsoup/Connection.java
@@ -5,6 +5,7 @@
import java.net.URL;
import java.util.Map;
import java.util.Collection;
+import java.io.IOException;
/**
* DRAFT interface to support HTTP connections.
@@ -36,11 +37,11 @@
public Connection cookie(String name, String value);
- public Document get();
+ public Document get() throws IOException;
- public Document post();
+ public Document post() throws IOException;
- public Response execute();
+ public Response execute() throws IOException;
public Request request();
@@ -97,6 +98,12 @@
public interface Response extends Base<Response> {
public int statusCode();
+ public String statusMessage();
+
+ public String charset();
+
+ public Document parse();
+
public String body();
public byte[] bodyAsBytes();
View
10 src/main/java/org/jsoup/Jsoup.java
@@ -4,6 +4,8 @@
import org.jsoup.parser.Parser;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Whitelist;
+import org.jsoup.helper.DataUtil;
+import org.jsoup.helper.HttpConnection;
import java.io.File;
import java.io.IOException;
@@ -54,7 +56,13 @@ public static Document parse(String html) {
the response stream.
*/
public static Document parse(URL url, int timeoutMillis) throws IOException {
- return DataUtil.load(url, timeoutMillis);
+ Connection con = HttpConnection.connect(url);
+ // The connection timeout is expressed in whole seconds (it is multiplied by 1000 before being handed to
+ // HttpURLConnection), so round up: plain integer division would turn any value below 1000ms into 0,
+ // which HttpURLConnection interprets as an infinite timeout. A value of 0 is still passed through as 0.
+ int timeoutSeconds = timeoutMillis / 1000 + (timeoutMillis % 1000 > 0 ? 1 : 0);
+ con.timeout(timeoutSeconds);
+ return con.get();
+ }
+
+ public static Connection connect(String url) {
+ return HttpConnection.connect(url);
}
/**
View
76 src/main/java/org/jsoup/DataUtil.java → src/main/java/org/jsoup/helper/DataUtil.java
@@ -1,12 +1,10 @@
-package org.jsoup;
+package org.jsoup.helper;
-import org.jsoup.helper.Validate;
+import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.io.*;
-import java.net.HttpURLConnection;
-import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
@@ -16,9 +14,9 @@
* Internal static utilities for handling data.
*
*/
-class DataUtil {
+public class DataUtil {
private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=([^\\s;]*)");
- private static final String defaultCharset = "UTF-8"; // used if not found in header or meta charset
+ static final String defaultCharset = "UTF-8"; // used if not found in header or meta charset
private static final int bufferSize = 0x20000; // ~130K.
/**
@@ -28,60 +26,17 @@
* @return
* @throws IOException
*/
- static Document load(File in, String charsetName, String baseUri) throws IOException {
+ public static Document load(File in, String charsetName, String baseUri) throws IOException {
InputStream inStream = new FileInputStream(in);
- Document doc = readInputStream(inStream, charsetName, baseUri);
- inStream.close();
- return doc;
- }
-
- /**
- Fetches a URL and gets as a string.
- @param url
- @param timeoutMillis
- @return
- @throws IOException
- */
- static Document load(URL url, int timeoutMillis) throws IOException {
- String protocol = url.getProtocol();
- Validate.isTrue(protocol.equals("http") || protocol.equals("https"), "Only http & https protocols supported");
-
- HttpURLConnection conn = (HttpURLConnection) url.openConnection();
- conn.setInstanceFollowRedirects(true);
- conn.setConnectTimeout(timeoutMillis);
- conn.setReadTimeout(timeoutMillis);
- conn.connect();
-
- int res = conn.getResponseCode();
- if (res != HttpURLConnection.HTTP_OK)
- throw new IOException(res + " error loading URL " + url.toString());
-
- String contentType = conn.getContentType();
- if (contentType == null || !contentType.startsWith("text/"))
- throw new IOException(String.format("Unhandled content type \"%s\" on URL %s. Must be text/*",
- contentType, url.toString()));
-
- InputStream inStream = new BufferedInputStream(conn.getInputStream());
- String charSet = getCharsetFromContentType(contentType); // may be null, readInputStream deals with it
-
- Document doc = readInputStream(inStream, charSet, url.toExternalForm());
+ ByteBuffer byteData = readToByteBuffer(inStream);
+ Document doc = parseByteData(byteData, charsetName, baseUri);
inStream.close();
return doc;
}
// reads bytes first into a buffer, then decodes with the appropriate charset. done this way to support
// switching the charset midstream when a meta http-equiv tag defines the charset.
- private static Document readInputStream(InputStream inStream, String charsetName, String baseUri) throws IOException {
- byte[] buffer = new byte[bufferSize];
- ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize);
- int read;
- while(true) {
- read = inStream.read(buffer);
- if (read == -1) break;
- outStream.write(buffer, 0, read);
- }
- ByteBuffer byteData = ByteBuffer.wrap(outStream.toByteArray());
-
+ static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri) {
String docData;
Document doc = null;
if (charsetName == null) { // determine from meta. safe parse as UTF-8
@@ -107,7 +62,20 @@ private static Document readInputStream(InputStream inStream, String charsetName
}
return doc;
}
-
+
+ // Drains the stream into memory and returns the collected bytes wrapped in a ByteBuffer.
+ // The stream is left open; closing it is the caller's job.
+ static ByteBuffer readToByteBuffer(InputStream inStream) throws IOException {
+ ByteArrayOutputStream collected = new ByteArrayOutputStream(bufferSize);
+ byte[] chunk = new byte[bufferSize];
+ // read() returns -1 once the end of the stream has been reached
+ for (int count = inStream.read(chunk); count != -1; count = inStream.read(chunk)) {
+ collected.write(chunk, 0, count);
+ }
+ return ByteBuffer.wrap(collected.toByteArray());
+ }
+
/**
* Parse out a charset from a content type header.
* @param contentType e.g. "text/html; charset=EUC-JP"
View
170 src/main/java/org/jsoup/helper/HttpConnection.java
@@ -2,18 +2,36 @@
import org.jsoup.Connection;
import org.jsoup.nodes.Document;
+import org.jsoup.parser.TokenQueue;
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
-import java.util.LinkedHashMap;
-import java.util.Map;
-import java.util.Collection;
-import java.util.ArrayList;
-
-/**
- * DRAFT implementation of Connection.
- */
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/** DRAFT implementation of Connection. */
public class HttpConnection implements Connection {
+ private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=([^\\s;]*)");
+
+ public static Connection connect(String url) {
+ Connection con = new HttpConnection();
+ con.url(url);
+ return con;
+ }
+
+ public static Connection connect(URL url) {
+ Connection con = new HttpConnection();
+ con.url(url);
+ return con;
+ }
+
private Connection.Request req;
private Connection.Response res;
@@ -69,8 +87,8 @@ public Connection data(Map<String, String> data) {
}
public Connection data(String... keyvals) {
+ // keyvals is a flat list of name, value, name, value, ... - reject an unpaired trailing name up front
+ // rather than failing on keyvals[i + 1] after some pairs have already been added to the request
+ Validate.isTrue(keyvals.length % 2 == 0, "Must supply an even number of key value pairs");
- for (int i = 0; i < keyvals.length; i+=2) {
- req.data(KeyVal.create(keyvals[i], keyvals[i+1]));
+ for (int i = 0; i < keyvals.length; i += 2) {
+ req.data(KeyVal.create(keyvals[i], keyvals[i + 1]));
}
return this;
}
@@ -85,22 +103,20 @@ public Connection cookie(String name, String value) {
return this;
}
- public Document get() {
+ public Document get() throws IOException {
req.method(Method.GET);
execute();
- // todo: parse for doc
- return null;
+ return res.parse();
}
- public Document post() {
+ public Document post() throws IOException {
req.method(Method.POST);
execute();
- // todo: parse for doc
- return null;
+ return res.parse();
}
- public Connection.Response execute() {
- // todo: execute
+ public Connection.Response execute() throws IOException {
+ res = Response.execute(req);
return res;
}
@@ -124,10 +140,10 @@ public Connection response(Connection.Response response) {
@SuppressWarnings({"unchecked"})
private static abstract class Base<T extends Connection.Base> implements Connection.Base<T> {
- private URL url;
- private Method method;
- private Map<String, String> headers;
- private Map<String, String> cookies;
+ URL url;
+ Method method;
+ Map<String, String> headers;
+ Map<String, String> cookies;
private Base() {
headers = new LinkedHashMap<String, String>();
@@ -203,6 +219,7 @@ public T removeCookie(String name) {
private Request() {
data = new ArrayList<Connection.KeyVal>();
+ method = Connection.Method.GET;
}
public int timeout() {
@@ -226,17 +243,106 @@ public Request data(Connection.KeyVal keyval) {
public static class Response extends Base<Connection.Response> implements Connection.Response {
private int statusCode;
+ private String statusMessage;
+ private ByteBuffer byteData;
+ private String charset;
+
+ /**
+ * Executes the request over HTTP(S) and buffers the complete response body in memory.
+ *
+ * @param req request supplying the URL, the method and the timeout (in seconds)
+ * @return the response, with status, headers, cookies, body bytes and header charset populated
+ * @throws IOException if the status is not 200, the content type is missing or not text/*, or on I/O failure
+ */
+ static Response execute(Connection.Request req) throws IOException {
+ URL url = req.url();
+ String protocol = url.getProtocol();
+ Validate
+ .isTrue(protocol.equals("http") || protocol.equals("https"), "Only http & https protocols supported");
+
+ HttpURLConnection conn = (HttpURLConnection) url.openConnection();
+ conn.setRequestMethod(req.method().name());
+ conn.setInstanceFollowRedirects(true);
+ conn.setConnectTimeout(req.timeout() * 1000);
+ conn.setReadTimeout(req.timeout() * 1000);
+ // todo: handle get params not in url, and post params
+ conn.connect();
+
+ // todo: error handling options, allow user to get !200 without exception
+ int status = conn.getResponseCode();
+ if (status != HttpURLConnection.HTTP_OK)
+ throw new IOException(status + " error loading URL " + url.toString());
+ Response res = new Response();
+ res.setupFromConnection(conn);
+
+ // todo: move to parse
+ String contentType = conn.getContentType();
+ if (contentType == null || !contentType.startsWith("text/"))
+ throw new IOException(String.format("Unhandled content type \"%s\" on URL %s. Must be text/*",
+ contentType, url.toString()));
+
+ InputStream inStream = new BufferedInputStream(conn.getInputStream());
+ try {
+ res.byteData = DataUtil.readToByteBuffer(inStream);
+ } finally {
+ inStream.close(); // release the stream even when reading the body fails part-way
+ }
+ res.charset = getCharsetFromContentType(contentType); // may be null; parse() and body() deal with it
+
+ return res;
+ }
public int statusCode() {
return statusCode;
}
+ public String statusMessage() {
+ return statusMessage;
+ }
+
+ public String charset() {
+ return charset;
+ }
+
+ public Document parse() {
+ Document doc = DataUtil.parseByteData(byteData, charset, url.toExternalForm());
+ byteData.rewind();
+ charset = doc.outputSettings().charset().name(); // update charset from meta-equiv, possibly
+ return doc;
+ }
+
public String body() {
- return null;
+ // charset is set from the header on execute and from meta-equiv on parse; parse may not have happened
+ // yet, so fall back to the default charset when none is known
+ String charsetName = (charset != null) ? charset : DataUtil.defaultCharset;
+ String decoded = Charset.forName(charsetName).decode(byteData).toString();
+ byteData.rewind(); // decoding advanced the buffer position; reset it so the data can be read again
+ return decoded;
}
public byte[] bodyAsBytes() {
- return new byte[0];
+ return byteData.array();
+ }
+
+ // set up url, method, status, headers and cookies from the executed connection
+ private void setupFromConnection(HttpURLConnection conn) throws IOException {
+ method = Connection.Method.valueOf(conn.getRequestMethod());
+ url = conn.getURL();
+ statusCode = conn.getResponseCode();
+ statusMessage = conn.getResponseMessage();
+
+ Map<String, List<String>> resHeaders = conn.getHeaderFields();
+ for (Map.Entry<String, List<String>> entry : resHeaders.entrySet()) {
+ String name = entry.getKey();
+ if (name == null)
+ continue; // http/1.1 line
+
+ List<String> values = entry.getValue();
+ if (values == null || values.isEmpty())
+ continue; // nothing to record for this header
+
+ // HTTP header names are case-insensitive, so a server sending "set-cookie" must be handled too
+ if (name.equalsIgnoreCase("Set-Cookie")) {
+ for (String value : values) {
+ TokenQueue cd = new TokenQueue(value);
+ String cookieName = cd.chompTo("=").trim();
+ String cookieVal = cd.consumeTo(";").trim();
+ // ignores path, date, domain, secure et al. req'd?
+ cookie(cookieName, cookieVal);
+ }
+ } else { // only take the first instance of each header
+ header(name, values.get(0));
+ }
+ }
}
}
@@ -271,4 +377,20 @@ public String value() {
return value;
}
}
+
+ /**
+ * Parse out a charset from a content type header.
+ *
+ * @param contentType e.g. "text/html; charset=EUC-JP"
+ * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased.
+ */
+ private static String getCharsetFromContentType(String contentType) {
+ if (contentType == null) return null;
+
+ Matcher m = charsetPattern.matcher(contentType);
+ if (m.find()) {
+ // uppercase with a fixed locale: the default-locale variant maps 'i' to a dotted capital I under a
+ // Turkish locale, which would turn e.g. "iso-8859-1" into a name that is not a valid charset
+ return m.group(1).trim().toUpperCase(Locale.ENGLISH);
+ }
+ return null;
+ }
}
View
8 src/test/java/org/jsoup/DataUtilTest.java → ...t/java/org/jsoup/helper/DataUtilTest.java
@@ -1,11 +1,11 @@
-package org.jsoup;
+package org.jsoup.helper;
+import static org.junit.Assert.assertEquals;
import org.junit.Test;
-import static org.junit.Assert.*;
-
public class DataUtilTest {
- @Test public void testCharset() {
+ @Test
+ public void testCharset() {
assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html;charset=utf-8 "));
assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html; charset=UTF-8"));
assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=ISO-8859-1"));
View
93 src/test/java/org/jsoup/integration/ParseTest.java
@@ -3,86 +3,96 @@
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
-import org.jsoup.nodes.Entities;
import org.jsoup.select.Elements;
+import static org.junit.Assert.*;
import org.junit.Test;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
-import static org.junit.Assert.*;
-
/**
-
- Integration test: parses from real-world example HTML.
-
- @author Jonathan Hedley, jonathan@hedley.net */
+ * Integration test: parses from real-world example HTML.
+ *
+ * @author Jonathan Hedley, jonathan@hedley.net
+ */
public class ParseTest {
- @Test public void testSmhBizArticle() throws IOException {
+ @Test
+ public void testSmhBizArticle() throws IOException {
File in = getFile("/htmltests/smh-biz-article-1.html");
- Document doc = Jsoup.parse(in, "UTF-8", "http://www.smh.com.au/business/the-boards-next-fear-the-female-quota-20100106-lteq.html");
- assertEquals("The board’s next fear: the female quota", doc.title()); // note that the apos in the source is a literal ’ (8217), not escaped or '
+ Document doc = Jsoup.parse(in, "UTF-8",
+ "http://www.smh.com.au/business/the-boards-next-fear-the-female-quota-20100106-lteq.html");
+ assertEquals("The board’s next fear: the female quota",
+ doc.title()); // note that the apos in the source is a literal ’ (8217), not escaped or '
assertEquals("en", doc.select("html").attr("xml:lang"));
Elements articleBody = doc.select(".articleBody > *");
assertEquals(17, articleBody.size());
// todo: more tests!
-
+
}
-
- @Test public void testNewsHomepage() throws IOException {
+
+ @Test
+ public void testNewsHomepage() throws IOException {
File in = getFile("/htmltests/news-com-au-home.html");
Document doc = Jsoup.parse(in, "UTF-8", "http://www.news.com.au/");
assertEquals("News.com.au | News from Australia and around the world online | NewsComAu", doc.title());
assertEquals("Brace yourself for Metro meltdown", doc.select(".id1225817868581 h4").text().trim());
-
+
Element a = doc.select("a[href=/entertainment/horoscopes]").first();
assertEquals("/entertainment/horoscopes", a.attr("href"));
assertEquals("http://www.news.com.au/entertainment/horoscopes", a.attr("abs:href"));
-
+
Element hs = doc.select("a[href*=naughty-corners-are-a-bad-idea]").first();
- assertEquals("http://www.heraldsun.com.au/news/naughty-corners-are-a-bad-idea-for-kids/story-e6frf7jo-1225817899003", hs.attr("href"));
+ assertEquals(
+ "http://www.heraldsun.com.au/news/naughty-corners-are-a-bad-idea-for-kids/story-e6frf7jo-1225817899003",
+ hs.attr("href"));
assertEquals(hs.attr("href"), hs.attr("abs:href"));
}
-
- @Test public void testGoogleSearchIpod() throws IOException {
+
+ @Test
+ public void testGoogleSearchIpod() throws IOException {
File in = getFile("/htmltests/google-ipod.html");
Document doc = Jsoup.parse(in, "UTF-8", "http://www.google.com/search?hl=en&q=ipod&aq=f&oq=&aqi=g10");
assertEquals("ipod - Google Search", doc.title());
Elements results = doc.select("h3.r > a");
assertEquals(12, results.size());
- assertEquals("http://news.google.com/news?hl=en&q=ipod&um=1&ie=UTF-8&ei=uYlKS4SbBoGg6gPf-5XXCw&sa=X&oi=news_group&ct=title&resnum=1&ved=0CCIQsQQwAA",
- results.get(0).attr("href"));
+ assertEquals(
+ "http://news.google.com/news?hl=en&q=ipod&um=1&ie=UTF-8&ei=uYlKS4SbBoGg6gPf-5XXCw&sa=X&oi=news_group&ct=title&resnum=1&ved=0CCIQsQQwAA",
+ results.get(0).attr("href"));
assertEquals("http://www.apple.com/itunes/",
- results.get(1).attr("href"));
+ results.get(1).attr("href"));
}
-
- @Test public void testBinary() throws IOException {
+
+ @Test
+ public void testBinary() throws IOException {
File in = getFile("/htmltests/thumb.jpg");
Document doc = Jsoup.parse(in, "UTF-8");
// nothing useful, but did not blow up
assertTrue(doc.text().contains("gd-jpeg"));
}
-
- @Test public void testYahooJp() throws IOException {
+
+ @Test
+ public void testYahooJp() throws IOException {
File in = getFile("/htmltests/yahoo-jp.html");
Document doc = Jsoup.parse(in, "UTF-8", "http://www.yahoo.co.jp/index.html"); // http charset is utf-8.
assertEquals("Yahoo! JAPAN", doc.title());
Element a = doc.select("a[href=t/2322m2]").first();
- assertEquals("http://www.yahoo.co.jp/_ylh=X3oDMTB0NWxnaGxsBF9TAzIwNzcyOTYyNjUEdGlkAzEyBHRtcGwDZ2Ex/t/2322m2",
- a.attr("abs:href")); // session put into <base>
+ assertEquals("http://www.yahoo.co.jp/_ylh=X3oDMTB0NWxnaGxsBF9TAzIwNzcyOTYyNjUEdGlkAzEyBHRtcGwDZ2Ex/t/2322m2",
+ a.attr("abs:href")); // session put into <base>
assertEquals("全国、人気の駅ランキング", a.text());
}
-
- @Test public void testBaidu() throws IOException {
+
+ @Test
+ public void testBaidu() throws IOException {
// tests <meta http-equiv="Content-Type" content="text/html;charset=gb2312">
File in = getFile("/htmltests/baidu-cn-home.html");
- Document doc = Jsoup.parse(in, null, "http://www.baidu.com/"); // http charset is gb2312, but NOT specifying it, to test http-equiv parse
+ Document doc = Jsoup.parse(in, null,
+ "http://www.baidu.com/"); // http charset is gb2312, but NOT specifying it, to test http-equiv parse
Element submit = doc.select("#su").first();
assertEquals("百度一下", submit.attr("value"));
-
+
// test from attribute match
submit = doc.select("input[value=百度一下]").first();
assertEquals("su", submit.id());
@@ -94,38 +104,41 @@
assertEquals("\n<title>百度一下,你就知道 </title>", doc.select("title").outerHtml());
doc.outputSettings().charset("ascii");
- assertEquals("\n<title>&#30334;&#24230;&#19968;&#19979;&#65292;&#20320;&#23601;&#30693;&#36947; </title>", doc.select("title").outerHtml());
+ assertEquals("\n<title>&#30334;&#24230;&#19968;&#19979;&#65292;&#20320;&#23601;&#30693;&#36947; </title>",
+ doc.select("title").outerHtml());
}
-
- @Test public void testHtml5Charset() throws IOException {
+
+ @Test
+ public void testHtml5Charset() throws IOException {
// test that <meta charset="gb2312"> works
File in = getFile("/htmltests/meta-charset-1.html");
Document doc = Jsoup.parse(in, null, "http://example.com/"); //gb2312, has html5 <meta charset>
assertEquals("", doc.text());
assertEquals("GB2312", doc.outputSettings().charset().displayName());
-
+
// double check, no charset, falls back to utf8 which is incorrect
in = getFile("/htmltests/meta-charset-2.html"); //
doc = Jsoup.parse(in, null, "http://example.com"); // gb2312, no charset
assertEquals("UTF-8", doc.outputSettings().charset().displayName());
assertFalse("".equals(doc.text()));
-
+
// confirm fallback to utf8
in = getFile("/htmltests/meta-charset-3.html");
doc = Jsoup.parse(in, null, "http://example.com/"); // utf8, no charset
assertEquals("UTF-8", doc.outputSettings().charset().displayName());
assertEquals("", doc.text());
}
-
- @Test public void testNytArticle() throws IOException {
+
+ @Test
+ public void testNytArticle() throws IOException {
// has tags like <nyt_text>
File in = getFile("/htmltests/nyt-article-1.html");
Document doc = Jsoup.parse(in, null, "http://www.nytimes.com/2010/07/26/business/global/26bp.html?hp");
-
+
Element headline = doc.select("nyt_headline[version=1.0]").first();
assertEquals("As BP Lays Out Future, It Will Not Include Hayward", headline.text());
}
-
+
File getFile(String resourceName) {
try {
File file = new File(ParseTest.class.getResource(resourceName).toURI());

0 comments on commit f737aa1

Please sign in to comment.