Initial implementation of Connection

btd · Aug 16, 2010 · f737aa1 · f737aa1
1 parent 6e5e8a2
commit f737aa1
Show file tree

Hide file tree

Showing 6 changed files with 244 additions and 126 deletions.
diff --git a/src/main/java/org/jsoup/Connection.java b/src/main/java/org/jsoup/Connection.java
@@ -5,6 +5,7 @@
 import java.net.URL;
 import java.util.Map;
 import java.util.Collection;
+import java.io.IOException;
 
 /**
  * DRAFT interface to support HTTP connections.
@@ -36,11 +37,11 @@ public enum Method {
 
     public Connection cookie(String name, String value);
 
-    public Document get();
+    public Document get() throws IOException;
 
-    public Document post();
+    public Document post() throws IOException;
 
-    public Response execute();
+    public Response execute() throws IOException;
 
     public Request request();
 
@@ -97,6 +98,12 @@ public interface Request extends Base<Request> {
     public interface Response extends Base<Response> {
         public int statusCode();
 
+        public String statusMessage();
+
+        public String charset();
+
+        public Document parse();
+
         public String body();
 
         public byte[] bodyAsBytes();

diff --git a/src/main/java/org/jsoup/Jsoup.java b/src/main/java/org/jsoup/Jsoup.java
@@ -4,6 +4,8 @@
 import org.jsoup.parser.Parser;
 import org.jsoup.safety.Cleaner;
 import org.jsoup.safety.Whitelist;
+import org.jsoup.helper.DataUtil;
+import org.jsoup.helper.HttpConnection;
 
 import java.io.File;
 import java.io.IOException;
@@ -54,7 +56,13 @@ public static Document parse(String html) {
      the response stream.
      */
     public static Document parse(URL url, int timeoutMillis) throws IOException {
-        return DataUtil.load(url, timeoutMillis);
+        Connection con = HttpConnection.connect(url);
+        con.timeout(timeoutMillis / 1000);
+        return con.get();
+    }
+
+    public static Connection connect(String url) {
+        return HttpConnection.connect(url);
     }
 
     /**

diff --git a/src/main/java/org/jsoup/DataUtil.java → src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/DataUtil.java → src/main/java/org/jsoup/helper/DataUtil.java
@@ -1,12 +1,10 @@
-package org.jsoup;
+package org.jsoup.helper;
 
-import org.jsoup.helper.Validate;
+import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 
 import java.io.*;
-import java.net.HttpURLConnection;
-import java.net.URL;
 import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
 import java.util.regex.Matcher;
@@ -16,9 +14,9 @@
  * Internal static utilities for handling data.
  *
  */
-class DataUtil {
+public class DataUtil {
     private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=([^\\s;]*)");
-    private static final String defaultCharset = "UTF-8"; // used if not found in header or meta charset
+    static final String defaultCharset = "UTF-8"; // used if not found in header or meta charset
     private static final int bufferSize = 0x20000; // ~130K.
 
     /**
@@ -28,60 +26,17 @@ class DataUtil {
      * @return
      * @throws IOException
      */
-    static Document load(File in, String charsetName, String baseUri) throws IOException {
+    public static Document load(File in, String charsetName, String baseUri) throws IOException {
         InputStream inStream = new FileInputStream(in);
-        Document doc = readInputStream(inStream, charsetName, baseUri);
-        inStream.close();
-        return doc;
-    }
-
-    /**
-     Fetches a URL and gets as a string.
-     @param url
-     @param timeoutMillis
-     @return
-     @throws IOException
-     */
-    static Document load(URL url, int timeoutMillis) throws IOException {
-        String protocol = url.getProtocol();
-        Validate.isTrue(protocol.equals("http") || protocol.equals("https"), "Only http & https protocols supported");
-
-        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
-        conn.setInstanceFollowRedirects(true);
-        conn.setConnectTimeout(timeoutMillis);
-        conn.setReadTimeout(timeoutMillis);
-        conn.connect();
-
-        int res = conn.getResponseCode();
-        if (res != HttpURLConnection.HTTP_OK)
-            throw new IOException(res + " error loading URL " + url.toString());
-
-        String contentType = conn.getContentType();
-        if (contentType == null || !contentType.startsWith("text/"))
-            throw new IOException(String.format("Unhandled content type \"%s\" on URL %s. Must be text/*", 
-                    contentType, url.toString()));
-
-        InputStream inStream = new BufferedInputStream(conn.getInputStream());
-        String charSet = getCharsetFromContentType(contentType); // may be null, readInputStream deals with it
-
-        Document doc = readInputStream(inStream, charSet, url.toExternalForm());
+        ByteBuffer byteData = readToByteBuffer(inStream);
+        Document doc = parseByteData(byteData, charsetName, baseUri);
         inStream.close();
         return doc;
     }
 
     // reads bytes first into a buffer, then decodes with the appropriate charset. done this way to support
     // switching the charset midstream when a meta http-equiv tag defines the charset.
-    private static Document readInputStream(InputStream inStream, String charsetName, String baseUri) throws IOException {
-        byte[] buffer = new byte[bufferSize];
-        ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize);
-        int read;
-        while(true) {
-            read  = inStream.read(buffer);
-            if (read == -1) break;
-            outStream.write(buffer, 0, read);
-        }
-        ByteBuffer byteData = ByteBuffer.wrap(outStream.toByteArray());
-
+    static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri) {
         String docData;
         Document doc = null;
         if (charsetName == null) { // determine from meta. safe parse as UTF-8
@@ -107,7 +62,20 @@ private static Document readInputStream(InputStream inStream, String charsetName
         }
         return doc;
     }
-
+
+    static ByteBuffer readToByteBuffer(InputStream inStream) throws IOException {
+        byte[] buffer = new byte[bufferSize];
+        ByteArrayOutputStream outStream = new ByteArrayOutputStream(bufferSize);
+        int read;
+        while(true) {
+            read  = inStream.read(buffer);
+            if (read == -1) break;
+            outStream.write(buffer, 0, read);
+        }
+        ByteBuffer byteData = ByteBuffer.wrap(outStream.toByteArray());
+        return byteData;
+    }
+
     /**
      * Parse out a charset from a content type header.
      * @param contentType e.g. "text/html; charset=EUC-JP"