From e73e9d9eb95edb048c27d1d030421babe17e644e Mon Sep 17 00:00:00 2001
From: Chris DeRose
Date: Sun, 5 Jul 2009 20:39:57 +0000
Subject: [PATCH] Added the code_tests tasks. Refactored scraper to
 accommodate the suggestions, broke each object into its own rb file in lib

---
 CHANGELOG             |   3 +
 Rakefile              |  43 ++-
 TODO.txt              |  54 ++++
 lib/geo_listings.rb   |  46 ++++
 lib/libcraigscrape.rb | 625 +-----------------------------------------
 lib/listings.rb       | 137 +++++++++
 lib/posting.rb        | 291 ++++++++++++++++++++
 lib/scraper.rb        | 174 ++++++++++++
 roodi.yml             |  15 +
 9 files changed, 765 insertions(+), 623 deletions(-)
 create mode 100644 TODO.txt
 create mode 100644 lib/geo_listings.rb
 create mode 100644 lib/listings.rb
 create mode 100644 lib/posting.rb
 create mode 100644 lib/scraper.rb
 create mode 100644 roodi.yml

diff --git a/CHANGELOG b/CHANGELOG
index 80a7300..2fab6a6 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,8 @@
 == Change Log
 
+=== Release 0.8.0 (TODO, 2009)
+- Added :code_tests to the rakefile
+
 === Release 0.7.0 (Jul 5, 2009)
 - A good bit of refactoring
 - Eager-loading in the Post object without the need of the full_post method
diff --git a/Rakefile b/Rakefile
index e486980..285e6c6 100644
--- a/Rakefile
+++ b/Rakefile
@@ -11,7 +11,7 @@ include FileUtils
 RbConfig = Config unless defined? RbConfig
 
 NAME = "libcraigscrape"
-VERS = ENV['VERSION'] || "0.7.0"
+VERS = ENV['VERSION'] || "0.8.0"
 PKG = "#{NAME}-#{VERS}"
 RDOC_OPTS = ['--quiet', '--title', 'The libcraigscrape Reference', '--main', 'README', '--inline-source']
 
@@ -77,3 +77,44 @@ task :uninstall => [:clean] do
   sh %{sudo gem uninstall #{NAME}}
 end
 
+require 'roodi'
+require 'roodi_task'
+
+namespace :code_tests do
+  desc "Analyze for code complexity"
+  task :flog do
+    require 'flog'
+
+    flog = Flog.new
+    flog.flog_files ['lib']
+    threshold = 105
+
+    bad_methods = flog.totals.select do |name, score|
+      score > threshold
+    end
+
+    bad_methods.sort { |a,b| a[1] <=> b[1] }.each do |name, score|
+      puts "%8.1f: %s" % [score, name]
+    end
+
+    puts "WARNING : #{bad_methods.size} methods have a flog complexity > #{threshold}" unless bad_methods.empty?
+  end
+
+  desc "Analyze for code duplication"
+  task :flay do
+    require 'flay'
+    threshold = 25
+    flay = Flay.new({:fuzzy => false, :verbose => false, :mass => threshold})
+    flay.process(*Flay.expand_dirs_to_files(['lib']))
+
+    flay.report
+
+    raise "#{flay.masses.size} chunks of code have a duplicate mass > #{threshold}" unless flay.masses.empty?
+  end
+
+  RoodiTask.new 'roodi', ['lib/*.rb'], 'roodi.yml'
+end
+
+desc "Run all code tests"
+task :code_tests => %w(code_tests:flog code_tests:flay code_tests:roodi)
+
diff --git a/TODO.txt b/TODO.txt
new file mode 100644
index 0000000..b79b3ba
--- /dev/null
+++ b/TODO.txt
@@ -0,0 +1,54 @@
+0.8.0 TODO:
+ * It'd be nice to let the yamls not need a full path to the .db, just use dir(__FILE__) as cwd for that
+ * We should have a listings.next_page which returns the next page - that would clean up our while loop a bit
+ * Change craigwatch's default non-regex search to be case insensitive
+ * Reduce memory consumption in craigwatch.
+ * I think we need to update the package to include the new rake tasks for flogging
+ * Add some rdoc text to the top of all the new lib files...
+
+Post-0.7:
+ * A debug_craigwatch, which shows the current progress... (pages fetched, objects in caches..)
+ * Some pages are legitimate 404's and we just can't parse them no matter how hard we try - what to do about this?
+ * Break the scraper objects into separate files...
+ * Maybe we should make an instance out of CraigScrape.new('us/fl/south florida') kind of thing..
+ * Finish testing out that geo location todo list
+   * Test out that array-parameter to the GeoListings constructor, make sure it actually works
+   * integrate it better into craigscrape
+ * It'd be nice to tell craigscrape 'us/ca' or 'us/ca/losangeles' as the scrape location
+   * and maybe have 'search text' and 'search section' type stuff where everything ends up scraping from there..
+   * We should really cache pages if we're going to do this - and I'd say to cache the geolisting pages first...
+ * Stats in the email: bytes transferred, generation time, urls scraped, posts scraped
+
+ * It'd also be nice to run an erb over the yaml file? No, we should take some steps to DRY out the code though.
+   * Particularly with respect to the searches which use the same regex for multiple searches.
+   * and particularly with those searches which are using the same listings urls to search for different things (IE 'cta' searches)
+
+Rechecks in a week (5.11.09 was last tried)
+
+ * This thread:
+   http://sfbay.craigslist.org/forums/?ID=29345737
+   Title: craigwatch does this - if you're a little handy
+   Message:
+     craigwatch and libcraigscrape are a tightly-coupled, ruby solution for (largely) unix-based systems.
+
+
+     Check it out here:
+
+     http://www.derosetechnologies.com/community/libcraigscrape
+ * http://www.craigslistwatch.com/
+ * Did this actually post?: http://digg.com/tech_news/Stop_wasting_money_use_Craigslist_Watch
+
+email:
+
+ http://www.dostuffright.com/Craigwatch
+ http://wareseeker.com/Network-Internet/Craigslist-All-City-Search-Tool-1.2.zip/8036652
+ http://www.killerstartups.com/Search/craigslittlebuddy-com-multiple-city-craigslist-search
+
+Scripts aggregators:
+ bigwebmaster.com
+ http://www.scripts.com/
+ http://www.scriptarchive.com/
+ http://www.needscripts.com/
+ http://www.scriptsearch.com/
+ http://www.sitescripts.com/PHP/
+ http://www.scriptsbank.com/
diff --git a/lib/geo_listings.rb b/lib/geo_listings.rb
new file mode 100644
index 0000000..b50bb7e
--- /dev/null
+++ b/lib/geo_listings.rb
@@ -0,0 +1,46 @@
+# TODO: file rdoc
+
+require 'scraper'
+
+class CraigScrape
+  # GeoListings represents a parsed Craigslist geo listing page. (i.e. {'http://geo.craigslist.org/iso/us'}[http://geo.craigslist.org/iso/us])
+  # These list all the craigslist sites in a given region.
+  class GeoListings < Scraper
+    LOCATION_NAME = /[ ]*\>[ ](.+)[ ]*/
+    GEOLISTING_BASE_URL = %{http://geo.craigslist.org/iso/}
+
+    # The geolisting constructor works like all other Scraper objects, in that it accepts a string 'url'.
+    # In addition though, here we'll accept an array like %w(us fl) which gets converted to
+    # {'http://geo.craigslist.org/iso/us/fl'}[http://geo.craigslist.org/iso/us/fl]
+    def initialize(init_via = nil)
+      super init_via.kind_of?(Array) ? "#{GEOLISTING_BASE_URL}#{init_via.join '/'}" : init_via
+
+      # Validate that required fields are present, at least - if we've downloaded it from a url
+      parse_error! unless location
+    end
+
+    # Returns the GeoLocation's full name
+    def location
+      unless @name
+        cursor = html % 'h3 > b > a:first-of-type'
+        cursor = cursor.next_node if cursor
+        @name = $1 if cursor and LOCATION_NAME.match he_decode(cursor.to_s)
+      end
+
+      @name
+    end
+
+    # Returns a hash of site name to urls in the current listing
+    def sites
+      unless @sites
+        @sites = {}
+        (html / 'div#list > a').each do |el_a|
+          site_name = he_decode strip_html(el_a.inner_html)
+          @sites[site_name] = el_a[:href]
+        end
+      end
+
+      @sites
+    end
+  end
+end
\ No newline at end of file
diff --git a/lib/libcraigscrape.rb b/lib/libcraigscrape.rb
index 7632ad8..0cced26 100644
--- a/lib/libcraigscrape.rb
+++ b/lib/libcraigscrape.rb
@@ -2,13 +2,10 @@
 #
 # All of libcraigscrape's objects and methods are loaded when you use require 'libcraigscrape' in your code.
 #
-require 'net/http'
-require 'zlib'
-
-require 'rubygems'
-require 'hpricot'
-require 'htmlentities'
-require 'activesupport'
+require 'listings'
+require 'posting'
+require 'geo_listings'
 
 # A base class encapsulating the libcraigscrape objects, and providing some utility methods.
 class CraigScrape
@@ -78,620 +75,4 @@ def self.most_recently_expired_time(month, day) #:nodoc:
     ret
   end
 
-  # Scraper is a general-pupose base class for all libcraigscrape Objects. Scraper facilitates all http-related
-  # functionality, and adds some useful helpers for dealing with eager-loading of http-objects and general html
-  # methods. It also contains the http-related cattr_accessors:
-  #
-  # *logger* - a Logger object to debug http notices too. Defaults to nil
-  #
-  # *retries_on_fetch_fail* - The number of times to retry a failed uri download. Defaults to 4
-  #
-  # *sleep_between_fetch_retries* - The amount of seconds to sleep, between successive attempts in the case of a failed download. Defaults to 15.
-  class Scraper
-    cattr_accessor :logger
-    cattr_accessor :sleep_between_fetch_retries
-    cattr_accessor :retries_on_fetch_fail
-
-    URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
-    HTML_TAG = /<\/?[^>]*>/
-
-    # Returns the full url that corresponds to this resource
-    attr_reader :url
-
-    # Set some defaults:
-    self.retries_on_fetch_fail = 4
-    self.sleep_between_fetch_retries = 15
-
-    class BadConstructionError < StandardError #:nodoc:
-    end
-
-    class ParseError < StandardError #:nodoc:
-    end
-
-    class BadUrlError < StandardError #:nodoc:
-    end
-
-    class FetchError < StandardError #:nodoc:
-    end
-
-    # Scraper Objects can be created from either a full URL (string), or a Hash.
-    # Currently, this initializer isn't intended to be called from libcraigslist API users, though
-    # if you know what you're doing - feel free to try this out.
-    #
-    # A (string) url can be passed in a 'http://' scheme or a 'file://' scheme.
-    #
-    # When constructing from a hash, the keys in the hash will be used to set the object's corresponding values.
-    # This is useful to create an object without actually making an html request, this is used to set-up an
-    # object before it eager-loads any values not already passed in by the constructor hash. Though optional, if
-    # you're going to be setting this object up for eager-loadnig, be sure to pass in a :url key in your hash,
-    # Otherwise this will fail to eager load.
-    def initialize(init_via = nil)
-      if init_via.nil?
-        # Do nothing - possibly not a great idea, but we'll allow it
-      elsif init_via.kind_of? String
-        @url = init_via
-      elsif init_via.kind_of? Hash
-        init_via.each_pair{|k,v| instance_variable_set "@#{k}", v}
-      else
-        raise BadConstructionError, ("Unrecognized parameter passed to %s.new %s}" % [self.class.to_s, init_via.class.inspect])
-      end
-    end
-
-    # Indicates whether the resource has yet been retrieved from its associated url.
-    # This is useful to distinguish whether the instance was instantiated for the purpose of an eager-load,
-    # but hasn't yet been fetched.
-    def downloaded?; !@html.nil?; end
-
-    # A URI object corresponding to this Scraped URL
-    def uri
-      @uri ||= URI.parse @url if @url
-      @uri
-    end
-
-    private
-
-    # Returns text with all html tags removed.
-    def strip_html(str)
-      str.gsub HTML_TAG, "" if str
-    end
-
-    # Easy way to fail noisily:
-    def parse_error!; raise ParseError, "Error while parsing %s:\n %s" % [self.class.to_s, html]; end
-
-    # Returns text with all html entities converted to respective ascii character.
-    def he_decode(text); self.class.he_decode text; end
-
-    # Returns text with all html entities converted to respective ascii character.
-    def self.he_decode(text); HTMLEntities.new.decode text; end
-
-    # Derives a full url, using the current object's url and the provided href
-    def url_from_href(href) #:nodoc:
-      scheme, host, path = $1, $2, $3 if URL_PARTS.match href
-
-      scheme = uri.scheme if scheme.nil? or scheme.empty? and uri.respond_to? :scheme
-
-      host = uri.host if host.nil? or host.empty? and uri.respond_to? :host
-
-      path = (
-        (/\/$/.match(uri.path)) ?
-        '%s%s' % [uri.path,path] :
-        '%s/%s' % [File.dirname(uri.path),path]
-      ) unless /^\//.match path
-
-      '%s://%s%s' % [scheme, host, path]
-    end
-
-    def fetch_uri(uri)
-
-      logger.info "Requesting: %s" % @url if logger
-
-      case uri.scheme
-        when 'file'
-          File.read uri.path
-        when /^http[s]?/
-          fetch_attempts = 0
-
-          begin
-            # This handles the redirects for us
-            resp, data = Net::HTTP.new( uri.host, uri.port).get uri.request_uri, nil
-
-            if resp.response.code == "200"
-              # Check for gzip, and decode:
-              data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
-
-              data
-            elsif resp.response['Location']
-              redirect_to = resp.response['Location']
-
-              fetch_uri URI.parse(url_from_href(redirect_to))
-            else
-              # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
-              error_description = 'Unable to fetch "%s" (%s)' % [ @url, resp.response.code ]
-
-              logger.info error_description if logger
-
-              raise FetchError, error_description
-            end
-          rescue FetchError,Timeout::Error,Errno::ECONNRESET => err
-            logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
-            logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
-
-            fetch_attempts += 1
-
-            if fetch_attempts <= self.retries_on_fetch_fail
-              sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
-              logger.info 'Retrying fetch ....' if logger
-              retry
-            else
-              raise err
-            end
-          end
-        else
-          raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
-      end
-    end
-
-    def html
-      @html ||= Hpricot.parse fetch_uri(uri) if uri
-      @html
-    end
-  end
-
-  # Posting represents a fully downloaded, and parsed, Craigslist post.
-  # This class is generally returned by the listing scrape methods, and
-  # contains the post summaries for a specific search url, or a general listing category
-  class Posting < Scraper
-
-    POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
-    LOCATION = /Location\:[ ]+(.+)/
-    HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
-    POSTING_ID = /PostingID\:[ ]+([\d]+)/
-    REPLY_TO = /(.+)/
-    PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
-    USERBODY_PARTS = /\<div id\=\"userbody\"\>(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>/m
-    IMAGE_SRC = /\<img[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
-
-    # This is really just for testing, in production use, uri.path is a better solution
-    attr_reader :href #:nodoc:
-
-    # Create a new Post via a url (String), or supplied parameters (Hash)
-    def initialize(*args)
-      super(*args)
-
-      # Validate that required fields are present, at least - if we've downloaded it from a url
-      parse_error! if args.first.kind_of? String and !flagged_for_removal? and !deleted_by_author? and [
-        contents,posting_id,post_time,header,title,full_section
-      ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
-    end
-
-
-    # String, The contents of the item's html body heading
-    def header
-      unless @header
-        h2 = html.at 'h2' if html
-        @header = he_decode h2.inner_html if h2
-      end
-
-      @header
-    end
-
-    # String, the item's title
-    def title
-      unless @title
-        title_tag = html.at 'title' if html
-        @title = he_decode title_tag.inner_html if title_tag
-        @title = nil if @title and @title.length == 0
-      end
-
-      @title
-    end
-
-    # Array, hierarchial representation of the posts section
-    def full_section
-      unless @full_section
-        @full_section = []
-
-        (html/"div[@class='bchead']//a").each do |a|
-          @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
-        end if html
-      end
-
-      @full_section
-    end
-
-    # String, represents the post's reply-to address, if listed
-    def reply_to
-      unless @reply_to
-        cursor = html.at 'hr' if html
-        cursor = cursor.next_sibling until cursor.nil? or cursor.name == 'a'
-        @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
-      end
-
-      @reply_to
-    end
-
-    # Time, reflects the full timestamp of the posting
-    def post_time
-      unless @post_time
-        cursor = html.at 'hr' if html
-        cursor = cursor.next_node until cursor.nil? or POST_DATE.match cursor.to_s
-        @post_time = Time.parse $1 if $1
-      end
-
-      @post_time
-    end
-
-    # Integer, Craigslist's unique posting id
-    def posting_id
-      unless @posting_id
-        cursor = (html/"#userbody").first if html
-        cursor = cursor.next_node until cursor.nil? or POSTING_ID.match cursor.to_s
-        @posting_id = $1.to_i if $1
-      end
-
-      @posting_id
-    end
-
-    # String, The full-html contents of the post
-    def contents
-      unless @contents
-        @contents = user_body if html
-        @contents = he_decode @contents.strip if @contents
-      end
-
-      @contents
-    end
-
-    # String, the location of the item, as best could be parsed
-    def location
-      if @location.nil? and craigslist_body and html
-        # Location (when explicitly defined):
-        cursor = craigslist_body.at 'ul' unless @location
-
-        # Apa section includes other things in the li's (cats/dogs ok fields)
-        cursor.children.each do |li|
-          if LOCATION.match li.inner_html
-            @location = he_decode($1) and break
-            break
-          end
-        end if cursor
-
-        # Real estate listings can work a little different for location:
-        unless @location
-          cursor = craigslist_body.at 'small'
-          cursor = cursor.previous_node until cursor.nil? or cursor.text?
-
-          @location = he_decode(cursor.to_s.strip) if cursor
-        end
-
-        # So, *sometimes* the location just ends up being in the header, I don't know why:
-        @location = $1 if @location.nil? and HEADER_LOCATION.match header
-      end
-
-      @location
-    end
-
-    # Array, urls of the post's images that are *not* hosted on craigslist
-    def images
-      # Keep in mind that when users post html to craigslist, they're often not posting wonderful html...
-      @images = (
-        contents ?
-        contents.scan(IMAGE_SRC).collect{ |a| a.find{|b| !b.nil? } } :
-        []
-      ) unless @images
-
-      @images
-    end
-
-    # Array, urls of the post's craigslist-hosted images
-    def pics
-      unless @pics
-        @pics = []
-
-        if html and craigslist_body
-          # Now let's find the craigslist hosted images:
-          img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
-
-          @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
-        end
-      end
-
-      @pics
-    end
-
-    # Returns true if this Post was parsed, and merely a 'Flagged for Removal' page
-    def flagged_for_removal?
-      @flagged_for_removal = (
-        system_post? and header_as_plain == "This posting has been flagged for removal"
-      ) if @flagged_for_removal.nil?
-
-      @flagged_for_removal
-    end
-
-    # Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
-    def deleted_by_author?
-      @deleted_by_author = (
-        system_post? and header_as_plain == "This posting has been deleted by its author."
-      ) if @deleted_by_author.nil?
-
-      @deleted_by_author
-    end
-
-
-    # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
-    # used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.
-    def post_date
-      @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
-
-      @post_date
-    end
-
-    # Returns The post label. The label would appear at first glance to be indentical to the header - but its not.
-    # The label is cited on the listings pages, and generally includes everything in the header - with the exception of the location.
-    # Sometimes there's additional information ie. '(map)' on rea listings included in the header, that aren't to be listed in the label
-    # This is also used as a bandwidth shortcut for the craigwatch program, and is a guaranteed identifier for the post, that won't result
-    # in a full page load from the post's url.
-    def label
-      unless @label or system_post?
-        @label = header
-
-        @label = $1 if location and /(.+?)[ ]*\(#{location}\).*?$/.match @label
-      end
-
-      @label
-    end
-
-    # Array, which image types are listed for the post.
-    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
-    def img_types
-      unless @img_types
-        @img_types = []
-
-        @img_types << :img if images.length > 0
-        @img_types << :pic if pics.length > 0
-      end
-
-      @img_types
-    end
-
-    # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
-    # this (sometimes/rarely) conserves bandwidth by pulling this field from the listing post-summary
-    def section
-      unless @section
-        @section = full_section.last if full_section
-      end
-
-      @section
-    end
-
-    # true if post summary has 'img(s)'. 'imgs' are different then pics, in that the resource is *not* hosted on craigslist's server.
-    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
-    def has_img?
-      img_types.include? :img
-    end
-
-    # true if post summary has 'pic(s)'. 'pics' are different then imgs, in that craigslist is hosting the resource on craigslist's servers
-    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
-    def has_pic?
-      img_types.include? :pic
-    end
-
-    # true if post summary has either the img or pic label
-    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
-    def has_pic_or_img?
-      img_types.length > 0
-    end
-
-    # Returns the best-guess of a price, judging by the label's contents. Price is available when pulled from the listing summary
-    # and can be safely used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.
-    def price
-      $1.tr('$','').to_f if label and PRICE.match label
-    end
-
-    # Returns the post contents with all html tags removed
-    def contents_as_plain
-      strip_html contents
-    end
-
-    # Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
-    # 'system_post' we may get tags in here
-    def header_as_plain
-      strip_html header
-    end
-
-    # Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original
-    # This returns true or false if that case applies
-    def system_post?
-      [contents,posting_id,post_time,title].all?{|f| f.nil?}
-    end
-
-    private
-
-    # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
-    # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so:
-    # We return this as a string, since it makes sense, and since its tough to say how hpricot might mangle this if the html is whack
-    def user_body
-      $1 if USERBODY_PARTS.match html.to_s
-    end
-
-    # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
-    # So - we'll return it as an Hpricot object.
-    def craigslist_body
-      Hpricot.parse $2 if USERBODY_PARTS.match html.to_s
-    end
-
-  end
-
-  # Listings represents a parsed Craigslist listing page and is generally returned by CraigScrape.scrape_listing
-  class Listings < Scraper
-    LABEL = /^(.+?)[ ]*\-$/
-    LOCATION = /^[ ]*\((.*?)\)$/
-    IMG_TYPE = /^[ ]*(.+)[ ]*$/
-    HEADER_DATE = /^[ ]*[^ ]+[ ]+([^ ]+)[ ]+([^ ]+)[ ]*$/
-    SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
-    NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
-
-    # Array, PostSummary objects found in the listing
-    def posts
-      unless @posts
-        current_date = nil
-        @posts = []
-
-        post_tags = html.get_elements_by_tag_name('p','h4')
-
-        # The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our PostSummary output:
-        post_tags.pop if (
-          post_tags.length > 0 and
-          post_tags.last.at('a') and
-          NEXT_PAGE_LINK.match post_tags.last.at('a').inner_html
-        )
-
-        # Now we iterate though the listings:
-        post_tags.each do |el|
-          case el.name
-            when 'p'
-              post_summary = self.class.parse_summary el, current_date
-
-              # Validate that required fields are present:
-              parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
-
-              post_summary[:url] = url_from_href post_summary[:href]
-
-              @posts << CraigScrape::Posting.new(post_summary)
-            when 'h4'
-              # Let's make sense of the h4 tag, and then read all the p tags below it
-              if HEADER_DATE.match he_decode(el.inner_html)
-                # Generally, the H4 tags contain valid dates. When they do - this is easy:
-                current_date = CraigScrape.most_recently_expired_time $1, $2
-              elsif html.at('h4:last-of-type') == el
-                # There's a specific bug, where these nonsense h4's just appear without anything relevant inside them.
-                # They're safe to ignore if they're not the last h4 on the page. I fthey're the last h4 on the page,
-                # we need to pull up the full post in order to accurate tell the date.
-                # Setting this to nil will achieve the eager-load.
-                current_date = nil
-              end
-          end
-        end
-      end
-
-      @posts
-    end
-
-    # String, URL Path href-fragment of the next page link
-    def next_page_href
-      unless @next_page_href
-        cursor = html.at 'p:last-of-type'
-
-        cursor = cursor.at 'a' if cursor
-
-        # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
-        next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
-
-        # Search listings put their next page in a link towards the top
-        next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == 'Next>>' } unless next_link
-
-        # Some search pages have a bug, whereby a 'next page' link isn't displayed,
-        # even though we can see that theres another page listed in the page-number links block at the top
-        # and bottom of the listing page
-        unless next_link
-          cursor = html % 'div.sh:first-of-type > b:last-of-type'
-
-          # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
-          # We're looking good.
-          next_link = cursor.next_sibling if cursor and /^[\d]+$/.match cursor.inner_html
-        end
-
-        # We have an anchor tag - so - let's assign the href:
-        @next_page_href = next_link[:href] if next_link
-      end
-
-      @next_page_href
-    end
-
-    # String, Full URL Path of the 'next page' link
-    def next_page_url
-      (next_page_href) ? url_from_href(next_page_href) : nil
-    end
-
-    # Takes a paragraph element and returns a mostly-parsed Posting
-    # We separate this from the rest of the parsing both for readability and ease of testing
-    def self.parse_summary(p_element, date = nil) #:nodoc:
-      ret = {}
-
-      title_anchor, section_anchor = p_element.search 'a'
-      location_tag = p_element.at 'font'
-      has_pic_tag = p_element.at 'span'
-
-      href = nil
-
-      location = he_decode p_element.at('font').inner_html if location_tag
-      ret[:location] = $1 if location and LOCATION.match location
-
-      ret[:img_types] = []
-      if has_pic_tag
-        img_type = he_decode has_pic_tag.inner_html
-        img_type = $1.tr('^a-zA-Z0-9',' ') if IMG_TYPE.match img_type
-
-        ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
-      end
-
-      ret[:section] = he_decode(section_anchor.inner_html).split("\302\240").join(" ") if section_anchor
-
-      ret[:post_date] = date
-      if SUMMARY_DATE.match he_decode(p_element.children[0])
-        ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
-      end
-
-      if title_anchor
-        label = he_decode title_anchor.inner_html
-        ret[:label] = $1 if LABEL.match label
-
-        ret[:href] = title_anchor[:href]
-      end
-
-      ret
-    end
-  end
-
-  # GeoListings represents a parsed Craigslist geo lisiting page. (i.e. {'http://geo.craigslist.org/iso/us'}[http://geo.craigslist.org/iso/us])
-  # These list all the craigslist sites in a given region.
-  class GeoListings < Scraper
-    LOCATION_NAME = /[ ]*\>[ ](.+)[ ]*/
-    GEOLISTING_BASE_URL = %{http://geo.craigslist.org/iso/}
-
-    # The geolisting constructor works like all other Scraper objects, in that it accepts a string 'url'.
-    # In addition though, here we'll accept an array like %w(us fl) which gets converted to
-    # {'http://geo.craigslist.org/iso/us/fl'}[http://geo.craigslist.org/iso/us/fl]
-    def initialize(init_via = nil)
-      super init_via.kind_of?(Array) ? "#{GEOLISTING_BASE_URL}#{init_via.join '/'}" : init_via
-
-      # Validate that required fields are present, at least - if we've downloaded it from a url
-      parse_error! unless location
-    end
-
-    # Returns the GeoLocation's full name
-    def location
-      unless @name
-        cursor = html % 'h3 > b > a:first-of-type'
-        cursor = cursor.next_node if cursor
-        @name = $1 if cursor and LOCATION_NAME.match he_decode(cursor.to_s)
-      end
-
-      @name
-    end
-
-    # Returns a hash of site name to urls in the current listing
-    def sites
-      unless @sites
-        @sites = {}
-        (html / 'div#list > a').each do |el_a|
-          site_name = he_decode strip_html(el_a.inner_html)
-          @sites[site_name] = el_a[:href]
-        end
-      end
-
-      @sites
-    end
-  end
 end
\ No newline at end of file
diff --git a/lib/listings.rb b/lib/listings.rb
new file mode 100644
index 0000000..a7a63d5
--- /dev/null
+++ b/lib/listings.rb
@@ -0,0 +1,137 @@
+# TODO: file rdoc
+
+require 'scraper'
+
+class CraigScrape
+
+  # Listings represents a parsed Craigslist listing page and is generally returned by CraigScrape.scrape_listing
+  class Listings < Scraper
+    LABEL = /^(.+?)[ ]*\-$/
+    LOCATION = /^[ ]*\((.*?)\)$/
+    IMG_TYPE = /^[ ]*(.+)[ ]*$/
+    HEADER_DATE = /^[ ]*[^ ]+[ ]+([^ ]+)[ ]+([^ ]+)[ ]*$/
+    SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
+    NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
+
+    # Array, Posting objects found in the listing
+    def posts
+      unless @posts
+        current_date = nil
+        @posts = []
+
+        post_tags = html.get_elements_by_tag_name('p','h4')
+
+        # The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our Posting output:
+        post_tags.pop if (
+          post_tags.length > 0 and
+          post_tags.last.at('a') and
+          NEXT_PAGE_LINK.match post_tags.last.at('a').inner_html
+        )
+
+        # Now we iterate through the listings:
+        post_tags.each do |el|
+          case el.name
+            when 'p'
+              post_summary = self.class.parse_summary el, current_date
+
+              # Validate that required fields are present:
+              parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
+
+              post_summary[:url] = url_from_href post_summary[:href]
+
+              @posts << CraigScrape::Posting.new(post_summary)
+            when 'h4'
+              # Let's make sense of the h4 tag, and then read all the p tags below it
+              if HEADER_DATE.match he_decode(el.inner_html)
+                # Generally, the H4 tags contain valid dates. When they do - this is easy:
+                current_date = CraigScrape.most_recently_expired_time $1, $2
+              elsif html.at('h4:last-of-type') == el
+                # There's a specific bug, where these nonsense h4's just appear without anything relevant inside them.
+                # They're safe to ignore if they're not the last h4 on the page. If they're the last h4 on the page,
+                # we need to pull up the full post in order to accurately tell the date.
+                # Setting this to nil will achieve the eager-load.
+                current_date = nil
+              end
+          end
+        end
+      end
+
+      @posts
+    end
+
+    # String, URL Path href-fragment of the next page link
+    def next_page_href
+      unless @next_page_href
+        cursor = html.at 'p:last-of-type'
+
+        cursor = cursor.at 'a' if cursor
+
+        # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
+        next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
+
+        # Search listings put their next page in a link towards the top
+        next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == 'Next>>' } unless next_link
+
+        # Some search pages have a bug, whereby a 'next page' link isn't displayed,
+        # even though we can see that there's another page listed in the page-number links block at the top
+        # and bottom of the listing page
+        unless next_link
+          cursor = html % 'div.sh:first-of-type > b:last-of-type'
+
+          # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
+          # We're looking good.
+          next_link = cursor.next_sibling if cursor and /^[\d]+$/.match cursor.inner_html
+        end
+
+        # We have an anchor tag - so - let's assign the href:
+        @next_page_href = next_link[:href] if next_link
+      end
+
+      @next_page_href
+    end
+
+    # String, Full URL Path of the 'next page' link
+    def next_page_url
+      (next_page_href) ? url_from_href(next_page_href) : nil
+    end
+
+    # Takes a paragraph element and returns a mostly-parsed Posting
+    # We separate this from the rest of the parsing both for readability and ease of testing
+    def self.parse_summary(p_element, date = nil) #:nodoc:
+      ret = {}
+
+      title_anchor, section_anchor = p_element.search 'a'
+      location_tag = p_element.at 'font'
+      has_pic_tag = p_element.at 'span'
+
+      href = nil
+
+      location = he_decode p_element.at('font').inner_html if location_tag
+      ret[:location] = $1 if location and LOCATION.match location
+
+      ret[:img_types] = []
+      if has_pic_tag
+        img_type = he_decode has_pic_tag.inner_html
+        img_type = $1.tr('^a-zA-Z0-9',' ') if IMG_TYPE.match img_type
+
+        ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
+      end
+
+      ret[:section] = he_decode(section_anchor.inner_html).split("\302\240").join(" ") if section_anchor
+
+      ret[:post_date] = date
+      if SUMMARY_DATE.match he_decode(p_element.children[0])
+        ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
+      end
+
+      if title_anchor
+        label = he_decode title_anchor.inner_html
+        ret[:label] = $1 if LABEL.match label
+
+        ret[:href] = title_anchor[:href]
+      end
+
+      ret
+    end
+  end
+end
\ No newline at end of file
diff --git a/lib/posting.rb b/lib/posting.rb
new file mode 100644
index 0000000..24fa08a
--- /dev/null
+++ b/lib/posting.rb
@@ -0,0 +1,291 @@
+# TODO: file rdoc
+
+require 'scraper'
+
+class CraigScrape
+
+  # Posting represents a fully downloaded, and parsed, Craigslist post.
+  # This class is generally returned by the listing scrape methods, and
+  # contains the post summaries for a specific search url, or a general listing category
+  class Posting < Scraper
+
+    POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
+    LOCATION = /Location\:[ ]+(.+)/
+    HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
+    POSTING_ID = /PostingID\:[ ]+([\d]+)/
+    REPLY_TO = /(.+)/
+    PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
+    USERBODY_PARTS = /\<div id\=\"userbody\"\>(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>/m
+    IMAGE_SRC = /\<img[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
+
+    # This is really just for testing, in production use, uri.path is a better solution
+    attr_reader :href #:nodoc:
+
+    # Create a new Post via a url (String), or supplied parameters (Hash)
+    def initialize(*args)
+      super(*args)
+
+      # Validate that required fields are present, at least - if we've downloaded it from a url
+      parse_error! if args.first.kind_of? String and !flagged_for_removal? and !deleted_by_author? and [
+        contents,posting_id,post_time,header,title,full_section
+      ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
+    end
+
+
+    # String, The contents of the item's html body heading
+    def header
+      unless @header
+        h2 = html.at 'h2' if html
+        @header = he_decode h2.inner_html if h2
+      end
+
+      @header
+    end
+
+    # String, the item's title
+    def title
+      unless @title
+        title_tag = html.at 'title' if html
+        @title = he_decode title_tag.inner_html if title_tag
+        @title = nil if @title and @title.length == 0
+      end
+
+      @title
+    end
+
+    # Array, hierarchical representation of the post's section
+    def full_section
+      unless @full_section
+        @full_section = []
+
+        (html/"div[@class='bchead']//a").each do |a|
+          @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
+        end if html
+      end
+
+      @full_section
+    end
+
+    # String, represents the post's reply-to address, if listed
+    def reply_to
+      unless @reply_to
+        cursor = html.at 'hr' if html
+        cursor = cursor.next_sibling until cursor.nil? or cursor.name == 'a'
+        @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
+      end
+
+      @reply_to
+    end
+
+    # Time, reflects the full timestamp of the posting
+    def post_time
+      unless @post_time
+        cursor = html.at 'hr' if html
+        cursor = cursor.next_node until cursor.nil? or POST_DATE.match cursor.to_s
+        @post_time = Time.parse $1 if $1
+      end
+
+      @post_time
+    end
+
+    # Integer, Craigslist's unique posting id
+    def posting_id
+      unless @posting_id
+        cursor = (html/"#userbody").first if html
+        cursor = cursor.next_node until cursor.nil? or POSTING_ID.match cursor.to_s
+        @posting_id = $1.to_i if $1
+      end
+
+      @posting_id
+    end
+
+    # String, The full-html contents of the post
+    def contents
+      unless @contents
+        @contents = user_body if html
+        @contents = he_decode @contents.strip if @contents
+      end
+
+      @contents
+    end
+
+    # String, the location of the item, as best could be parsed
+    def location
+      if @location.nil? and craigslist_body and html
+        # Location (when explicitly defined):
+        cursor = craigslist_body.at 'ul' unless @location
+
+        # Apa section includes other things in the li's (cats/dogs ok fields)
+        cursor.children.each do |li|
+          if LOCATION.match li.inner_html
+            @location = he_decode($1) and break
+            break
+          end
+        end if cursor
+
+        # Real estate listings can work a little differently for location:
+        unless @location
+          cursor = craigslist_body.at 'small'
+          cursor = cursor.previous_node until cursor.nil? or cursor.text?
+
+          @location = he_decode(cursor.to_s.strip) if cursor
+        end
+
+        # So, *sometimes* the location just ends up being in the header, I don't know why:
+        @location = $1 if @location.nil? and HEADER_LOCATION.match header
+      end
+
+      @location
+    end
+
+    # Array, urls of the post's images that are *not* hosted on craigslist
+    def images
+      # Keep in mind that when users post html to craigslist, they're often not posting wonderful html...
+      @images = (
+        contents ?
+        contents.scan(IMAGE_SRC).collect{ |a| a.find{|b| !b.nil? } } :
+        []
+      ) unless @images
+
+      @images
+    end
+
+    # Array, urls of the post's craigslist-hosted images
+    def pics
+      unless @pics
+        @pics = []
+
+        if html and craigslist_body
+          # Now let's find the craigslist hosted images:
+          img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
+
+          @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
+        end
+      end
+
+      @pics
+    end
+
+    # Returns true if this Post was parsed, and merely a 'Flagged for Removal' page
+    def flagged_for_removal?
+      @flagged_for_removal = (
+        system_post? and header_as_plain == "This posting has been flagged for removal"
+      ) if @flagged_for_removal.nil?
+
+      @flagged_for_removal
+    end
+
+    # Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
+    def deleted_by_author?
+      @deleted_by_author = (
+        system_post? and header_as_plain == "This posting has been deleted by its author."
+      ) if @deleted_by_author.nil?
+
+      @deleted_by_author
+    end
+
+
+    # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
+    # used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
+    def post_date
+      @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
+
+      @post_date
+    end
+
+    # Returns the post label. The label would appear at first glance to be identical to the header - but it's not.
+    # The label is cited on the listings pages, and generally includes everything in the header - with the exception of the location.
+    # Sometimes there's additional information, i.e. '(map)' on rea listings, included in the header that isn't to be listed in the label.
+    # This is also used as a bandwidth shortcut for the craigwatch program, and is a guaranteed identifier for the post, that won't result
+    # in a full page load from the post's url.
+    def label
+      unless @label or system_post?
+        @label = header
+
+        @label = $1 if location and /(.+?)[ ]*\(#{location}\).*?$/.match @label
+      end
+
+      @label
+    end
+
+    # Array, which image types are listed for the post.
+    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+    def img_types
+      unless @img_types
+        @img_types = []
+
+        @img_types << :img if images.length > 0
+        @img_types << :pic if pics.length > 0
+      end
+
+      @img_types
+    end
+
+    # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
+    # this (sometimes/rarely) conserves bandwidth by pulling this field from the listing post-summary
+    def section
+      unless @section
+        @section = full_section.last if full_section
+      end
+
+      @section
+    end
+
+    # true if post summary has 'img(s)'. 'imgs' are different than pics, in that the resource is *not* hosted on craigslist's server.
+    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+    def has_img?
+      img_types.include? :img
+    end
+
+    # true if post summary has 'pic(s)'. 'pics' are different than imgs, in that craigslist is hosting the resource on craigslist's servers
+    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+    def has_pic?
+      img_types.include? :pic
+    end
+
+    # true if post summary has either the img or pic label
+    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+    def has_pic_or_img?
+      img_types.length > 0
+    end
+
+    # Returns the best-guess of a price, judging by the label's contents. Price is available when pulled from the listing summary
+    # and can be safely used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
+    def price
+      $1.tr('$','').to_f if label and PRICE.match label
+    end
+
+    # Returns the post contents with all html tags removed
+    def contents_as_plain
+      strip_html contents
+    end
+
+    # Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
+    # 'system_post' we may get tags in here
+    def header_as_plain
+      strip_html header
+    end
+
+    # Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original
+    # This returns true or false if that case applies
+    def system_post?
+      [contents,posting_id,post_time,title].all?{|f| f.nil?}
+    end
+
+    private
+
+    # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
+    # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so:
+    # We return this as a string, since it makes sense, and since it's tough to say how hpricot might mangle this if the html is whack
+    def user_body
+      $1 if USERBODY_PARTS.match html.to_s
+    end
+
+    # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
+    # So - we'll return it as an Hpricot object.
+    def craigslist_body
+      Hpricot.parse $2 if USERBODY_PARTS.match html.to_s
+    end
+
+  end
+end
\ No newline at end of file
diff --git a/lib/scraper.rb b/lib/scraper.rb
new file mode 100644
index 0000000..c14fea5
--- /dev/null
+++ b/lib/scraper.rb
@@ -0,0 +1,174 @@
+# TODO: file rdoc
+
+require 'net/http'
+require 'zlib'
+
+require 'rubygems'
+require 'activesupport'
+require 'hpricot'
+require 'htmlentities'
+
+class CraigScrape
+
+  # Scraper is a general-purpose base class for all libcraigscrape Objects. Scraper facilitates all http-related
+  # functionality, and adds some useful helpers for dealing with eager-loading of http-objects and general html
+  # methods. It also contains the http-related cattr_accessors:
+  #
+  # *logger* - a Logger object to log http debug notices to. Defaults to nil
+  #
+  # *retries_on_fetch_fail* - The number of times to retry a failed uri download. Defaults to 4
+  #
+  # *sleep_between_fetch_retries* - The number of seconds to sleep between successive attempts after a failed download. Defaults to 15.
+  class Scraper
+    cattr_accessor :logger
+    cattr_accessor :sleep_between_fetch_retries
+    cattr_accessor :retries_on_fetch_fail
+
+    URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
+    HTML_TAG = /<\/?[^>]*>/
+
+    # Returns the full url that corresponds to this resource
+    attr_reader :url
+
+    # Set some defaults:
+    self.retries_on_fetch_fail = 4
+    self.sleep_between_fetch_retries = 15
+
+    class BadConstructionError < StandardError #:nodoc:
+    end
+
+    class ParseError < StandardError #:nodoc:
+    end
+
+    class BadUrlError < StandardError #:nodoc:
+    end
+
+    class FetchError < StandardError #:nodoc:
+    end
+
+    # Scraper Objects can be created from either a full URL (string), or a Hash.
+    # Currently, this initializer isn't intended to be called from libcraigslist API users, though
+    # if you know what you're doing - feel free to try this out.
+    #
+    # A (string) url can be passed in a 'http://' scheme or a 'file://' scheme.
+    #
+    # When constructing from a hash, the keys in the hash will be used to set the object's corresponding values.
+    # This is useful to create an object without actually making an html request; this is used to set up an
+    # object before it eager-loads any values not already passed in by the constructor hash. Though optional, if
+    # you're going to be setting this object up for eager-loading, be sure to pass in a :url key in your hash,
+    # Otherwise this will fail to eager load.
+    def initialize(init_via = nil)
+      if init_via.nil?
+        # Do nothing - possibly not a great idea, but we'll allow it
+      elsif init_via.kind_of? String
+        @url = init_via
+      elsif init_via.kind_of? Hash
+        init_via.each_pair{|k,v| instance_variable_set "@#{k}", v}
+      else
+        raise BadConstructionError, ("Unrecognized parameter passed to %s.new %s" % [self.class.to_s, init_via.class.inspect])
+      end
+    end
+
+    # Indicates whether the resource has yet been retrieved from its associated url.
+    # This is useful to distinguish whether the instance was instantiated for the purpose of an eager-load,
+    # but hasn't yet been fetched.
+    def downloaded?; !@html.nil?; end
+
+    # A URI object corresponding to this scraped URL
+    def uri
+      @uri ||= URI.parse @url if @url
+      @uri
+    end
+
+    private
+
+    # Returns text with all html tags removed.
+    def strip_html(str)
+      str.gsub HTML_TAG, "" if str
+    end
+
+    # Easy way to fail noisily:
+    def parse_error!; raise ParseError, "Error while parsing %s:\n %s" % [self.class.to_s, html]; end
+
+    # Returns text with all html entities converted to the respective ascii character.
+    def he_decode(text); self.class.he_decode text; end
+
+    # Returns text with all html entities converted to the respective ascii character.
+    def self.he_decode(text); HTMLEntities.new.decode text; end
+
+    # Derives a full url, using the current object's url and the provided href
+    def url_from_href(href) #:nodoc:
+      scheme, host, path = $1, $2, $3 if URL_PARTS.match href
+
+      scheme = uri.scheme if scheme.nil? or scheme.empty? and uri.respond_to? :scheme
+
+      host = uri.host if host.nil? or host.empty? and uri.respond_to? :host
+
+      path = (
+        (/\/$/.match(uri.path)) ?
+        '%s%s' % [uri.path,path] :
+        '%s/%s' % [File.dirname(uri.path),path]
+      ) unless /^\//.match path
+
+      '%s://%s%s' % [scheme, host, path]
+    end
+
+    def fetch_uri(uri)
+      logger.info "Requesting: %s" % @url if logger
+
+      case uri.scheme
+        when 'file'
+          File.read uri.path
+        when /^http[s]?/
+          fetch_http uri
+        else
+          raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
+      end
+    end
+
+    def fetch_http(uri)
+      fetch_attempts = 0
+
+      begin
+        # This handles the redirects for us
+        resp, data = Net::HTTP.new( uri.host, uri.port).get uri.request_uri, nil
+
+        if resp.response.code == "200"
+          # Check for gzip, and decode:
+          data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
+
+          data
+        elsif resp.response['Location']
+          redirect_to = resp.response['Location']
+
+          fetch_uri URI.parse(url_from_href(redirect_to))
+        else
+          # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
+          error_description = 'Unable to fetch "%s" (%s)' % [ @url, resp.response.code ]
+
+          logger.info error_description if logger
+
+          raise FetchError, error_description
+        end
+      rescue FetchError,Timeout::Error,Errno::ECONNRESET => err
+        logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
+        logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
+
+        fetch_attempts += 1
+
+        if fetch_attempts <= self.retries_on_fetch_fail
+          sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
+          logger.info 'Retrying fetch ....' if logger
+          retry
+        else
+          raise err
+        end
+      end
+    end
+
+    def html
+      @html ||= Hpricot.parse fetch_uri(uri) if uri
+      @html
+    end
+  end
+end
\ No newline at end of file
diff --git a/roodi.yml b/roodi.yml
new file mode 100644
index 0000000..adb3796
--- /dev/null
+++ b/roodi.yml
@@ -0,0 +1,15 @@
+# AssignmentInConditionalCheck: { }
+# CaseMissingElseCheck: { }
+ClassLineCountCheck: { line_count: 300 }
+ClassNameCheck: { pattern: !ruby/regexp /^[A-Z][a-zA-Z0-9]*$/ }
+# ClassVariableCheck: { }
+CyclomaticComplexityBlockCheck: { complexity: 12 }
+CyclomaticComplexityMethodCheck: { complexity: 14 }
+EmptyRescueBodyCheck: { }
+ForLoopCheck: { }
+MethodLineCountCheck: { line_count: 50 }
+MethodNameCheck: { pattern: !ruby/regexp /^[_a-z<>=\[|+-\/\*`]+[_a-z0-9_<>=~@\[\]]*[=!\?]?$/ }
+ModuleLineCountCheck: { line_count: 300 }
+ModuleNameCheck: { pattern: !ruby/regexp /^[A-Z][a-zA-Z0-9]*$/ }
+ParameterNumberCheck: { parameter_count: 5 }
+
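
Reviewer's note (not part of the patch): here's a quick, hypothetical smoke-test sketch
showing how the refactored objects fit together once this is applied. The 'us fl' geo
path and the listing url are illustrative only; every method used below (the array form
of GeoListings.new, location, sites, Listings#posts, Posting#post_date and #label, and
next_page_href/next_page_url) comes straight from the files added above.

  #!/usr/bin/env ruby
  # smoke_test.rb - a hypothetical script, not part of the patch.
  # Assumes the patched lib/ directory is on the load path:
  $LOAD_PATH.unshift File.join(File.dirname(__FILE__), 'lib')

  require 'libcraigscrape'

  # The new GeoListings constructor accepts an array of geo path parts
  # (see lib/geo_listings.rb), in addition to a plain url string:
  geo = CraigScrape::GeoListings.new %w(us fl)
  puts "Region: %s" % geo.location
  geo.sites.each { |name, url| puts "  %s => %s" % [name, url] }

  # Listings and Posting now live in their own files, but scrape as in 0.7.0.
  # 'http://miami.craigslist.org/sys/' is just an example listing url:
  listing = CraigScrape::Listings.new 'http://miami.craigslist.org/sys/'
  listing.posts.each do |post|
    puts "%s: %s" % [post.post_date, post.label]
  end
  puts "next page: %s" % listing.next_page_url if listing.next_page_href

The new static-analysis tasks run like any other rake task: 'rake code_tests' for the
whole suite, or 'rake code_tests:flog', 'rake code_tests:flay', and 'rake code_tests:roodi'
individually.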