From e73e9d9eb95edb048c27d1d030421babe17e644e Mon Sep 17 00:00:00 2001
From: Chris DeRose
Date: Sun, 5 Jul 2009 20:39:57 +0000
Subject: [PATCH] Added the code_tests tasks. Refactored scraper to
 accommodate the suggestions, broke each object into its own rb file in lib

---
 CHANGELOG             |   3 +
 Rakefile              |  43 ++-
 TODO.txt              |  54 ++++
 lib/geo_listings.rb   |  46 ++++
 lib/libcraigscrape.rb | 625 +-----------------------------------------
 lib/listings.rb       | 137 +++++++++
 lib/posting.rb        | 291 ++++++++++++++++++++
 lib/scraper.rb        | 174 ++++++++++++
 roodi.yml             |  15 +
 9 files changed, 765 insertions(+), 623 deletions(-)
 create mode 100644 TODO.txt
 create mode 100644 lib/geo_listings.rb
 create mode 100644 lib/listings.rb
 create mode 100644 lib/posting.rb
 create mode 100644 lib/scraper.rb
 create mode 100644 roodi.yml

diff --git a/CHANGELOG b/CHANGELOG
index 80a7300..2fab6a6 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,8 @@
 == Change Log
 
+=== Release 0.8.0 (TODO, 2009)
+- Added :code_tests to the rakefile
+
 === Release 0.7.0 (Jul 5, 2009)
 - A good bit of refactoring
 - Eager-loading in the Post object without the need of the full_post method
diff --git a/Rakefile b/Rakefile
index e486980..285e6c6 100644
--- a/Rakefile
+++ b/Rakefile
@@ -11,7 +11,7 @@ include FileUtils
 RbConfig = Config unless defined? RbConfig
 
 NAME = "libcraigscrape"
-VERS = ENV['VERSION'] || "0.7.0"
+VERS = ENV['VERSION'] || "0.8.0"
 PKG = "#{NAME}-#{VERS}"
 RDOC_OPTS = ['--quiet', '--title', 'The libcraigscrape Reference', '--main', 'README', '--inline-source']
 
@@ -77,3 +77,44 @@ task :uninstall => [:clean] do
   sh %{sudo gem uninstall #{NAME}}
 end
 
+require 'roodi'
+require 'roodi_task'
+
+namespace :code_tests do
+  desc "Analyze for code complexity"
+  task :flog do
+    require 'flog'
+
+    flog = Flog.new
+    flog.flog_files ['lib']
+    threshold = 105
+
+    bad_methods = flog.totals.select do |name, score|
+      score > threshold
+    end
+
+    bad_methods.sort { |a,b| a[1] <=> b[1] }.each do |name, score|
+      puts "%8.1f: %s" % [score, name]
+    end
+
+    puts "WARNING : #{bad_methods.size} methods have a flog complexity > #{threshold}" unless bad_methods.empty?
+  end
+
+  desc "Analyze for code duplication"
+  task :flay do
+    require 'flay'
+    threshold = 25
+    flay = Flay.new({:fuzzy => false, :verbose => false, :mass => threshold})
+    flay.process(*Flay.expand_dirs_to_files(['lib']))
+
+    flay.report
+
+    raise "#{flay.masses.size} chunks of code have a duplicate mass > #{threshold}" unless flay.masses.empty?
+  end
+
+  RoodiTask.new 'roodi', ['lib/*.rb'], 'roodi.yml'
+end
+
+desc "Run all code tests"
+task :code_tests => %w(code_tests:flog code_tests:flay code_tests:roodi)
+
diff --git a/TODO.txt b/TODO.txt
new file mode 100644
index 0000000..b79b3ba
--- /dev/null
+++ b/TODO.txt
@@ -0,0 +1,54 @@
+0.8.0 TODO:
+ * It'd be nice to let the yamls not need a full path to the .db, just use dir(__FILE__) as cwd for that
+ * We should have a listings.next_page which returns the next page - that would clean up our while loop a bit
+ * Change craigwatch's default non-regex search to be case insensitive
+ * Reduce memory consumption in craigwatch.
+ * I think we need to update the package to include the new rake tasks for flogging
+ * Add some rdoc text to the top of all the new lib files...
+
+Post-0.7:
+ * A debug_craigwatch, which shows the current progress... (pages fetched, objects in caches..)
+ * Some pages are legitimate 404's and we just can't parse them no matter how hard we try - what to do about this?
+ * Break the scraper objects into separate files...
+ * Maybe we should make an instance out of CraigScrape.new('us/fl/south florida') kind of thing..
+ * Finish testing out that geo location todo list
+   * Test out that array-parameter to the GeoListings constructor, make sure it actually works
+   * integrate it better into craigscrape
+ * It'd be nice to tell craigscrape 'us/ca' or 'us/ca/losangeles' as the scrape location
+   * and maybe have 'search text' and 'search section' type stuff where everything ends up scraping from there..
+   * We should really cache pages if we're going to do this - and I'd say to cache the geolisting pages first...
+ * Stats in the email: bytes transferred, generation time, urls scraped, posts scraped
+
+ * It'd also be nice to run an erb over the yaml file? No, we should take some steps to DRY out the code though.
+   * Particularly with respect to the searches which use the same regex for multiple searches.
+   * and particularly with those searches which are using the same listings urls to search for different things (IE 'cta' searches)
+
+Rechecks in a week (5.11.09 was last tried)
+
+ * This thread:
+   http://sfbay.craigslist.org/forums/?ID=29345737
+   Title: craigwatch does this - if you're a little handy
+   Message:
+     craigwatch and libcraigscrape are a tightly-coupled, ruby solution for (largely) unix-based systems.
+
+
+     Check it out here:
+
+     http://www.derosetechnologies.com/community/libcraigscrape
+ * http://www.craigslistwatch.com/
+ * Did this actually post?: http://digg.com/tech_news/Stop_wasting_money_use_Craigslist_Watch
+
+email:
+
+ http://www.dostuffright.com/Craigwatch
+ http://wareseeker.com/Network-Internet/Craigslist-All-City-Search-Tool-1.2.zip/8036652
+ http://www.killerstartups.com/Search/craigslittlebuddy-com-multiple-city-craigslist-search
+
+Scripts aggregators:
+ bigwebmaster.com
+ http://www.scripts.com/
+ http://www.scriptarchive.com/
+ http://www.needscripts.com/
+ http://www.scriptsearch.com/
+ http://www.sitescripts.com/PHP/
+ http://www.scriptsbank.com/
diff --git a/lib/geo_listings.rb b/lib/geo_listings.rb
new file mode 100644
index 0000000..b50bb7e
--- /dev/null
+++ b/lib/geo_listings.rb
@@ -0,0 +1,46 @@
+# TODO: file rdoc
+
+require 'scraper'
+
+class CraigScrape
+  # GeoListings represents a parsed Craigslist geo listing page. (i.e. {'http://geo.craigslist.org/iso/us'}[http://geo.craigslist.org/iso/us])
+  # These list all the craigslist sites in a given region.
+  class GeoListings < Scraper
+    LOCATION_NAME = /[ ]*\>[ ](.+)[ ]*/
+    GEOLISTING_BASE_URL = %{http://geo.craigslist.org/iso/}
+
+    # The geolisting constructor works like all other Scraper objects, in that it accepts a string 'url'.
+    # In addition though, here we'll accept an array like %w(us fl) which gets converted to
+    # {'http://geo.craigslist.org/iso/us/fl'}[http://geo.craigslist.org/iso/us/fl]
+    def initialize(init_via = nil)
+      super init_via.kind_of?(Array) ? "#{GEOLISTING_BASE_URL}#{init_via.join '/'}" : init_via
+
+      # Validate that required fields are present, at least - if we've downloaded it from a url
+      parse_error! unless location
+    end
+
+    # Returns the GeoLocation's full name
+    def location
+      unless @name
+        cursor = html % 'h3 > b > a:first-of-type'
+        cursor = cursor.next_node if cursor
+        @name = $1 if cursor and LOCATION_NAME.match he_decode(cursor.to_s)
+      end
+
+      @name
+    end
+
+    # Returns a hash of site name to urls in the current listing
+    def sites
+      unless @sites
+        @sites = {}
+        (html / 'div#list > a').each do |el_a|
+          site_name = he_decode strip_html(el_a.inner_html)
+          @sites[site_name] = el_a[:href]
+        end
+      end
+
+      @sites
+    end
+  end
+end
\ No newline at end of file
diff --git a/lib/libcraigscrape.rb b/lib/libcraigscrape.rb
index 7632ad8..0cced26 100644
--- a/lib/libcraigscrape.rb
+++ b/lib/libcraigscrape.rb
@@ -2,13 +2,10 @@
 #
 # All of libcraigscrape's objects and methods are loaded when you use require 'libcraigscrape' in your code.
 #
-require 'net/http'
-require 'zlib'
-
-require 'rubygems'
-require 'hpricot'
-require 'htmlentities'
-require 'activesupport'
+require 'listings'
+require 'posting'
+require 'geo_listings'
 
 # A base class encapsulating the libcraigscrape objects, and providing some utility methods.
 class CraigScrape
@@ -78,620 +75,4 @@ def self.most_recently_expired_time(month, day) #:nodoc:
     ret
   end
 
-  # Scraper is a general-pupose base class for all libcraigscrape Objects. Scraper facilitates all http-related
-  # functionality, and adds some useful helpers for dealing with eager-loading of http-objects and general html
-  # methods. It also contains the http-related cattr_accessors:
-  #
-  # *logger* - a Logger object to debug http notices too. Defaults to nil
-  #
-  # *retries_on_fetch_fail* - The number of times to retry a failed uri download. Defaults to 4
-  #
-  # *sleep_between_fetch_retries* - The amount of seconds to sleep, between successive attempts in the case of a failed download. Defaults to 15.
-  class Scraper
-    cattr_accessor :logger
-    cattr_accessor :sleep_between_fetch_retries
-    cattr_accessor :retries_on_fetch_fail
-
-    URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
-    HTML_TAG = /<\/?[^>]*>/
-
-    # Returns the full url that corresponds to this resource
-    attr_reader :url
-
-    # Set some defaults:
-    self.retries_on_fetch_fail = 4
-    self.sleep_between_fetch_retries = 15
-
-    class BadConstructionError < StandardError #:nodoc:
-    end
-
-    class ParseError < StandardError #:nodoc:
-    end
-
-    class BadUrlError < StandardError #:nodoc:
-    end
-
-    class FetchError < StandardError #:nodoc:
-    end
-
-    # Scraper Objects can be created from either a full URL (string), or a Hash.
-    # Currently, this initializer isn't intended to be called from libcraigslist API users, though
-    # if you know what you're doing - feel free to try this out.
-    #
-    # A (string) url can be passed in a 'http://' scheme or a 'file://' scheme.
-    #
-    # When constructing from a hash, the keys in the hash will be used to set the object's corresponding values.
-    # This is useful to create an object without actually making an html request, this is used to set-up an
-    # object before it eager-loads any values not already passed in by the constructor hash. Though optional, if
-    # you're going to be setting this object up for eager-loadnig, be sure to pass in a :url key in your hash,
-    # Otherwise this will fail to eager load.
-    def initialize(init_via = nil)
-      if init_via.nil?
-        # Do nothing - possibly not a great idea, but we'll allow it
-      elsif init_via.kind_of? String
-        @url = init_via
-      elsif init_via.kind_of? Hash
-        init_via.each_pair{|k,v| instance_variable_set "@#{k}", v}
-      else
-        raise BadConstructionError, ("Unrecognized parameter passed to %s.new %s}" % [self.class.to_s, init_via.class.inspect])
-      end
-    end
-
-    # Indicates whether the resource has yet been retrieved from its associated url.
-    # This is useful to distinguish whether the instance was instantiated for the purpose of an eager-load,
-    # but hasn't yet been fetched.
-    def downloaded?; !@html.nil?; end
-
-    # A URI object corresponding to this Scraped URL
-    def uri
-      @uri ||= URI.parse @url if @url
-      @uri
-    end
-
-    private
-
-    # Returns text with all html tags removed.
-    def strip_html(str)
-      str.gsub HTML_TAG, "" if str
-    end
-
-    # Easy way to fail noisily:
-    def parse_error!; raise ParseError, "Error while parsing %s:\n %s" % [self.class.to_s, html]; end
-
-    # Returns text with all html entities converted to respective ascii character.
-    def he_decode(text); self.class.he_decode text; end
-
-    # Returns text with all html entities converted to respective ascii character.
-    def self.he_decode(text); HTMLEntities.new.decode text; end
-
-    # Derives a full url, using the current object's url and the provided href
-    def url_from_href(href) #:nodoc:
-      scheme, host, path = $1, $2, $3 if URL_PARTS.match href
-
-      scheme = uri.scheme if scheme.nil? or scheme.empty? and uri.respond_to? :scheme
-
-      host = uri.host if host.nil? or host.empty? and uri.respond_to? :host
-
-      path = (
-        (/\/$/.match(uri.path)) ?
-        '%s%s' % [uri.path,path] :
-        '%s/%s' % [File.dirname(uri.path),path]
-      ) unless /^\//.match path
-
-      '%s://%s%s' % [scheme, host, path]
-    end
-
-    def fetch_uri(uri)
-
-      logger.info "Requesting: %s" % @url if logger
-
-      case uri.scheme
-        when 'file'
-          File.read uri.path
-        when /^http[s]?/
-          fetch_attempts = 0
-
-          begin
-            # This handles the redirects for us
-            resp, data = Net::HTTP.new( uri.host, uri.port).get uri.request_uri, nil
-
-            if resp.response.code == "200"
-              # Check for gzip, and decode:
-              data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
-
-              data
-            elsif resp.response['Location']
-              redirect_to = resp.response['Location']
-
-              fetch_uri URI.parse(url_from_href(redirect_to))
-            else
-              # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
-              error_description = 'Unable to fetch "%s" (%s)' % [ @url, resp.response.code ]
-
-              logger.info error_description if logger
-
-              raise FetchError, error_description
-            end
-          rescue FetchError,Timeout::Error,Errno::ECONNRESET => err
-            logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
-            logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
-
-            fetch_attempts += 1
-
-            if fetch_attempts <= self.retries_on_fetch_fail
-              sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
-              logger.info 'Retrying fetch ....' if logger
-              retry
-            else
-              raise err
-            end
-          end
-        else
-          raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
-      end
-    end
-
-    def html
-      @html ||= Hpricot.parse fetch_uri(uri) if uri
-      @html
-    end
-  end
-
-  # Posting represents a fully downloaded, and parsed, Craigslist post.
-  # This class is generally returned by the listing scrape methods, and
-  # contains the post summaries for a specific search url, or a general listing category
-  class Posting < Scraper
-
-    POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
-    LOCATION = /Location\:[ ]+(.+)/
-    HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
-    POSTING_ID = /PostingID\:[ ]+([\d]+)/
-    REPLY_TO = /(.+)/
-    PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
-    USERBODY_PARTS = /\<div id\=\"userbody\"\>(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>/m
-    IMAGE_SRC = /\<img[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
-
-    # This is really just for testing, in production use, uri.path is a better solution
-    attr_reader :href #:nodoc:
-
-    # Create a new Post via a url (String), or supplied parameters (Hash)
-    def initialize(*args)
-      super(*args)
-
-      # Validate that required fields are present, at least - if we've downloaded it from a url
-      parse_error! if args.first.kind_of? String and !flagged_for_removal? and !deleted_by_author? and [
-        contents,posting_id,post_time,header,title,full_section
-      ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
-    end
-
-
-    # String, The contents of the item's html body heading
-    def header
-      unless @header
-        h2 = html.at 'h2' if html
-        @header = he_decode h2.inner_html if h2
-      end
-
-      @header
-    end
-
-    # String, the item's title
-    def title
-      unless @title
-        title_tag = html.at 'title' if html
-        @title = he_decode title_tag.inner_html if title_tag
-        @title = nil if @title and @title.length == 0
-      end
-
-      @title
-    end
-
-    # Array, hierarchial representation of the posts section
-    def full_section
-      unless @full_section
-        @full_section = []
-
-        (html/"div[@class='bchead']//a").each do |a|
-          @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
-        end if html
-      end
-
-      @full_section
-    end
-
-    # String, represents the post's reply-to address, if listed
-    def reply_to
-      unless @reply_to
-        cursor = html.at 'hr' if html
-        cursor = cursor.next_sibling until cursor.nil? or cursor.name == 'a'
-        @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
-      end
-
-      @reply_to
-    end
-
-    # Time, reflects the full timestamp of the posting
-    def post_time
-      unless @post_time
-        cursor = html.at 'hr' if html
-        cursor = cursor.next_node until cursor.nil? or POST_DATE.match cursor.to_s
-        @post_time = Time.parse $1 if $1
-      end
-
-      @post_time
-    end
-
-    # Integer, Craigslist's unique posting id
-    def posting_id
-      unless @posting_id
-        cursor = (html/"#userbody").first if html
-        cursor = cursor.next_node until cursor.nil? or POSTING_ID.match cursor.to_s
-        @posting_id = $1.to_i if $1
-      end
-
-      @posting_id
-    end
-
-    # String, The full-html contents of the post
-    def contents
-      unless @contents
-        @contents = user_body if html
-        @contents = he_decode @contents.strip if @contents
-      end
-
-      @contents
-    end
-
-    # String, the location of the item, as best could be parsed
-    def location
-      if @location.nil? and craigslist_body and html
-        # Location (when explicitly defined):
-        cursor = craigslist_body.at 'ul' unless @location
-
-        # Apa section includes other things in the li's (cats/dogs ok fields)
-        cursor.children.each do |li|
-          if LOCATION.match li.inner_html
-            @location = he_decode($1) and break
-            break
-          end
-        end if cursor
-
-        # Real estate listings can work a little different for location:
-        unless @location
-          cursor = craigslist_body.at 'small'
-          cursor = cursor.previous_node until cursor.nil? or cursor.text?
-
-          @location = he_decode(cursor.to_s.strip) if cursor
-        end
-
-        # So, *sometimes* the location just ends up being in the header, I don't know why:
-        @location = $1 if @location.nil? and HEADER_LOCATION.match header
-      end
-
-      @location
-    end
-
-    # Array, urls of the post's images that are *not* hosted on craigslist
-    def images
-      # Keep in mind that when users post html to craigslist, they're often not posting wonderful html...
-      @images = (
-        contents ?
-        contents.scan(IMAGE_SRC).collect{ |a| a.find{|b| !b.nil? } } :
-        []
-      ) unless @images
-
-      @images
-    end
-
-    # Array, urls of the post's craigslist-hosted images
-    def pics
-      unless @pics
-        @pics = []
-
-        if html and craigslist_body
-          # Now let's find the craigslist hosted images:
-          img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
-
-          @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
-        end
-      end
-
-      @pics
-    end
-
-    # Returns true if this Post was parsed, and merely a 'Flagged for Removal' page
-    def flagged_for_removal?
-      @flagged_for_removal = (
-        system_post? and header_as_plain == "This posting has been flagged for removal"
-      ) if @flagged_for_removal.nil?
-
-      @flagged_for_removal
-    end
-
-    # Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
-    def deleted_by_author?
-      @deleted_by_author = (
-        system_post? and header_as_plain == "This posting has been deleted by its author."
-      ) if @deleted_by_author.nil?
-
-      @deleted_by_author
-    end
-
-
-    # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
-    # used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.
-    def post_date
-      @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
-
-      @post_date
-    end
-
-    # Returns The post label. The label would appear at first glance to be indentical to the header - but its not.
-    # The label is cited on the listings pages, and generally includes everything in the header - with the exception of the location.
-    # Sometimes there's additional information ie. '(map)' on rea listings included in the header, that aren't to be listed in the label
-    # This is also used as a bandwidth shortcut for the craigwatch program, and is a guaranteed identifier for the post, that won't result
-    # in a full page load from the post's url.
-    def label
-      unless @label or system_post?
-        @label = header
-
-        @label = $1 if location and /(.+?)[ ]*\(#{location}\).*?$/.match @label
-      end
-
-      @label
-    end
-
-    # Array, which image types are listed for the post.
-    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
-    def img_types
-      unless @img_types
-        @img_types = []
-
-        @img_types << :img if images.length > 0
-        @img_types << :pic if pics.length > 0
-      end
-
-      @img_types
-    end
-
-    # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
-    # this (sometimes/rarely) conserves bandwidth by pulling this field from the listing post-summary
-    def section
-      unless @section
-        @section = full_section.last if full_section
-      end
-
-      @section
-    end
-
-    # true if post summary has 'img(s)'. 'imgs' are different then pics, in that the resource is *not* hosted on craigslist's server.
-    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
-    def has_img?
-      img_types.include? :img
-    end
-
-    # true if post summary has 'pic(s)'. 'pics' are different then imgs, in that craigslist is hosting the resource on craigslist's servers
-    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
-    def has_pic?
-      img_types.include? :pic
-    end
-
-    # true if post summary has either the img or pic label
-    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
-    def has_pic_or_img?
-      img_types.length > 0
-    end
-
-    # Returns the best-guess of a price, judging by the label's contents. Price is available when pulled from the listing summary
-    # and can be safely used if you wish conserve bandwidth by not pulling an entire post from a listing scrape.
-    def price
-      $1.tr('$','').to_f if label and PRICE.match label
-    end
-
-    # Returns the post contents with all html tags removed
-    def contents_as_plain
-      strip_html contents
-    end
-
-    # Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
-    # 'system_post' we may get tags in here
-    def header_as_plain
-      strip_html header
-    end
-
-    # Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original
-    # This returns true or false if that case applies
-    def system_post?
-      [contents,posting_id,post_time,title].all?{|f| f.nil?}
-    end
-
-    private
-
-    # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
-    # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so:
-    # We return this as a string, since it makes sense, and since its tough to say how hpricot might mangle this if the html is whack
-    def user_body
-      $1 if USERBODY_PARTS.match html.to_s
-    end
-
-    # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
-    # So - we'll return it as an Hpricot object.
-    def craigslist_body
-      Hpricot.parse $2 if USERBODY_PARTS.match html.to_s
-    end
-
-  end
-
-  # Listings represents a parsed Craigslist listing page and is generally returned by CraigScrape.scrape_listing
-  class Listings < Scraper
-    LABEL = /^(.+?)[ ]*\-$/
-    LOCATION = /^[ ]*\((.*?)\)$/
-    IMG_TYPE = /^[ ]*(.+)[ ]*$/
-    HEADER_DATE = /^[ ]*[^ ]+[ ]+([^ ]+)[ ]+([^ ]+)[ ]*$/
-    SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
-    NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
-
-    # Array, PostSummary objects found in the listing
-    def posts
-      unless @posts
-        current_date = nil
-        @posts = []
-
-        post_tags = html.get_elements_by_tag_name('p','h4')
-
-        # The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our PostSummary output:
-        post_tags.pop if (
-          post_tags.length > 0 and
-          post_tags.last.at('a') and
-          NEXT_PAGE_LINK.match post_tags.last.at('a').inner_html
-        )
-
-        # Now we iterate though the listings:
-        post_tags.each do |el|
-          case el.name
-            when 'p'
-              post_summary = self.class.parse_summary el, current_date
-
-              # Validate that required fields are present:
-              parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
-
-              post_summary[:url] = url_from_href post_summary[:href]
-
-              @posts << CraigScrape::Posting.new(post_summary)
-            when 'h4'
-              # Let's make sense of the h4 tag, and then read all the p tags below it
-              if HEADER_DATE.match he_decode(el.inner_html)
-                # Generally, the H4 tags contain valid dates. When they do - this is easy:
-                current_date = CraigScrape.most_recently_expired_time $1, $2
-              elsif html.at('h4:last-of-type') == el
-                # There's a specific bug, where these nonsense h4's just appear without anything relevant inside them.
-                # They're safe to ignore if they're not the last h4 on the page. I fthey're the last h4 on the page,
-                # we need to pull up the full post in order to accurate tell the date.
-                # Setting this to nil will achieve the eager-load.
-                current_date = nil
-              end
-          end
-        end
-      end
-
-      @posts
-    end
-
-    # String, URL Path href-fragment of the next page link
-    def next_page_href
-      unless @next_page_href
-        cursor = html.at 'p:last-of-type'
-
-        cursor = cursor.at 'a' if cursor
-
-        # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
-        next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
-
-        # Search listings put their next page in a link towards the top
-        next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == 'Next>>' } unless next_link
-
-        # Some search pages have a bug, whereby a 'next page' link isn't displayed,
-        # even though we can see that theres another page listed in the page-number links block at the top
-        # and bottom of the listing page
-        unless next_link
-          cursor = html % 'div.sh:first-of-type > b:last-of-type'
-
-          # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
-          # We're looking good.
-          next_link = cursor.next_sibling if cursor and /^[\d]+$/.match cursor.inner_html
-        end
-
-        # We have an anchor tag - so - let's assign the href:
-        @next_page_href = next_link[:href] if next_link
-      end
-
-      @next_page_href
-    end
-
-    # String, Full URL Path of the 'next page' link
-    def next_page_url
-      (next_page_href) ? url_from_href(next_page_href) : nil
-    end
-
-    # Takes a paragraph element and returns a mostly-parsed Posting
-    # We separate this from the rest of the parsing both for readability and ease of testing
-    def self.parse_summary(p_element, date = nil) #:nodoc:
-      ret = {}
-
-      title_anchor, section_anchor = p_element.search 'a'
-      location_tag = p_element.at 'font'
-      has_pic_tag = p_element.at 'span'
-
-      href = nil
-
-      location = he_decode p_element.at('font').inner_html if location_tag
-      ret[:location] = $1 if location and LOCATION.match location
-
-      ret[:img_types] = []
-      if has_pic_tag
-        img_type = he_decode has_pic_tag.inner_html
-        img_type = $1.tr('^a-zA-Z0-9',' ') if IMG_TYPE.match img_type
-
-        ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
-      end
-
-      ret[:section] = he_decode(section_anchor.inner_html).split("\302\240").join(" ") if section_anchor
-
-      ret[:post_date] = date
-      if SUMMARY_DATE.match he_decode(p_element.children[0])
-        ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
-      end
-
-      if title_anchor
-        label = he_decode title_anchor.inner_html
-        ret[:label] = $1 if LABEL.match label
-
-        ret[:href] = title_anchor[:href]
-      end
-
-      ret
-    end
-  end
-
-  # GeoListings represents a parsed Craigslist geo lisiting page. (i.e. {'http://geo.craigslist.org/iso/us'}[http://geo.craigslist.org/iso/us])
-  # These list all the craigslist sites in a given region.
-  class GeoListings < Scraper
-    LOCATION_NAME = /[ ]*\>[ ](.+)[ ]*/
-    GEOLISTING_BASE_URL = %{http://geo.craigslist.org/iso/}
-
-    # The geolisting constructor works like all other Scraper objects, in that it accepts a string 'url'.
-    # In addition though, here we'll accept an array like %w(us fl) which gets converted to
-    # {'http://geo.craigslist.org/iso/us/fl'}[http://geo.craigslist.org/iso/us/fl]
-    def initialize(init_via = nil)
-      super init_via.kind_of?(Array) ? "#{GEOLISTING_BASE_URL}#{init_via.join '/'}" : init_via
-
-      # Validate that required fields are present, at least - if we've downloaded it from a url
-      parse_error! unless location
-    end
-
-    # Returns the GeoLocation's full name
-    def location
-      unless @name
-        cursor = html % 'h3 > b > a:first-of-type'
-        cursor = cursor.next_node if cursor
-        @name = $1 if cursor and LOCATION_NAME.match he_decode(cursor.to_s)
-      end
-
-      @name
-    end
-
-    # Returns a hash of site name to urls in the current listing
-    def sites
-      unless @sites
-        @sites = {}
-        (html / 'div#list > a').each do |el_a|
-          site_name = he_decode strip_html(el_a.inner_html)
-          @sites[site_name] = el_a[:href]
-        end
-      end
-
-      @sites
-    end
-  end
 end
\ No newline at end of file
diff --git a/lib/listings.rb b/lib/listings.rb
new file mode 100644
index 0000000..a7a63d5
--- /dev/null
+++ b/lib/listings.rb
@@ -0,0 +1,137 @@
+# TODO: file rdoc
+
+require 'scraper'
+
+class CraigScrape
+
+  # Listings represents a parsed Craigslist listing page and is generally returned by CraigScrape.scrape_listing
+  class Listings < Scraper
+    LABEL = /^(.+?)[ ]*\-$/
+    LOCATION = /^[ ]*\((.*?)\)$/
+    IMG_TYPE = /^[ ]*(.+)[ ]*$/
+    HEADER_DATE = /^[ ]*[^ ]+[ ]+([^ ]+)[ ]+([^ ]+)[ ]*$/
+    SUMMARY_DATE = /^[ ]([^ ]+)[ ]+([^ ]+)[ ]*[\-][ ]*$/
+    NEXT_PAGE_LINK = /^[ ]*next [\d]+ postings[ ]*$/
+
+    # Array, Posting objects found in the listing
+    def posts
+      unless @posts
+        current_date = nil
+        @posts = []
+
+        post_tags = html.get_elements_by_tag_name('p','h4')
+
+        # The last p in the list is sometimes a 'next XXX pages' link. We don't want to include this in our Posting output:
+        post_tags.pop if (
+          post_tags.length > 0 and
+          post_tags.last.at('a') and
+          NEXT_PAGE_LINK.match post_tags.last.at('a').inner_html
+        )
+
+        # Now we iterate through the listings:
+        post_tags.each do |el|
+          case el.name
+            when 'p'
+              post_summary = self.class.parse_summary el, current_date
+
+              # Validate that required fields are present:
+              parse_error! unless [post_summary[:label],post_summary[:href]].all?{|f| f and f.length > 0}
+
+              post_summary[:url] = url_from_href post_summary[:href]
+
+              @posts << CraigScrape::Posting.new(post_summary)
+            when 'h4'
+              # Let's make sense of the h4 tag, and then read all the p tags below it
+              if HEADER_DATE.match he_decode(el.inner_html)
+                # Generally, the H4 tags contain valid dates. When they do - this is easy:
+                current_date = CraigScrape.most_recently_expired_time $1, $2
+              elsif html.at('h4:last-of-type') == el
+                # There's a specific bug, where these nonsense h4's just appear without anything relevant inside them.
+                # They're safe to ignore if they're not the last h4 on the page. If they're the last h4 on the page,
+                # we need to pull up the full post in order to accurately tell the date.
+                # Setting this to nil will achieve the eager-load.
+                current_date = nil
+              end
+          end
+        end
+      end
+
+      @posts
+    end
+
+    # String, URL Path href-fragment of the next page link
+    def next_page_href
+      unless @next_page_href
+        cursor = html.at 'p:last-of-type'
+
+        cursor = cursor.at 'a' if cursor
+
+        # Category Listings have their 'next 100 postings' link at the end of the doc in a p tag
+        next_link = cursor if cursor and NEXT_PAGE_LINK.match cursor.inner_html
+
+        # Search listings put their next page in a link towards the top
+        next_link = (html / 'a').find{ |a| he_decode(a.inner_html) == 'Next>>' } unless next_link
+
+        # Some search pages have a bug, whereby a 'next page' link isn't displayed,
+        # even though we can see that there's another page listed in the page-number links block at the top
+        # and bottom of the listing page
+        unless next_link
+          cursor = html % 'div.sh:first-of-type > b:last-of-type'
+
+          # If there's no 'a' in the next sibling, we'll have just performed a nil assignment, otherwise
+          # We're looking good.
+          next_link = cursor.next_sibling if cursor and /^[\d]+$/.match cursor.inner_html
+        end
+
+        # We have an anchor tag - so - let's assign the href:
+        @next_page_href = next_link[:href] if next_link
+      end
+
+      @next_page_href
+    end
+
+    # String, Full URL Path of the 'next page' link
+    def next_page_url
+      (next_page_href) ? url_from_href(next_page_href) : nil
+    end
+
+    # Takes a paragraph element and returns a mostly-parsed Posting
+    # We separate this from the rest of the parsing both for readability and ease of testing
+    def self.parse_summary(p_element, date = nil) #:nodoc:
+      ret = {}
+
+      title_anchor, section_anchor = p_element.search 'a'
+      location_tag = p_element.at 'font'
+      has_pic_tag = p_element.at 'span'
+
+      href = nil
+
+      location = he_decode p_element.at('font').inner_html if location_tag
+      ret[:location] = $1 if location and LOCATION.match location
+
+      ret[:img_types] = []
+      if has_pic_tag
+        img_type = he_decode has_pic_tag.inner_html
+        img_type = $1.tr('^a-zA-Z0-9',' ') if IMG_TYPE.match img_type
+
+        ret[:img_types] = img_type.split(' ').collect{|t| t.to_sym}
+      end
+
+      ret[:section] = he_decode(section_anchor.inner_html).split("\302\240").join(" ") if section_anchor
+
+      ret[:post_date] = date
+      if SUMMARY_DATE.match he_decode(p_element.children[0])
+        ret[:post_date] = CraigScrape.most_recently_expired_time $1, $2.to_i
+      end
+
+      if title_anchor
+        label = he_decode title_anchor.inner_html
+        ret[:label] = $1 if LABEL.match label
+
+        ret[:href] = title_anchor[:href]
+      end
+
+      ret
+    end
+  end
+end
\ No newline at end of file
diff --git a/lib/posting.rb b/lib/posting.rb
new file mode 100644
index 0000000..24fa08a
--- /dev/null
+++ b/lib/posting.rb
@@ -0,0 +1,291 @@
+# TODO: file rdoc
+
+require 'scraper'
+
+class CraigScrape
+
+  # Posting represents a fully downloaded, and parsed, Craigslist post.
+  # This class is generally returned by the listing scrape methods, and
+  # contains the post summaries for a specific search url, or a general listing category
+  class Posting < Scraper
+
+    POST_DATE = /Date:[^\d]*((?:[\d]{2}|[\d]{4})\-[\d]{1,2}\-[\d]{1,2}[^\d]+[\d]{1,2}\:[\d]{1,2}[ ]*[AP]M[^a-z]+[a-z]+)/i
+    LOCATION = /Location\:[ ]+(.+)/
+    HEADER_LOCATION = /^.+[ ]*\-[ ]*[\$]?[\d]+[ ]*\((.+)\)$/
+    POSTING_ID = /PostingID\:[ ]+([\d]+)/
+    REPLY_TO = /(.+)/
+    PRICE = /((?:^\$[\d]+(?:\.[\d]{2})?)|(?:\$[\d]+(?:\.[\d]{2})?$))/
+    USERBODY_PARTS = /\<div id\=\"userbody\"\>(.+)\<br[ ]*[\/]?\>\<br[ ]*[\/]?\>(.+)\<\/div\>/m
+    IMAGE_SRC = /\<img[^\>]*src=(?:\'([^\']+)\'|\"([^\"]+)\"|([^ ]+))[^\>]*\>/
+
+    # This is really just for testing, in production use, uri.path is a better solution
+    attr_reader :href #:nodoc:
+
+    # Create a new Post via a url (String), or supplied parameters (Hash)
+    def initialize(*args)
+      super(*args)
+
+      # Validate that required fields are present, at least - if we've downloaded it from a url
+      parse_error! if args.first.kind_of? String and !flagged_for_removal? and !deleted_by_author? and [
+        contents,posting_id,post_time,header,title,full_section
+      ].any?{|f| f.nil? or (f.respond_to? :length and f.length == 0)}
+    end
+
+
+    # String, The contents of the item's html body heading
+    def header
+      unless @header
+        h2 = html.at 'h2' if html
+        @header = he_decode h2.inner_html if h2
+      end
+
+      @header
+    end
+
+    # String, the item's title
+    def title
+      unless @title
+        title_tag = html.at 'title' if html
+        @title = he_decode title_tag.inner_html if title_tag
+        @title = nil if @title and @title.length == 0
+      end
+
+      @title
+    end
+
+    # Array, hierarchical representation of the post's section
+    def full_section
+      unless @full_section
+        @full_section = []
+
+        (html/"div[@class='bchead']//a").each do |a|
+          @full_section << he_decode(a.inner_html) unless a['id'] and a['id'] == 'ef'
+        end if html
+      end
+
+      @full_section
+    end
+
+    # String, represents the post's reply-to address, if listed
+    def reply_to
+      unless @reply_to
+        cursor = html.at 'hr' if html
+        cursor = cursor.next_sibling until cursor.nil? or cursor.name == 'a'
+        @reply_to = $1 if cursor and REPLY_TO.match he_decode(cursor.inner_html)
+      end
+
+      @reply_to
+    end
+
+    # Time, reflects the full timestamp of the posting
+    def post_time
+      unless @post_time
+        cursor = html.at 'hr' if html
+        cursor = cursor.next_node until cursor.nil? or POST_DATE.match cursor.to_s
+        @post_time = Time.parse $1 if $1
+      end
+
+      @post_time
+    end
+
+    # Integer, Craigslist's unique posting id
+    def posting_id
+      unless @posting_id
+        cursor = (html/"#userbody").first if html
+        cursor = cursor.next_node until cursor.nil? or POSTING_ID.match cursor.to_s
+        @posting_id = $1.to_i if $1
+      end
+
+      @posting_id
+    end
+
+    # String, The full-html contents of the post
+    def contents
+      unless @contents
+        @contents = user_body if html
+        @contents = he_decode @contents.strip if @contents
+      end
+
+      @contents
+    end
+
+    # String, the location of the item, as best could be parsed
+    def location
+      if @location.nil? and craigslist_body and html
+        # Location (when explicitly defined):
+        cursor = craigslist_body.at 'ul' unless @location
+
+        # Apa section includes other things in the li's (cats/dogs ok fields)
+        cursor.children.each do |li|
+          if LOCATION.match li.inner_html
+            @location = he_decode($1) and break
+            break
+          end
+        end if cursor
+
+        # Real estate listings can work a little differently for location:
+        unless @location
+          cursor = craigslist_body.at 'small'
+          cursor = cursor.previous_node until cursor.nil? or cursor.text?
+
+          @location = he_decode(cursor.to_s.strip) if cursor
+        end
+
+        # So, *sometimes* the location just ends up being in the header, I don't know why:
+        @location = $1 if @location.nil? and HEADER_LOCATION.match header
+      end
+
+      @location
+    end
+
+    # Array, urls of the post's images that are *not* hosted on craigslist
+    def images
+      # Keep in mind that when users post html to craigslist, they're often not posting wonderful html...
+      @images = (
+        contents ?
+        contents.scan(IMAGE_SRC).collect{ |a| a.find{|b| !b.nil? } } :
+        []
+      ) unless @images
+
+      @images
+    end
+
+    # Array, urls of the post's craigslist-hosted images
+    def pics
+      unless @pics
+        @pics = []
+
+        if html and craigslist_body
+          # Now let's find the craigslist hosted images:
+          img_table = (craigslist_body / 'table').find{|e| e.name == 'table' and e[:summary] == 'craigslist hosted images'}
+
+          @pics = (img_table / 'img').collect{|i| i[:src]} if img_table
+        end
+      end
+
+      @pics
+    end
+
+    # Returns true if this Post was parsed, and merely a 'Flagged for Removal' page
+    def flagged_for_removal?
+      @flagged_for_removal = (
+        system_post? and header_as_plain == "This posting has been flagged for removal"
+      ) if @flagged_for_removal.nil?
+
+      @flagged_for_removal
+    end
+
+    # Returns true if this Post was parsed, and represents a 'This posting has been deleted by its author.' notice
+    def deleted_by_author?
+      @deleted_by_author = (
+        system_post? and header_as_plain == "This posting has been deleted by its author."
+      ) if @deleted_by_author.nil?
+
+      @deleted_by_author
+    end
+
+
+    # Reflects only the date portion of the posting. Does not include hours/minutes. This is useful when reflecting the listing scrapes, and can be safely
+    # used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
+    def post_date
+      @post_date = Time.local(*[0]*3+post_time.to_a[3...10]) unless @post_date or post_time.nil?
+
+      @post_date
+    end
+
+    # Returns the post label. The label would appear at first glance to be identical to the header - but it's not.
+    # The label is cited on the listings pages, and generally includes everything in the header - with the exception of the location.
+    # Sometimes there's additional information, i.e. '(map)' on rea listings, included in the header that isn't to be listed in the label.
+    # This is also used as a bandwidth shortcut for the craigwatch program, and is a guaranteed identifier for the post, that won't result
+    # in a full page load from the post's url.
+    def label
+      unless @label or system_post?
+        @label = header
+
+        @label = $1 if location and /(.+?)[ ]*\(#{location}\).*?$/.match @label
+      end
+
+      @label
+    end
+
+    # Array, which image types are listed for the post.
+    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+    def img_types
+      unless @img_types
+        @img_types = []
+
+        @img_types << :img if images.length > 0
+        @img_types << :pic if pics.length > 0
+      end
+
+      @img_types
+    end
+
+    # Retrieves the most-relevant craigslist 'section' of the post. This is *generally* the same as full_section.last. However,
+    # this (sometimes/rarely) conserves bandwidth by pulling this field from the listing post-summary
+    def section
+      unless @section
+        @section = full_section.last if full_section
+      end
+
+      @section
+    end
+
+    # true if post summary has 'img(s)'. 'imgs' are different than pics, in that the resource is *not* hosted on craigslist's server.
+    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+    def has_img?
+      img_types.include? :img
+    end
+
+    # true if post summary has 'pic(s)'. 'pics' are different than imgs, in that craigslist is hosting the resource on craigslist's servers
+    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+    def has_pic?
+      img_types.include? :pic
+    end
+
+    # true if post summary has either the img or pic label
+    # This is always able to be pulled from the listing post-summary, and should never cause an additional page load
+    def has_pic_or_img?
+      img_types.length > 0
+    end
+
+    # Returns the best-guess of a price, judging by the label's contents. Price is available when pulled from the listing summary
+    # and can be safely used if you wish to conserve bandwidth by not pulling an entire post from a listing scrape.
+    def price
+      $1.tr('$','').to_f if label and PRICE.match label
+    end
+
+    # Returns the post contents with all html tags removed
+    def contents_as_plain
+      strip_html contents
+    end
+
+    # Returns the header with all html tags removed. Granted, the header should usually be plain, but in the case of a
+    # 'system_post' we may get tags in here
+    def header_as_plain
+      strip_html header
+    end
+
+    # Some posts (deleted_by_author, flagged_for_removal) are common template posts that craigslist puts up in lieu of an original
+    # This returns true or false if that case applies
+    def system_post?
+      [contents,posting_id,post_time,title].all?{|f| f.nil?}
+    end
+
+    private
+
+    # OK - so the biggest problem parsing the contents of a craigslist post is that users post invalid html all over the place
+    # This bad html trips up hpricot, and I've resorted to splitting the page up using string parsing like so:
+    # We return this as a string, since it makes sense, and since it's tough to say how hpricot might mangle this if the html is whack
+    def user_body
+      $1 if USERBODY_PARTS.match html.to_s
+    end
+
+    # Read the notes on user_body. However, unlike the user_body, the craigslist portion of this div can be relied upon to be valid html.
+    # So - we'll return it as an Hpricot object.
+    def craigslist_body
+      Hpricot.parse $2 if USERBODY_PARTS.match html.to_s
+    end
+
+  end
+end
\ No newline at end of file
diff --git a/lib/scraper.rb b/lib/scraper.rb
new file mode 100644
index 0000000..c14fea5
--- /dev/null
+++ b/lib/scraper.rb
@@ -0,0 +1,174 @@
+# TODO: file rdoc
+
+require 'net/http'
+require 'zlib'
+
+require 'rubygems'
+require 'activesupport'
+require 'hpricot'
+require 'htmlentities'
+
+class CraigScrape
+
+  # Scraper is a general-purpose base class for all libcraigscrape Objects. Scraper facilitates all http-related
+  # functionality, and adds some useful helpers for dealing with eager-loading of http-objects and general html
+  # methods. It also contains the http-related cattr_accessors:
+  #
+  # *logger* - a Logger object to log http debug notices to. Defaults to nil
+  #
+  # *retries_on_fetch_fail* - The number of times to retry a failed uri download. Defaults to 4
+  #
+  # *sleep_between_fetch_retries* - The number of seconds to sleep between successive attempts after a failed download. Defaults to 15.
+  class Scraper
+    cattr_accessor :logger
+    cattr_accessor :sleep_between_fetch_retries
+    cattr_accessor :retries_on_fetch_fail
+
+    URL_PARTS = /^(?:([^\:]+)\:\/\/([^\/]*))?(.*)$/
+    HTML_TAG = /<\/?[^>]*>/
+
+    # Returns the full url that corresponds to this resource
+    attr_reader :url
+
+    # Set some defaults:
+    self.retries_on_fetch_fail = 4
+    self.sleep_between_fetch_retries = 15
+
+    class BadConstructionError < StandardError #:nodoc:
+    end
+
+    class ParseError < StandardError #:nodoc:
+    end
+
+    class BadUrlError < StandardError #:nodoc:
+    end
+
+    class FetchError < StandardError #:nodoc:
+    end
+
+    # Scraper Objects can be created from either a full URL (string), or a Hash.
+    # Currently, this initializer isn't intended to be called from libcraigslist API users, though
+    # if you know what you're doing - feel free to try this out.
+    #
+    # A (string) url can be passed in a 'http://' scheme or a 'file://' scheme.
+    #
+    # When constructing from a hash, the keys in the hash will be used to set the object's corresponding values.
+    # This is useful to create an object without actually making an html request; this is used to set up an
+    # object before it eager-loads any values not already passed in by the constructor hash. Though optional, if
+    # you're going to be setting this object up for eager-loading, be sure to pass in a :url key in your hash,
+    # Otherwise this will fail to eager load.
+    def initialize(init_via = nil)
+      if init_via.nil?
+        # Do nothing - possibly not a great idea, but we'll allow it
+      elsif init_via.kind_of? String
+        @url = init_via
+      elsif init_via.kind_of? Hash
+        init_via.each_pair{|k,v| instance_variable_set "@#{k}", v}
+      else
+        raise BadConstructionError, ("Unrecognized parameter passed to %s.new %s" % [self.class.to_s, init_via.class.inspect])
+      end
+    end
+
+    # Indicates whether the resource has yet been retrieved from its associated url.
+    # This is useful to distinguish whether the instance was instantiated for the purpose of an eager-load,
+    # but hasn't yet been fetched.
+    def downloaded?; !@html.nil?; end
+
+    # A URI object corresponding to this scraped URL
+    def uri
+      @uri ||= URI.parse @url if @url
+      @uri
+    end
+
+    private
+
+    # Returns text with all html tags removed.
+    def strip_html(str)
+      str.gsub HTML_TAG, "" if str
+    end
+
+    # Easy way to fail noisily:
+    def parse_error!; raise ParseError, "Error while parsing %s:\n %s" % [self.class.to_s, html]; end
+
+    # Returns text with all html entities converted to the respective ascii character.
+    def he_decode(text); self.class.he_decode text; end
+
+    # Returns text with all html entities converted to the respective ascii character.
+    def self.he_decode(text); HTMLEntities.new.decode text; end
+
+    # Derives a full url, using the current object's url and the provided href
+    def url_from_href(href) #:nodoc:
+      scheme, host, path = $1, $2, $3 if URL_PARTS.match href
+
+      scheme = uri.scheme if scheme.nil? or scheme.empty? and uri.respond_to? :scheme
+
+      host = uri.host if host.nil? or host.empty? and uri.respond_to? :host
+
+      path = (
+        (/\/$/.match(uri.path)) ?
+        '%s%s' % [uri.path,path] :
+        '%s/%s' % [File.dirname(uri.path),path]
+      ) unless /^\//.match path
+
+      '%s://%s%s' % [scheme, host, path]
+    end
+
+    def fetch_uri(uri)
+      logger.info "Requesting: %s" % @url if logger
+
+      case uri.scheme
+        when 'file'
+          File.read uri.path
+        when /^http[s]?/
+          fetch_http uri
+        else
+          raise BadUrlError, "Unknown URI scheme for the url: #{@url}"
+      end
+    end
+
+    def fetch_http(uri)
+      fetch_attempts = 0
+
+      begin
+        # This handles the redirects for us
+        resp, data = Net::HTTP.new( uri.host, uri.port).get uri.request_uri, nil
+
+        if resp.response.code == "200"
+          # Check for gzip, and decode:
+          data = Zlib::GzipReader.new(StringIO.new(data)).read if resp.response.header['Content-Encoding'] == 'gzip'
+
+          data
+        elsif resp.response['Location']
+          redirect_to = resp.response['Location']
+
+          fetch_uri URI.parse(url_from_href(redirect_to))
+        else
+          # Sometimes Craigslist seems to return 404's for no good reason, and a subsequent fetch will give you what you want
+          error_description = 'Unable to fetch "%s" (%s)' % [ @url, resp.response.code ]
+
+          logger.info error_description if logger
+
+          raise FetchError, error_description
+        end
+      rescue FetchError,Timeout::Error,Errno::ECONNRESET => err
+        logger.info 'Timeout error while requesting "%s"' % @url if logger and err.class == Timeout::Error
+        logger.info 'Connection reset while requesting "%s"' % @url if logger and err.class == Errno::ECONNRESET
+
+        fetch_attempts += 1
+
+        if fetch_attempts <= self.retries_on_fetch_fail
+          sleep self.sleep_between_fetch_retries if self.sleep_between_fetch_retries
+          logger.info 'Retrying fetch ....' if logger
+          retry
+        else
+          raise err
+        end
+      end
+    end
+
+    def html
+      @html ||= Hpricot.parse fetch_uri(uri) if uri
+      @html
+    end
+  end
+end
\ No newline at end of file
diff --git a/roodi.yml b/roodi.yml
new file mode 100644
index 0000000..adb3796
--- /dev/null
+++ b/roodi.yml
@@ -0,0 +1,15 @@
+# AssignmentInConditionalCheck: { }
+# CaseMissingElseCheck: { }
+ClassLineCountCheck: { line_count: 300 }
+ClassNameCheck: { pattern: !ruby/regexp /^[A-Z][a-zA-Z0-9]*$/ }
+# ClassVariableCheck: { }
+CyclomaticComplexityBlockCheck: { complexity: 12 }
+CyclomaticComplexityMethodCheck: { complexity: 14 }
+EmptyRescueBodyCheck: { }
+ForLoopCheck: { }
+MethodLineCountCheck: { line_count: 50 }
+MethodNameCheck: { pattern: !ruby/regexp /^[_a-z<>=\[|+-\/\*`]+[_a-z0-9_<>=~@\[\]]*[=!\?]?$/ }
+ModuleLineCountCheck: { line_count: 300 }
+ModuleNameCheck: { pattern: !ruby/regexp /^[A-Z][a-zA-Z0-9]*$/ }
+ParameterNumberCheck: { parameter_count: 5 }
+
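
Reviewer's note (not part of the patch): here's a quick, hypothetical smoke-test sketch
showing how the refactored objects fit together once this is applied. The 'us fl' geo
path and the listing url are illustrative only; every method used below (the array form
of GeoListings.new, location, sites, Listings#posts, Posting#post_date and #label, and
next_page_href/next_page_url) comes straight from the files added above.

  #!/usr/bin/env ruby
  # smoke_test.rb - a hypothetical script, not part of the patch.
  # Assumes the patched lib/ directory is on the load path:
  $LOAD_PATH.unshift File.join(File.dirname(__FILE__), 'lib')

  require 'libcraigscrape'

  # The new GeoListings constructor accepts an array of geo path parts
  # (see lib/geo_listings.rb), in addition to a plain url string:
  geo = CraigScrape::GeoListings.new %w(us fl)
  puts "Region: %s" % geo.location
  geo.sites.each { |name, url| puts "  %s => %s" % [name, url] }

  # Listings and Posting now live in their own files, but scrape as in 0.7.0.
  # 'http://miami.craigslist.org/sys/' is just an example listing url:
  listing = CraigScrape::Listings.new 'http://miami.craigslist.org/sys/'
  listing.posts.each do |post|
    puts "%s: %s" % [post.post_date, post.label]
  end
  puts "next page: %s" % listing.next_page_url if listing.next_page_href

The new static-analysis tasks run like any other rake task: 'rake code_tests' for the
whole suite, or 'rake code_tests:flog', 'rake code_tests:flay', and 'rake code_tests:roodi'
individually.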