Docs & Refactor
buren committed Apr 2, 2015
1 parent 94dc9b4 commit ea433be
Showing 8 changed files with 99 additions and 56 deletions.
11 changes: 9 additions & 2 deletions README.md
@@ -1,10 +1,11 @@
# SiteMapper

[![Code Climate](https://codeclimate.com/github/buren/site_mapper.png)](https://codeclimate.com/github/buren/site_mapper)
[![Gem Version](https://badge.fury.io/rb/site_mapper.svg)](http://badge.fury.io/rb/site_mapper)
[![Coverage Status](https://img.shields.io/coveralls/buren/site_mapper.svg)](https://coveralls.io/r/buren/site_mapper)
[![Docs badge](https://inch-ci.org/github/buren/site_mapper.svg?branch=master)](http://www.rubydoc.info/github/buren/site_mapper/master)
[![Build Status](https://travis-ci.org/buren/site_mapper.svg?branch=master)](https://travis-ci.org/buren/site_mapper)
[![Dependency Status](https://gemnasium.com/buren/site_mapper.svg)](https://gemnasium.com/buren/site_mapper)
[![Coverage Status](https://img.shields.io/coveralls/buren/site_mapper.svg)](https://coveralls.io/r/buren/site_mapper)
[![Gem Version](https://badge.fury.io/rb/site_mapper.svg)](http://badge.fury.io/rb/site_mapper)

Map all links on a given site.
SiteMapper will try to respect `/robots.txt`
@@ -45,6 +46,8 @@ end

## Contributing

Contributions, feedback and suggestions are very welcome.

1. Fork it
2. Create your feature branch (`git checkout -b my-new-feature`)
3. Commit your changes (`git commit -am 'Add some feature'`)
@@ -54,3 +57,7 @@ end
## Notes

* Special thanks to the [robots](https://rubygems.org/gems/robots) gem, which provided the bulk of the code in `lib/robots.rb`

## License

[MIT License](LICENSE)
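
For reference, a minimal usage sketch assembled from the `@example` tags this commit adds to `lib/site_mapper.rb` (next file in the diff); `MyLogger` is the illustrative logger class from those examples, not a class shipped with the gem.

```ruby
require 'site_mapper'

# Collect all URLs from example.com with a custom User-agent.
SiteMapper.map('example.com', user_agent: 'MyUserAgent') do |url|
  puts url
end

# Use a custom logger class; it only needs to respond to .log and .err_log.
class MyLogger
  def self.log(msg); puts msg; end
  def self.err_log(msg); puts msg; end
end
SiteMapper.map('example.com', logger: MyLogger)
```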
15 changes: 12 additions & 3 deletions lib/site_mapper.rb
@@ -20,10 +20,19 @@ module SiteMapper
# @param [String] link to domain
# @param [Hash] options hash
# @example Collect all URLs from example.com
# SiteMapper.map('example.com')
# SiteMapper.map('example.com')
# @example Collect all URLs from example.com with custom User-agent
# SiteMapper.map('example.com', user_agent: 'MyUserAgent')
# @example Collect all URLs from example.com with custom logger class
# class MyLogger
# def self.log(msg); puts msg;end
# def self.err_log(msg); puts msg;end
# end
# SiteMapper.map('example.com', logger: MyLogger)
def self.map(link, options = {})
set_logger(options[:logger])
Crawler.collect_urls(link) { |url| yield(url) if block_given? }
set_logger(options.delete(:logger))
options = { user_agent: USER_AGENT }.merge(options)
Crawler.collect_urls(link, options) { |url| yield(url) if block_given? }
end

# Set logger.
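
A small sketch (not part of the diff) of the option handling `SiteMapper.map` now performs: the `:logger` key is consumed by `set_logger` and deleted from the hash, and a default `:user_agent` is merged in before the remaining options are passed on to `Crawler.collect_urls`. The option values and `MyLogger` are illustrative.

```ruby
options = { logger: MyLogger, max_requests: 10 }  # caller-supplied options
logger  = options.delete(:logger)                 # => MyLogger; key removed from the hash
options = { user_agent: SiteMapper::USER_AGENT }.merge(options)
options # => { user_agent: SiteMapper::USER_AGENT, max_requests: 10 } -- what Crawler receives
```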
19 changes: 13 additions & 6 deletions lib/site_mapper/crawl_url.rb
@@ -3,11 +3,16 @@ module SiteMapper
class CrawlUrl
attr_reader :resolved_base_url, :base_hostname

# Too many requests error message
TOO_MANY_REQUEST_MSG = "You're being challenged with a 'too many requests' captcha"

# @param [String] base_url
def initialize(base_url)
@resolved_base_url = Request.resolve_url(base_url, with_query: false)
uri = URI.parse(Request.resolve_url(base_url))
host = uri.hostname
protocol = uri.port == 443 ? 'https://' : 'http://'
@resolved_base_url = "#{protocol}#{host}"
@base_hostname = URI.parse(@resolved_base_url).hostname
@resolved_base_url.prepend('http://') unless @resolved_base_url.start_with?('http')
end

# Given a link, it constructs the absolute path,
@@ -20,11 +25,11 @@ def initialize(base_url)
# cu.absolute_url_from('/path', 'example.com/some/path')
# # => http://example.com/some/path
def absolute_url_from(raw_url, get_url)
return nil unless eligible_url?(raw_url)
parsed_url = URI.parse(raw_url) rescue URI.parse('')
if parsed_url.relative?
return unless eligible_url?(raw_url)
parsed_url = URI.parse(raw_url) rescue false
if parsed_url && parsed_url.relative?
url_from_relative(raw_url, get_url)
elsif same_domain?(raw_url, @resolved_base_url)
elsif parsed_url && same_domain?(raw_url, @resolved_base_url)
raw_url
else
nil
@@ -70,8 +75,10 @@ def eligible_url?(href)
return false if href.nil? || href.empty?
dont_start = %w(javascript: callto: mailto: tel: skype: facetime: wtai: #)
dont_include = %w(/email-protection#)
err_include = %w(/sorry/IndexRedirect?)
dont_end = %w(.zip .rar .pdf .exe .dmg .pkg .dpkg .bat)

err_include.each { |pattern| fail TOO_MANY_REQUEST_MSG if href.include?(pattern) }
dont_start.each { |pattern| return false if href.start_with?(pattern) }
dont_include.each { |pattern| return false if href.include?(pattern) }
dont_end.each { |pattern| return false if href.end_with?(pattern) }
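
A short sketch of how `CrawlUrl#absolute_url_from` behaves, based on the doc example and the filters in `#eligible_url?` shown above; the URLs are illustrative.

```ruby
cu = SiteMapper::CrawlUrl.new('example.com')

cu.absolute_url_from('/path', 'example.com/some/path')
# => 'http://example.com/some/path'   (relative link; example from the doc comment in this hunk)

cu.absolute_url_from('mailto:user@example.com', 'example.com')
# => nil                              (filtered out by #eligible_url?)

cu.absolute_url_from('http://other-domain.com/page', 'example.com')
# => nil                              (different domain than the base URL)
```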
45 changes: 35 additions & 10 deletions lib/site_mapper/crawler.rb
@@ -4,20 +4,37 @@
module SiteMapper
# Crawls a given site.
class Crawler
# Default options
OPTIONS = {
resolve: false,
sleep_length: 0.5,
max_requests: Float::INFINITY
}

# @param [String] url base url for crawler
# @param [Hash] options hash, resolve key (optional, false by default);
#   add user_agent key to specify a custom User-agent
# @example Create crawler with custom User-agent
# Crawler.new('example.com', user_agent: 'MyUserAgent')
# @example Create crawler and resolve all urls
# Crawler.new('example.com', resolve: true)
# @example Create crawler and sleep 1 second between each request
# Crawler.new('example.com', sleep_length: 1)
# @example Create crawler and perform max 3 requests
# Crawler.new('example.com', max_requests: 3)
def initialize(url, options = {})
@base_url = Request.resolve_url(url)
@options = { resolve: false }.merge(options)
@options = OPTIONS.dup.merge(options)
@user_agent = @options.fetch(:user_agent)
@crawl_url = CrawlUrl.new(@base_url)
@fetch_queue = CrawlQueue.new
@processed = Set.new
@robots = nil
end

# @see #collect_urls
def self.collect_urls(base_url)
new(base_url).collect_urls { |url| yield(url) }
def self.collect_urls(*args)
new(*args).collect_urls { |url| yield(url) }
end

# Collects all links on the domain.
@@ -32,13 +49,16 @@ def self.collect_urls(base_url)
# end
def collect_urls
@fetch_queue << @crawl_url.resolved_base_url
until @fetch_queue.empty?
until @fetch_queue.empty? || @processed.length >= @options[:max_requests]
url = @fetch_queue.pop
yield(url)
page_links(url)
end
Logger.log "Crawling finished, #{@processed.length} links found"
@processed.to_a
result = @processed + @fetch_queue
Logger.log "Crawling finished:"
Logger.log "Processed links: #{@processed.length}"
Logger.log "Found links: #{result.length}"
result.to_a
rescue Interrupt, IRB::Abort
Logger.err_log 'Crawl interrupted.'
@fetch_queue.to_a
@@ -48,7 +68,8 @@ def collect_urls

def page_links(get_url)
Logger.log "Queue length: #{@fetch_queue.length}, Parsing: #{get_url}"
link_elements = Request.get_page(get_url).css('a') rescue []
link_elements = Request.document(get_url, user_agent: @options[:user_agent]).css('a')
wait
@processed << get_url
link_elements.each do |page_link|
url = @crawl_url.absolute_url_from(page_link.attr('href'), get_url)
@@ -62,7 +83,7 @@ def eligible_for_queue?(url)

def robots
return @robots unless @robots.nil?
robots_body = Request.get_response_body("#{@base_url}/robots.txt")
robots_body = Request.response_body("#{@base_url}/robots.txt", user_agent: @options[:user_agent])
@robots = Robots.new(robots_body, URI.parse(@base_url).host, SiteMapper::USER_AGENT)
@robots
end
@@ -71,13 +92,17 @@ def resolve(url)
@options[:resolve] ? Request.resolve_url(url) : url
end

def wait
sleep @options[:sleep_length]
end

# Queue of urls to be crawled.
class CrawlQueue
# @return [Set] that extends the EnumerablePop module
def self.new
Set.new.extend(EnumerablePop)
end

# Adds a #pop method to the extending class.
# The class that extends this module needs to implement #first and #delete.
module EnumerablePop
@@ -90,5 +115,5 @@ def pop
end
end
end
end
end
end
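
A sketch combining the `@example` tags added to `Crawler` above; the option values are illustrative, and the keys in `OPTIONS` fall back to their defaults when omitted.

```ruby
crawler = SiteMapper::Crawler.new('example.com',
  user_agent:   'MyUserAgent', # sent as the User-Agent header on every request
  resolve:      false,         # don't resolve found URLs (default)
  sleep_length: 1,             # sleep 1 second between requests (default 0.5)
  max_requests: 3)             # stop after 3 processed pages (default unlimited)

urls = crawler.collect_urls { |url| puts url } # yields each URL before it is fetched
```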
55 changes: 28 additions & 27 deletions lib/site_mapper/request.rb
@@ -11,61 +11,62 @@ class Request
class << self
# Given a URL, get it and parse it with Nokogiri::HTML.
# @param [String] url
# @param [Hash] options
# @return [Nokogiri::HTML] a nokogiri HTML object
def get_page(url)
Nokogiri::HTML(Request.get_response_body(url))
def document(url, options = {})
Nokogiri::HTML(Request.response_body(url, options))
end

# Given a URL, get the response.
# @param [String] url
# @param [Boolean] resolve (optional and false by default)
# @param [Hash] options
# @return [Net::HTTPOK] if the response is successful, raises an error otherwise
# @example get example.com and resolve the URL
# Request.get_response('example.com', true)
# Request.response('example.com', resolve: true)
# @example get example.com and do *not* resolve the URL
# Request.get_response('http://example.com')
# Request.get_response('http://example.com', false)
def get_response(url, resolve = false)
resolved_url = resolve ? resolve_url(url) : url
# Request.response('http://example.com')
# @example get example.com and resolve the URL
# Request.response('http://example.com', resolve: true)
# @example get example.com and resolve the URL and use a custom User-Agent
# Request.response('http://example.com', resolve: true, user_agent: 'MyUserAgent')
def response(url, options = {})
options = {
resolve: false,
user_agent: SiteMapper::USER_AGENT
}.merge(options)
resolved_url = options[:resolve] ? resolve_url(url) : url
uri = URI.parse(resolved_url)
http = Net::HTTP.new(uri.host, uri.port)
http.use_ssl = true if resolved_url.include?('https://')
http.use_ssl = true if resolved_url.start_with?('https://')

request = Net::HTTP::Get.new(uri.request_uri)
request['User-Agent'] = SiteMapper::USER_AGENT
request['User-Agent'] = options[:user_agent]
http.request(request)
end

# Get the response body.
# @see Request#get_response
def get_response_body(*args)
get_response(*args).body rescue nil
# @see Request#response
def response_body(*args)
response(*args).body
end

# Resolves a URL string and follows redirects.
# If the URL can't be resolved the original URL is returned.
# @param [String] url
# @param [Hash] options hash, with_query key (optional and true by default)
# @param [String] url to resolve
# @return [String] a URL string that potentially is a redirected URL
# @example Resolve google.com
# resolve_url('google.com')
# # => 'https://www.google.com'
def resolve_url(url, options = {})
options = { with_query: true }.merge(options)
def resolve_url(url)
resolved = UrlResolver.resolve(url)
resolved = remove_query(resolved) unless options[:with_query]
resolved = resolved.prepend('http://') unless has_protocol?(resolved)
resolved
end

# Removes query string from URL string.
# @param [String] url
# @return [String] an URL string without query
# @example Removes query string
# remove_query('example.com/path?q=keyword')
# # => 'example.com/path'
def remove_query(url)
index = url.index('?')
index.nil? ? url : url[0...index]
private

def has_protocol?(url)
url.start_with?('https://') || url.start_with?('http://')
end
end
end
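
A sketch of the renamed `Request` API from this hunk (`get_page` → `document`, `get_response` → `response`, `get_response_body` → `response_body`), mirroring how `Crawler` uses it; the URL and User-Agent string are illustrative.

```ruby
# Fetch and parse a page, then walk its links (what Crawler#page_links does).
doc = SiteMapper::Request.document('http://example.com', user_agent: 'MyUserAgent')
doc.css('a').each { |link| puts link.attr('href') }

# Fetch a raw body, e.g. robots.txt (what Crawler#robots does).
body = SiteMapper::Request.response_body('http://example.com/robots.txt',
                                         user_agent: 'MyUserAgent')
```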
6 changes: 0 additions & 6 deletions lib/site_mapper/robots.rb
@@ -134,8 +134,6 @@ def allowed?(uri)
host = uri.host
@parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
@parsed[host].allowed?(uri, @user_agent)
rescue
true
end

# @return [Array] array of sitemaps defined in robots.txt
@@ -146,8 +144,6 @@ def sitemaps
host = @hostname
@parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
@parsed[host].sitemaps
rescue
[]
end

# @param [String, URI] uri String or URI get other_values from
Expand All @@ -159,8 +155,6 @@ def other_values
host = @hostname
@parsed[host] ||= ParsedRobots.new(@robots_txt, @user_agent)
@parsed[host].other_values
rescue
{}
end

private
2 changes: 1 addition & 1 deletion lib/site_mapper/version.rb
@@ -1,4 +1,4 @@
module SiteMapper
# Gem version
VERSION = '0.0.10'
VERSION = '0.0.11'
end
2 changes: 1 addition & 1 deletion spec/site_mapper/logger_spec.rb
@@ -7,7 +7,7 @@ def self.err_log(msg);"err: #{msg}";end

describe SiteMapper::Logger do
before(:all) { SiteMapper::Logger.use_logger(TestLogger) }

let(:logger) { SiteMapper::Logger }
let(:system_logger) { SiteMapper::Logger::SystemOutLogger }
let(:nil_logger) { SiteMapper::Logger::NilLogger }
