Permalink
Browse files

Version bump to 0.1.0

  • Loading branch information...
1 parent 7b83c14 commit 99e1aaa11990bb11c75e15a0631e54f014207540 @alexrabarts committed Mar 10, 2009
Showing with 450 additions and 0 deletions.
  1. +1 −0 .gitignore
  2. +22 −0 LICENSE
  3. +53 −0 Rakefile
  4. +4 −0 VERSION.yml
  5. +169 −0 lib/big_sitemap.rb
  6. +172 −0 test/big_sitemap_test.rb
  7. +18 −0 test/fixtures/test_model.rb
  8. +11 −0 test/test_helper.rb
View
@@ -0,0 +1 @@
+._*
View
22 LICENSE
@@ -0,0 +1,22 @@
+(The MIT License)
+
+Copyright (c) 2009 Stateless Systems (http://statelesssystems.com)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+'Software'), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
View
@@ -0,0 +1,53 @@
# Rake tasks for the big_sitemap gem: packaging (Jeweler), API docs (RDoc),
# unit tests, coverage (RCov) and Cucumber features. Optional tools are loaded
# inside begin/rescue so that a missing gem only prints a hint instead of
# making the whole Rakefile unusable.
require 'rake'

# Gem specification and release tasks; skipped when Jeweler is not installed.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |s|
    # The summary and the description intentionally share the same text.
    blurb = %Q{A Sitemap generator specifically designed for large sites (although it works equally well with small sites)}

    s.name = "big_sitemap"
    s.summary = blurb
    s.email = "alexrabarts@gmail.com"
    s.homepage = "http://github.com/alexrabarts/big_sitemap"
    s.description = blurb
    s.authors = ["Alex Rabarts"]
    s.add_dependency 'builder', ['>=2.1.2']
    s.add_dependency 'extlib', ['>=0.9.9']
  end
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end

# API documentation. 'rake/rdoctask' no longer ships with current Rake, so
# prefer the task class provided by RDoc itself and only fall back to the
# legacy one on old installations.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = 'big_sitemap'
  rdoc.options << '--line-numbers' << '--inline-source'
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

# Unit tests: every test/**/*_test.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib' << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = false
end

# Test coverage; skipped when RCov is not installed.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |t|
    t.libs << 'test'
    t.test_files = FileList['test/**/*_test.rb']
    t.verbose = true
  end
rescue LoadError
  puts "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
end

# Cucumber features; skipped when Cucumber is not installed.
begin
  require 'cucumber/rake/task'
  Cucumber::Rake::Task.new(:features)
rescue LoadError
  puts "Cucumber is not available. In order to run features, you must: sudo gem install cucumber"
end

task :default => :test
View
@@ -0,0 +1,4 @@
+---
+:minor: 1
+:patch: 0
+:major: 0
View
@@ -0,0 +1,169 @@
+require 'net/http'
+require 'uri'
+require 'zlib'
+require 'builder'
+require 'extlib'
+
# Generates gzipped XML sitemaps (sitemaps.org 0.9 schema) for one or more
# model classes, plus a sitemap index file that links to all of them, and
# optionally pings Google with the location of the index.
#
# Records are read from each model in batches so large tables never have to be
# loaded at once, and each model's URLs are split across as many sitemap files
# as needed so that no file holds more than :max_per_sitemap URLs.
class BigSitemap
  # XML namespace shared by sitemap files and the sitemap index.
  SITEMAP_XMLNS = 'http://www.sitemaps.org/schemas/sitemap/0.9'

  # Instance methods tried, in order, to obtain a record's last-modified time.
  LAST_MOD_METHODS = [:updated_at, :updated_on, :updated, :created_at, :created_on, :created].freeze

  # Creates a generator and makes sure the output directory exists.
  #
  # options - Hash of settings (the hash itself is left unmodified):
  #   :document_root   - directory served as the web root. Defaults to
  #                      "#{RAILS_ROOT}/public" or "#{Merb.root}/public" when
  #                      one of those frameworks is loaded.
  #   :base_url        - scheme and host prepended to every URL.
  #   :max_per_sitemap - maximum number of URLs per sitemap file (default 50000).
  #   :batch_size      - number of records fetched per query (default 1001).
  #   :path            - directory below the document root, also used as the
  #                      web path of the generated files (default 'sitemaps').
  #   :update_google   - ping Google after generating; true unless explicitly
  #                      set to a non-nil value.
  #
  # Raises ArgumentError when no document root can be determined or when the
  # batch size is not between 1 and :max_per_sitemap.
  def initialize(options)
    document_root = options[:document_root]

    if document_root.nil?
      if defined?(RAILS_ROOT)
        document_root = "#{RAILS_ROOT}/public"
      elsif defined?(Merb)
        document_root = "#{Merb.root}/public"
      end
    end

    raise ArgumentError, 'Document root must be specified with the :document_root option' if document_root.nil?

    @base_url = options[:base_url]
    @max_per_sitemap = options[:max_per_sitemap] || 50000
    @batch_size = options[:batch_size] || 1001 # TODO: Set this to 1000 once DM offset 37000 bug is fixed
    @web_path = options[:path] || 'sitemaps'
    @update_google = options[:update_google].nil? ? true : options[:update_google]
    @file_path = "#{document_root}/#{@web_path}"
    @sources = []

    raise ArgumentError, 'Batch size (:batch_size) must be greater than zero' if @batch_size < 1

    raise(
      ArgumentError,
      'Batch size (:batch_size) must be less than or equal to maximum URLs per sitemap (:max_per_sitemap)'
    ) if @batch_size > @max_per_sitemap

    Dir.mkdir(@file_path) unless File.directory?(@file_path)
  end

  # Registers a model whose records should appear in the sitemap.
  #
  # options - Hash with :model (a class responding to count_for_sitemap or
  #           count, and to find_for_sitemap or all) and :path (URL path
  #           prefix placed before each record's to_param/id).
  #
  # Raises ArgumentError when :model or :path is missing.
  def add(options)
    raise ArgumentError, ':model and :path options must be provided' unless options[:model] && options[:path]
    @sources << options
  end

  # Writes the sitemap file(s) for every registered source, then the sitemap
  # index, and finally pings Google when enabled.
  #
  # Raises ArgumentError when a model lacks the required class methods or a
  # record provides neither to_param nor id.
  def generate
    @sources.each do |source|
      klass = source[:model]

      count_method = pick_method(klass, [:count_for_sitemap, :count])
      find_method = pick_method(klass, [:find_for_sitemap, :all])
      raise ArgumentError, "#{klass} must provide a count_for_sitemap class method" if count_method.nil?
      raise ArgumentError, "#{klass} must provide a find_for_sitemap class method" if find_method.nil?

      count = klass.send(count_method)

      # Always write at least one (possibly empty) sitemap per source.
      num_sitemaps = [(count.to_f / @max_per_sitemap).ceil, 1].max

      # Remember the number of files so the index knows how many to link to.
      source[:num_sitemaps] = num_sitemaps

      (1..num_sitemaps).each do |sitemap_num|
        # Half-open record range [first, last) covered by this sitemap file.
        first = (sitemap_num - 1) * @max_per_sitemap
        last = [first + @max_per_sitemap, count].min

        # Stream the XML straight into the gzip file.
        with_gz_writer(sitemap_filename(klass, sitemap_num, num_sitemaps)) do |gz|
          xml = Builder::XmlMarkup.new(:target => gz)
          xml.instruct!
          xml.urlset(:xmlns => SITEMAP_XMLNS) do
            each_record(klass, find_method, first, last, count) do |record|
              add_url(xml, source, record)
            end
          end
        end
      end
    end

    generate_sitemap_index
    update_google if @update_google
  end

  private

  # Returns the first of +candidates+ that +klass+ responds to, or nil.
  def pick_method(klass, candidates)
    candidates.find { |candidate| klass.respond_to?(candidate) }
  end

  # Opens +filename+ inside the output directory for gzip writing. Binary mode
  # keeps the compressed stream intact on platforms that translate newlines.
  # The caller is responsible for closing the returned writer.
  def gz_writer(filename)
    Zlib::GzipWriter.new(File.open("#{@file_path}/#{filename}", 'wb'))
  end

  # Yields a gzip writer for +filename+ and closes it even if the block raises.
  def with_gz_writer(filename)
    gz = gz_writer(filename)
    begin
      yield gz
    ensure
      gz.close
    end
  end

  # File name of one sitemap for +klass+. The numeric suffix is only added
  # when the model is split across several files.
  def sitemap_filename(klass, sitemap_num, num_sitemaps)
    suffix = num_sitemaps > 1 ? "_#{sitemap_num}" : ''
    "sitemap_#{Extlib::Inflection::underscore(klass.to_s)}#{suffix}.xml.gz"
  end

  # Yields every record in the half-open range [first, last) of +klass+.
  # When everything fits in one batch the finder is called without
  # :limit/:offset; otherwise records are fetched @batch_size at a time and
  # the final batch is trimmed so it ends exactly at +last+.
  def each_record(klass, find_method, first, last, count, &block)
    if count <= @batch_size
      klass.send(find_method, {}).each(&block)
      return
    end

    first.step(last - 1, @batch_size) do |offset|
      limit = [@batch_size, last - offset].min
      klass.send(find_method, {:limit => limit, :offset => offset}).each(&block)
    end
  end

  # Appends one <url> entry for +record+ to +xml+. lastmod falls back to the
  # current time when the record has no timestamp method, and is omitted when
  # the timestamp method returns nil.
  def add_url(xml, source, record)
    param_method = pick_method(record, [:to_param, :id])
    raise ArgumentError, "#{source[:model]} must provide a to_param instance method" if param_method.nil?

    last_mod_method = pick_method(record, LAST_MOD_METHODS)
    last_mod = last_mod_method.nil? ? Time.now : record.send(last_mod_method)

    xml.url do
      xml.loc("#{@base_url}#{source[:path]}/#{record.send(param_method)}")
      xml.lastmod(last_mod.strftime('%Y-%m-%d')) unless last_mod.nil?
      xml.changefreq('weekly')
    end
  end

  def sitemap_index_filename
    'sitemap_index.xml.gz'
  end

  # Create a sitemap index document linking to every sitemap file written by
  # #generate (relies on source[:num_sitemaps] having been set there).
  def generate_sitemap_index
    with_gz_writer(sitemap_index_filename) do |gz|
      builder = Builder::XmlMarkup.new(:target => gz)
      builder.instruct!
      builder.sitemapindex(:xmlns => SITEMAP_XMLNS) do
        @sources.each do |source|
          num_sitemaps = source[:num_sitemaps]
          (1..num_sitemaps).each do |i|
            builder.sitemap do
              builder.loc("#{@base_url}/#{@web_path}/#{sitemap_filename(source[:model], i, num_sitemaps)}")
              builder.lastmod(Time.now.strftime('%Y-%m-%d'))
            end
          end
        end
      end
    end
  end

  # Notify Google of the new sitemap index file. The index URL is sent as a
  # query-string value, so it is form-encoded in full. Network errors from
  # Net::HTTP are not rescued here and propagate to the caller.
  def update_google
    sitemap_uri = URI.encode_www_form_component("#{@base_url}/#{@web_path}/#{sitemap_index_filename}")
    Net::HTTP.get('www.google.com', "/webmasters/tools/ping?sitemap=#{sitemap_uri}")
  end
end
Oops, something went wrong.

0 comments on commit 99e1aaa

Please sign in to comment.