Permalink
Browse files

Added real cursor-based MongoDB streaming. Much faster.

  • Loading branch information...
1 parent 0fefb71 commit ece5d4b08d33061e10f1266ce07dfc6e72a0dbc8 Joost Hietbrink committed with Burke Libbey Feb 3, 2010
Showing with 38 additions and 23 deletions.
  1. +3 −4 README.rdoc
  2. +35 −19 lib/indexer.rb
View
@@ -13,17 +13,16 @@ collections.
namespace :sphinx do
task :stream => :environment do
- MongoSphinx::Indexer::XMLDocset.stream(Food)
+ MongoSphinx::Indexer::XMLDocset.stream(Food, :attributes => %w(name description), :id_attribute => 'sphinx_id')
end
end
+This uses a MongoDB cursor to stream the collection more efficiently, instead of paging with offsets. See: http://groups.google.com/group/mongodb-user/browse_thread/thread/35f01db45ea3b0bd/96ebc49b511a6b41?lnk=gst&q=skip#96ebc49b511a6b41
+
See inline documentation on usage.
=== TODO
-Use MongoDB cursor to better stream collection. Instead of offset. See:
-http://groups.google.com/group/mongodb-user/browse_thread/thread/35f01db45ea3b0bd/96ebc49b511a6b41?lnk=gst&q=skip#96ebc49b511a6b41
-
Use better output, e.g. $stdout.write or http://ruby-doc.org/stdlib/libdoc/stringio/rdoc/index.html.
== General info
View
@@ -77,23 +77,23 @@ class XMLDocset
# Streams xml of all objects in a klass to the stdout. This makes sure you can process large collections.
#
# Options:
- # All options are passed to the find request except:
- # batch_size - The number of documents in each batch process. Default is 10000.
- # max_offset - The maximum offset. Default is klass.count.
+ # attributes (required) - The attributes that are put in the sphinx xml.
+ # id_attribute (optional) - The attribute to use as id while indexing (should be integer).
+ # If none is specified we'll create a useless one.
#
# Example:
- # MongoSphinx::Indexer::XMLDocset.stream(Document, :fields => 'name,index_helper', :batch_size => 1000)
- # This will create an XML stream to stdout. Each batch output creates the xml for 1000 Documents.
- # The stream will stop at max_offset Document.count (so all are processed).
+ # MongoSphinx::Indexer::XMLDocset.stream(Document, :attributes => %w(title content))
+ # This will create an XML stream to stdout.
#
# Configure in your sphinx.conf like
- # xmlpipe_command = ./script/runner "MongoSphinx::Indexer::XMLDocset.stream(Document)"
+ # xmlpipe_command = ./script/runner "MongoSphinx::Indexer::XMLDocset.stream(Document, :attributes => %w(title content))"
#
def self.stream(klass, options = {})
STDOUT.sync = true # Make sure we really stream..
-
- batch_size = options.delete(:batch_size) || 10000 # The number of documents in each batch process. Default is 10000.
- max_offset = options.delete(:max_offset) || klass.count # The maximum offset. Default is klass.count.
+ attributes = options[:attributes]
+ id_attribute = options[:id_attribute]
+ # raise ArgumentError, 'Missing id_attribute' if id_attribute.nil? # optional
+ raise ArgumentError, 'Missing attributes' if attributes.nil?
puts '<?xml version="1.0" encoding="utf-8"?>'
@@ -109,15 +109,9 @@ def self.stream(klass, options = {})
puts '<sphinx:attr name="csphinx-class" type="multi"/>'
puts '</sphinx:schema>'
- # Content
- offset = 0
- while offset < max_offset
- objects = klass.all(options.merge({:limit => batch_size, :offset => offset}))
- offset = offset + batch_size
-
- objects.each do |object|
- puts XMLDoc.from_object(object)
- end
+ cursor = Mongo::Cursor.new(klass.collection)
+ while document_hash = cursor.next_document
+ XMLDoc.stream_for_hash(document_hash, klass, attributes, id_attribute)
end
puts '</sphinx:docset>'
@@ -206,6 +200,28 @@ class XMLDoc
attr_reader :xml
# Writes a single <sphinx:document> element for one MongoDB document hash
# to stdout, as part of the xmlpipe2 stream produced by self.stream.
#
# Parameters:
#   hash         - Raw document hash as returned by the MongoDB cursor.
#   klass        - The model class the document belongs to (used for the
#                  classname / csphinx-class fields).
#   attributes   - Array of attribute names (strings) to emit as XML fields.
#   id_attribute - Optional name of an integer attribute to use as the
#                  Sphinx document id.
def self.stream_for_hash(hash, klass, attributes, id_attribute = nil)
  # Prefer the caller-supplied integer id; otherwise derive one from the
  # ObjectId hex string. FIXME: the fallback is not collision-proof.
  sphinx_compatible_id = hash[id_attribute].to_i unless id_attribute.nil?
  sphinx_compatible_id ||= hash['_id'].to_s.hex % (2**64)

  class_name = klass.to_s

  puts "<sphinx:document id=\"#{sphinx_compatible_id}\">"

  # FIXME: Should we include this?
  puts '<csphinx-class>'
  puts MongoSphinx::MultiAttribute.encode(class_name)
  puts '</csphinx-class>'
  puts "<classname>#{class_name}</classname>"

  attributes.each do |key|
    # Escape any ']]>' inside the value so it cannot terminate the CDATA
    # section early and corrupt the XML stream.
    value = hash[key].to_s.gsub(']]>', ']]]]><![CDATA[>')
    # Fixed: previous version emitted <![CDATA[[...]]] which wrapped every
    # value in literal square brackets in the indexed text.
    puts "<#{key}><![CDATA[#{value}]]></#{key}>"
  end

  puts '</sphinx:document>'
end
+
# Creates a XMLDoc object from the provided CouchRest object.
#
# Parameters:

0 comments on commit ece5d4b

Please sign in to comment.