Going to add persistent job store

Philip (flip) Kromer committed Jul 17, 2009
1 parent 4cfcbb8 commit 328e1d6a11b3562e6fb5ee1481b8ca63283643e6
1 distdb
@@ -0,0 +1,6 @@
+love 65536 3.36366494448618 6698 2686693027 2688691633
+red+sox 65536 0.0113457581992013 1500 2661001994 2688059232
+britney+spears 65536 0.00866753886170806 184 2685103763 2688130850
+hadoop 65536 0.000661831916251315 614 2501794487 2687967783
+infochimps 65536 2.24964286919452e-05 16 2541533220 2683708276
+hapaxlegomenon 65536 0.0 1 2646535741 2646535741
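
The six columns of each distdb row appear to line up with the six fields of the TwitterSearchJob struct introduced in the diff below. A minimal sketch of reading one row back into such a struct, assuming that column order (parse_distdb_row is a hypothetical helper, not part of the commit):

    # Sketch: each distdb row is whitespace-separated, with columns assumed to be
    #   query_term  priority  prev_rate  prev_items  prev_span_min  prev_span_max
    TwitterSearchJob = Struct.new(
      :query_term, :priority, :prev_rate, :prev_items, :prev_span_min, :prev_span_max)

    def parse_distdb_row line
      query_term, priority, rate, items, span_min, span_max = line.split
      TwitterSearchJob.new(query_term, priority.to_i, rate.to_f,
        items.to_i, span_min.to_i, span_max.to_i)
    end

    job = parse_distdb_row "hadoop 65536 0.000661831916251315 614 2501794487 2687967783"
    job.prev_items   # => 614
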

@@ -9,7 +9,7 @@ module Scrape
# previous scrape session.
#
#
- class Session < Struct.new(
+ class TwitterSearchJob < Struct.new(
:query_term,
:priority,
:prev_rate, :prev_items, :prev_span_min, :prev_span_max
@@ -33,18 +33,18 @@ def initialize *args
# Generate paginated TwitterSearchScrapeRequest
#
def make_request page, pageinfo
- url_str = "http://search.twitter.com/search.json?q=#{query_term}&rpp=#{items_per_page}"
+ url_str = base_url
+ url_str << "&rpp=#{items_per_page}"
url_str << "&max_id=#{unscraped_span.max-1}" if unscraped_span.max
Wuclan::Domains::Twitter::Scrape::TwitterSearchRequest.new url_str
end
- # def initialize query_term, num_items=nil, min_span=nil, max_span=nil, min_timespan=nil, max_timespan=nil
- # self.num_items = num_items.to_i
- # self.prev_span = UnionInterval.new(min_span.to_i, max_span.to_i) if min_span || max_span
- # self.prev_timespan = UnionInterval.new(Time.parse(min_timespan), Time.parse(max_timespan)) rescue nil
- # super(query_term)
- # end
#
+ # Durable handle for this resource, independent of the page/max_id/whatever
+ #
+ def base_url
+ "http://search.twitter.com/search.json?q=#{query_term}"
+ end
end
end
end
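
For context, the reworked make_request now builds each paginated URL on top of the durable base_url. A standalone sketch of that URL construction, with hypothetical literal values standing in for items_per_page and the unscraped span:

    # Sketch of the URL construction in make_request; the literal values for
    # query_term, items_per_page and max_id are stand-ins for the real accessors.
    query_term     = 'hadoop'
    items_per_page = 100
    max_id         = 2687967783    # upper bound of the span still to be scraped

    url_str  = "http://search.twitter.com/search.json?q=#{query_term}"   # base_url
    url_str << "&rpp=#{items_per_page}"
    url_str << "&max_id=#{max_id - 1}" if max_id
    url_str  # => ".../search.json?q=hadoop&rpp=100&max_id=2687967782"
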
2 ripd
@@ -29,6 +29,8 @@
# Log every N requests
periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iter_interval => 100, :time_interval => 10)
+# # Persist session jobs in distributed DB
+# store = Monkeyshines::ScrapeStore::ReadThruStore.new_from_command_line opts
Twitter::Scrape::Session.hard_request_limit = 5
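
The commented-out ReadThruStore line above is where the persistent job store from the commit message would plug in. A generic read-through sketch of the idea (this is not the Monkeyshines ReadThruStore API, just the pattern the name suggests):

    # Generic read-through store: look the key up in the backing store; on a miss,
    # run the block, save the result, and return it. Hypothetical class, for
    # illustration only.
    class ReadThruSketch
      def initialize(backing = {})
        @backing = backing
      end

      def get(key)
        return @backing[key] if @backing.key?(key)
        @backing[key] = yield
      end
    end

    store = ReadThruSketch.new
    store.get('hadoop') { 'freshly scraped result' }   # block runs, result stored
    store.get('hadoop') { raise 'never reached' }      # served from the store
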
@@ -13,29 +13,31 @@
# Command line options
#
opts = Trollop::options do
- opt :dumpfile_dir, "Filename base to store output. e.g. --dump_basename=/data/ripd", :type => String
- opt :dumpfile_pattern, "Pattern for dump file output", :default => ":dumpfile_dir/:handle_prefix/:handle/:date/:handle+:datetime-:pid.tsv"
- opt :dumpfile_chunk_time, "Time in seconds to keep dump files open", :default => 60*60*4, :type => Integer
- opt :handle, "Handle to uniquely identify this scrape", :default => 'com.twitter.search'
+ opt :dumpfile_dir, "Filename base to store output. e.g. --dump_basename=/data/ripd", :type => String
+ opt :dumpfile_pattern, "Pattern for dump file output",
+ :default => ":dumpfile_dir/:handle_prefix/:handle/:date/:handle+:datetime-:pid.tsv"
+ opt :dumpfile_chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer,
+ :default => 60*60*4
+ opt :handle, "Handle to uniquely identify this scrape",
+ :default => 'com.twitter.search'
+ opt :items_per_session, "Desired item count per session",
+ :default => 980
+ opt :min_resched_delay, "Don't run jobs more often than this (in seconds)",
+ :default => 60*1
end
Trollop::die :dumpfile_dir unless opts[:dumpfile_dir]
# Queue of request sessions, with reschedule requests
-request_queue = Monkeyshines::RequestStream::BeanstalkQueue.new(nil, Twitter::Scrape::Session, RESCHEDULE_GOAL)
+beanstalk_tube = opts[:handle].gsub(/\w+/,'_')
+request_queue = Monkeyshines::RequestStream::BeanstalkQueue.new(nil, Twitter::Scrape::Session, opts[:items_per_session], opts.slice(:min_resched_delay))
# Scrape Store for completed requests
dumpfile_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dumpfile_pattern], opts.slice(:handle, :dumpfile_dir))
store = Monkeyshines::ScrapeStore::ChunkedFlatFileStore.new dumpfile_pattern, opts[:dumpfile_chunk_time].to_i
# Scrape requests by HTTP
scraper = Monkeyshines::ScrapeEngine::HttpScraper.new Monkeyshines::CONFIG[:twitter]
-# Log every N requests
+# Log every 60 seconds
periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:time_interval => 60)
-
-RESCHEDULE_GOAL = 650
-Twitter::Scrape::Session.hard_request_limit = 15
-request_queue.min_resched_delay = 180
-
-
request_queue.each do |session|
# Run through all pages for this search term
session.each_request do |req|
@@ -48,7 +50,8 @@
# return it to the session for bookkeeping
response
end
+ sleep 0.5
end
request_queue.finish
-#
-# # # (1..50).map{ begin j = bs.reserve(1) ; rescue Exception => e ; warn e ; break ; end ; if j then q = j.body.gsub(/\t.*/,"") ; queries[q] ||= j.id ; if (queries[q] != j.id) then j.delete end ; j.release 65536, 45 ; puts q ; q end rescue 'error' }
+
+# Twitter::Scrape::Session.hard_request_limit = 15
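
The new items_per_session and min_resched_delay options suggest rate-based rescheduling: requeue a search roughly when it should have accumulated another session's worth of items, but never sooner than the minimum delay. A hypothetical sketch of that rule, assuming prev_rate is items per second (this is not the BeanstalkQueue implementation):

    # Hypothetical reschedule rule: given a job's observed item rate, wait long
    # enough to expect items_goal new items, but at least min_delay seconds.
    # Illustrative only; not taken from BeanstalkQueue.
    def resched_delay prev_rate, items_goal, min_delay
      estimate = prev_rate.to_f > 0 ? (items_goal / prev_rate.to_f) : min_delay
      [estimate, min_delay].max.ceil
    end

    resched_delay 3.36,   980, 60   # busy term:  ~292 seconds
    resched_delay 0.0007, 980, 60   # quiet term: well over a day
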
@@ -6,34 +6,53 @@
require 'wukong'
require 'monkeyshines'
require 'wuclan/domains/twitter/scrape' ; include Wuclan::Domains
+
require 'monkeyshines/scrape_engine/http_scraper'
require 'monkeyshines/utils/filename_pattern'
#
# Command line options
#
opts = Trollop::options do
- opt :dumpfile_filename, "Filename base to store output. e.g. --dump_basename=/data/ripd", :type => String
- opt :dumpfile_pattern, "Pattern for dump file output", :default => ":dumpfile_dir/:handle_prefix/:handle/:date/:handle+:datetime-:pid.tsv"
- opt :dumpfile_chunk_time, "Time in seconds to keep dump files open", :default => 60*60*4, :type => Integer
- opt :handle, "Handle to uniquely identify this scrape", :default => 'com.twitter.search'
+ opt :dumpfile_dir, "Filename base to store output. e.g. --dump_basename=/data/ripd", :type => String
+ opt :dumpfile_pattern, "Pattern for dump file output",
+ :default => ":dumpfile_dir/:handle_prefix/:handle/:date/:handle+:datetime-:pid.tsv"
+ opt :dumpfile_chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer,
+ :default => 60*60*24
+ opt :handle, "Handle to uniquely identify this scrape",
+ :default => 'com.twitter.search'
+ opt :min_resched_delay, "Don't run jobs more often than this (in seconds)",
+ :default => 60*1
end
Trollop::die :dumpfile_dir unless opts[:dumpfile_dir]
# Queue of request sessions, with reschedule requests
-request_queue = Monkeyshines::RequestStream::BeanstalkQueue.new(nil, Twitter::Scrape::Session, RESCHEDULE_GOAL)
+# opts[:beanstalk_tube] ||= opts[:handle].gsub(/\w+/,'_')
+request_queue = Monkeyshines::RequestStream::BeanstalkQueue.new(nil, Twitter::Scrape::Session, opts[:items_per_session], opts.slice(:min_resched_delay)) # , :beanstalk_tube
# Scrape Store for completed requests
dumpfile_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dumpfile_pattern], opts.slice(:handle, :dumpfile_dir))
store = Monkeyshines::ScrapeStore::ChunkedFlatFileStore.new dumpfile_pattern, opts[:dumpfile_chunk_time].to_i
# Scrape requests by HTTP
scraper = Monkeyshines::ScrapeEngine::HttpScraper.new Monkeyshines::CONFIG[:twitter]
-# Log every N requests
+# Log every 60 seconds
periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:time_interval => 60)
-RESCHEDULE_GOAL = 650
-Twitter::Scrape::Session.hard_request_limit = 15
-request_queue.min_resched_delay = 180
+class Session < Struct.new(
+ :query_term,
+ :priority,
+ :period
+ )
+
+end
+
+
+# %w[
+# http://search.twitter.com/trends/current.format , 60*60
+# http://search.twitter.com/trends/daily.json?date=2009-03-19
+# http://search.twitter.com/trends/weekly.json?date=2009-03-19
+# ]
+
request_queue.each do |session|
@@ -50,5 +69,5 @@
end
end
request_queue.finish
-#
-# # # (1..50).map{ begin j = bs.reserve(1) ; rescue Exception => e ; warn e ; break ; end ; if j then q = j.body.gsub(/\t.*/,"") ; queries[q] ||= j.id ; if (queries[q] != j.id) then j.delete end ; j.release 65536, 45 ; puts q ; q end rescue 'error' }
+
+# Twitter::Scrape::Session.hard_request_limit = 15
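
The commented trends URLs above each come with a refresh cadence, which is presumably what the new :period field on the Session struct is for. A hypothetical sketch of wiring them together; the URLs and the hourly period are copied from the comment, while the daily and weekly periods are guesses, not from the commit:

    # Hypothetical: one Session per trends endpoint, each rescheduled on its own
    # cadence. Daily/weekly periods are assumptions.
    Session = Struct.new(:query_term, :priority, :period)

    trend_jobs = [
      Session.new('http://search.twitter.com/trends/current.format',              65536, 60*60),
      Session.new('http://search.twitter.com/trends/daily.json?date=2009-03-19',  65536, 60*60*24),
      Session.new('http://search.twitter.com/trends/weekly.json?date=2009-03-19', 65536, 60*60*24*7),
    ]

    trend_jobs.each do |job|
      puts "fetch #{job.query_term} every #{job.period} seconds"
    end
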
