Skip to content

Commit

Permalink
Make lastfm work with new pagination/recursing scheme
Browse files Browse the repository at this point in the history
  • Loading branch information
Philip (flip) Kromer committed Sep 1, 2009
1 parent b161305 commit d2b80dc
Show file tree
Hide file tree
Showing 13 changed files with 677 additions and 418 deletions.
Expand Up @@ -8,7 +8,6 @@
Monkeyshines.load_global_options!
Monkeyshines.load_cmdline_options!
Monkeyshines::CONFIG[:fetcher] = Monkeyshines::CONFIG[:twitter_api]
default_tube = Monkeyshines::CONFIG[:handle].to_s.gsub!(/[_\.]/,'-')

#
# * jobs stream from an edamame job queue.
Expand All @@ -24,7 +23,7 @@
#
scraper = Monkeyshines::Runner.new({
:log => { :iters => 600, :time => 150, :dest => nil }, # Monkeyshines::CONFIG[:handle]
:source => { :type => TwitterSearchRequestStream, :tube => default_tube,
:source => { :type => TwitterSearchRequestStream,
:queue => { :uris => ['localhost:11240'], },
:store => { :uri => ':11241', }, },
:dest => { :type => :chunked_flat_file_store, :rootdir => WORK_DIR },
Expand Down
9 changes: 3 additions & 6 deletions lib/wuclan/lastfm/scrape.rb
@@ -1,15 +1,12 @@
require 'monkeyshines/scrape_request/raw_json_contents'
# NOTE(review): base.rb and concrete.rb are required eagerly here, which makes
# the autoload registrations below that point back at base.rb moot (those
# constants are already defined by the time autoload runs). Confirm which
# mechanism is intended and drop the other.
require 'wuclan/lastfm/scrape/base.rb'
require 'wuclan/lastfm/scrape/concrete.rb'

# Namespace for the last.fm scraper: request classes, the job wrapper and the
# request stream consumed by the Monkeyshines runner.
module Wuclan
  module Lastfm
    module Scrape
      autoload :Base,                  'wuclan/lastfm/scrape/base.rb'
      # NOTE(review): as of this commit lastfm_job.rb reopens Base but does not
      # define a LastfmJob constant — referencing Scrape::LastfmJob would load
      # the file and still raise NameError. Verify.
      autoload :LastfmJob,             'wuclan/lastfm/scrape/lastfm_job.rb'
      autoload :LastfmRequestStream,   'wuclan/lastfm/scrape/lastfm_request_stream.rb'
      # NOTE(review): the concrete request classes moved into concrete.rb in
      # this commit; these autoload paths still name base.rb — confirm.
      autoload :LastfmArtistShoutsRequest,    'wuclan/lastfm/scrape/base.rb'
      autoload :LastfmArtistTopFansRequest,   'wuclan/lastfm/scrape/base.rb'
      autoload :LastfmArtistTopTracksRequest, 'wuclan/lastfm/scrape/base.rb'
      autoload :LastfmTrackTopFansRequest,    'wuclan/lastfm/scrape/base.rb'
      autoload :LastfmUserRequest,            'wuclan/lastfm/scrape/base.rb'
    end
  end
end
393 changes: 123 additions & 270 deletions lib/wuclan/lastfm/scrape/base.rb

Large diffs are not rendered by default.

208 changes: 208 additions & 0 deletions lib/wuclan/lastfm/scrape/concrete.rb

Large diffs are not rendered by default.

31 changes: 31 additions & 0 deletions lib/wuclan/lastfm/scrape/lastfm_job.rb
@@ -0,0 +1,31 @@
require 'wuclan/lastfm/scrape/recursive_requests'

module Wuclan
  module Lastfm
    module Scrape

      # Reopen Scrape::Base to mix in the Monkeyshines pagination machinery:
      # every last.fm request walks its result pages and uses the observed
      # item rate to reschedule its next visit.
      Base.class_eval do
        #
        # Pagination
        #
        include Monkeyshines::ScrapeRequestCore::Paginating
        include Monkeyshines::ScrapeRequestCore::PaginatedWithLimit
        # include Monkeyshines::ScrapeRequestCore::PaginatedWithRate

        # Items to get each re-visit. If there are up to 50 items per page,
        # target_items_per_job of 150 will try to reschedule so that its
        # return visit makes about three page requests.
        self.target_items_per_job = 150

        # Creates the paginated request for the given results page.
        #
        # FIXME(review): this constructs a TwitterSearchRequest — and appends
        # twitter-search query params (&rpp=, &max_id=) — inside the last.fm
        # namespace; almost certainly copy-pasted from twitter_search_job.rb.
        # A last.fm request class is presumably intended here; confirm before
        # relying on last.fm pagination. Note also the twitter counterparts
        # take (page, info=nil) while this takes only (page).
        def request_for_page page
          req = TwitterSearchRequest.new(obj[:key], page)
          req.url << "&rpp=#{req.max_items}"
          req.url << "&max_id=#{sess_span.min - 1}" if sess_span.min
          req
        end
      end

    end
  end
end
154 changes: 154 additions & 0 deletions lib/wuclan/lastfm/scrape/recursive_requests.rb
@@ -0,0 +1,154 @@
require 'monkeyshines/scrape_request/raw_json_contents'
module Wuclan
module Lastfm
module Scrape

#
# Simple requestables
#

# Reopen LastfmArtistInfoRequest (presumably defined in concrete.rb — this
# file only adds to it) to declare the follow-up requests spawned for each
# scraped artist. The list order is presumably the order they are issued —
# confirm against how requestables is consumed in base.rb.
class LastfmArtistInfoRequest
  self.requestables = [
    LastfmArtistSimilarRequest,
    LastfmArtistTopAlbumsRequest,
    LastfmArtistTopTracksRequest,
    LastfmArtistShoutsRequest,
    LastfmArtistEventsRequest,
    LastfmArtistTopFansRequest,
    # LastfmArtistTopTagsRequest, LastfmArtistImagesRequest, LastfmArtistPodcastRequest,
  ]
end

# Follow-ups for a track: similar tracks, top fans, top tags.
class LastfmTrackInfoRequest
  self.requestables = [LastfmTrackSimilarRequest, LastfmTrackTopFansRequest, LastfmTrackTopTagsRequest]
end
# Follow-ups for an event: attendee list and the event shoutbox.
class LastfmEventInfoRequest
  self.requestables = [LastfmEventAttendeesRequest, LastfmEventShoutsRequest]
end
# Follow-ups spawned per user. NOTE(review): per the trailing comment this
# class stands in for LastfmUserInfoRequest, which (see the "needs auth" list
# below) cannot be fetched anonymously — so top-tags acts as the per-user
# entry point. Confirm.
class LastfmUserTopTagsRequest # LastfmUserInfoRequest
  self.requestables = [
    # LastfmUserTopTagsRequest,
    LastfmUserEventsRequest,
    LastfmUserPastEventsRequest,
    LastfmUserFriendsRequest, # recenttracks
    LastfmUserNeighboursRequest,
    LastfmUserLovedTracksRequest,
    LastfmUserRecentTracksRequest,
    LastfmUserShoutsRequest,
    LastfmUserTopAlbumsRequest,  # period (Optional) : overall | 7day | 3month | 6month | 12month
    LastfmUserTopArtistsRequest, # period (Optional) : overall | 7day | 3month | 6month | 12month
    LastfmUserTopTracksRequest,  # period (Optional) : overall | 7day | 3month | 6month | 12month
    # uninteresting(?): LastfmUserPlaylistsRequest, LastfmUserWeeklyAlbumChartRequest, LastfmUserWeeklyArtistChartRequest, LastfmUserWeeklyChartListRequest, LastfmUserWeeklyTrackChartRequest,
    # needs auth: LastfmUserInfoRequest, LastfmUserRecentStationsRequest, LastfmUserRecommendedArtistsRequest, LastfmUserRecommendedEventsRequest,
  ]
end

#
# Recursive requests based on contents
#

# Mixin for requests whose API takes a &period= parameter: after the normal
# recursion, fan out one extra request per time window — unless this request
# already carries a period (so windowed requests don't re-window themselves).
module LastfmTimeWindowed
  def recursive_requests(*args, &block)
    super(*args, &block)
    return if identifier =~ /&period=/
    %w[7day 3month 6month].each do |window|
      windowed = self.class.new("#{identifier}&period=#{window}")
      # NOTE: window variants stay in the *same* generation (no +1), unlike
      # the contains-* mixins in this file — presumably intentional; confirm.
      windowed.generation = generation.to_i
      yield windowed
    end
  end
end

[LastfmUserTopArtistsRequest,
 LastfmUserTopAlbumsRequest,
 LastfmUserTopTracksRequest].each do |klass|
  klass.send(:include, LastfmTimeWindowed)
end

# Mixin for requests whose payload lists artists: after the normal recursion,
# each listed artist spawns a LastfmArtistInfoRequest one generation deeper.
module LastfmContainsArtists
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |artist|
      artist_req = LastfmArtistInfoRequest.new(url_encode(artist['name']))
      artist_req.generation = generation.to_i + 1
      yield artist_req
    end
  end
end

[LastfmArtistSimilarRequest, LastfmGeoTopArtistsRequest, LastfmTagTopArtistsRequest,
 LastfmUserRecommendedArtistsRequest, LastfmUserTopArtistsRequest].each do |klass|
  klass.send(:include, LastfmContainsArtists)
end

# Mixin for requests whose payload lists albums: each listed album spawns a
# LastfmAlbumInfoRequest one generation deeper.
module LastfmContainsAlbums
  def recursive_requests *args, &block
    super(*args, &block)
    items.each do |item|
      # The artist field may be a hash keyed 'name' or '#text' (XML-flavored
      # JSON), or absent/other. The low-precedence rescue modifier covers the
      # whole assignment, so any lookup failure degrades to obj_artist = nil
      # rather than raising.
      obj_artist = item['artist']['name'] || item['artist']['#text'] rescue nil
      req = LastfmAlbumInfoRequest.from_identifier_hash(
        item['name'], :artist => obj_artist, :mbid => item['mbid'] )
      req.generation = generation.to_i + 1
      yield req
    end
  end
end
[ LastfmArtistTopAlbumsRequest, LastfmTagTopAlbumsRequest, LastfmUserTopAlbumsRequest,
].each do |klass|
  klass.class_eval do include LastfmContainsAlbums ; end
end

# Mixin for requests whose payload lists tracks: each listed track spawns a
# LastfmTrackInfoRequest one generation deeper.
module LastfmContainsTracks
  def recursive_requests *args, &block
    super(*args, &block)
    items.each do |track|
      # Same artist-extraction pattern as LastfmContainsAlbums: 'name' or
      # '#text' key; whole-assignment rescue degrades failures to nil.
      obj_artist = track['artist']['name'] || track['artist']['#text'] rescue nil
      req = LastfmTrackInfoRequest.from_identifier_hash(
        track['name'], :artist => obj_artist, :mbid => track['mbid'])
      req.generation = generation.to_i + 1
      yield req
    end
  end
end
[ LastfmArtistTopTracksRequest, LastfmGeoTopTracksRequest, LastfmTagTopTracksRequest,
  LastfmTrackSimilarRequest, LastfmUserLovedTracksRequest, LastfmUserRecentTracksRequest,
  LastfmUserTopTracksRequest,
].each do |klass|
  klass.class_eval do include LastfmContainsTracks ; end
end

# Mixin for requests whose payload lists events: after the normal recursion,
# each listed event spawns a LastfmEventInfoRequest one generation deeper.
module LastfmContainsEvents
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |event|
      event_req = LastfmEventInfoRequest.new(event['id'])
      event_req.generation = generation.to_i + 1
      yield event_req
    end
  end
end

[LastfmArtistEventsRequest, LastfmGeoEventsRequest, LastfmUserEventsRequest,
 LastfmUserPastEventsRequest, LastfmUserRecommendedEventsRequest, LastfmVenueEventsRequest,
 LastfmVenuePastEventsRequest].each do |klass|
  klass.send(:include, LastfmContainsEvents)
end

# Mixin for requests whose payload lists users (fans, friends, attendees…):
# each listed user spawns a LastfmUserTopTagsRequest — the per-user entry
# point in this scheme — one generation deeper.
module LastfmContainsUsers
  def recursive_requests(*args, &block)
    super(*args, &block)
    items.each do |user|
      user_req = LastfmUserTopTagsRequest.new(url_encode(user['name']))
      user_req.generation = generation.to_i + 1
      yield user_req
    end
  end
end

[LastfmArtistTopFansRequest, LastfmEventAttendeesRequest, LastfmGroupMembersRequest,
 LastfmTrackTopFansRequest, LastfmUserFriendsRequest, LastfmUserNeighboursRequest].each do |klass|
  klass.send(:include, LastfmContainsUsers)
end

end
end
end
7 changes: 4 additions & 3 deletions lib/wuclan/twitter/scrape.rb
@@ -1,9 +1,10 @@
module Wuclan
module Twitter
module Scrape
autoload :TwitterSearchRequest, 'wuclan/twitter/scrape/twitter_search_request'
autoload :TwitterSearchJob, 'wuclan/twitter/scrape/twitter_search_job'

# Search API
autoload :TwitterSearchRequest, 'wuclan/twitter/scrape/twitter_search_request'
autoload :TwitterSearchJob, 'wuclan/twitter/scrape/twitter_search_job'
# Main API
autoload :Base, 'wuclan/twitter/scrape/base'
autoload :TwitterUserRequest, 'wuclan/twitter/scrape/twitter_user_request'
autoload :TwitterFollowersRequest, 'wuclan/twitter/scrape/twitter_followers_request'
Expand Down
2 changes: 1 addition & 1 deletion lib/wuclan/twitter/scrape/base.rb
Expand Up @@ -57,7 +57,7 @@ def response= response
#

# creates the paginated request
# Builds the request for the given results page. Page 1 (or anything that
# does not parse as an integer > 1) is served by this request itself; later
# pages get a fresh request for the same twitter user id.
# +pageinfo+ is accepted for interface parity with other jobs and ignored.
def request_for_page page, pageinfo=nil
  return self unless page.to_i > 1
  self.class.new(twitter_user_id, page)
end

Expand Down
30 changes: 30 additions & 0 deletions lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb
@@ -0,0 +1,30 @@
# NOTE(review): dead file — everything below is commented out: an initializer
# that normalizes/validates URL-encoded search query terms, and a
# TwitterSearchStream skeleton that replays stored jobs through each_request.
# Kept verbatim for reference; delete once the live replacement stream is
# stable.
#
# #
# #
# # query terms must be URL-encoded
# # (use '+' for space; %23 # %27 ' etc)
# #
# def initialize *args
#   super *args
#   raise "Query term missing" if self.query_term.blank?
#   self[:query_term].strip!
#   [:priority, :prev_items, :prev_span_min, :prev_span_max].each{|attr| self[attr] = self[attr].to_i if self[attr] }
#   self[:prev_rate]     = self[:prev_rate].to_f
#   self[:priority]      = DEFAULT_PRIORITY if (self[:priority] == 0)
#   self[:prev_rate]     = nil if (self[:prev_rate] < 1e-6)
# end
#
# class TwitterSearchStream < Monkeyshines::RequestStream::SimpleRequestStream
#   #
#   # for the given user_id,
#   # gets the user
#   # and then each of the requests in more_request_klasses
#   #
#   def each *args, &block
#     request_store.each do |*raw_job_args|
#       job = klass.new(*raw_job_args)
#       # do_faking(job)
#       job.each_request(*args, &block)
#     end
#   end
# end
48 changes: 25 additions & 23 deletions lib/wuclan/twitter/scrape/twitter_search_job.rb
@@ -1,23 +1,25 @@
# require 'monkeyshines/scrape_request'
# require 'monkeyshines/scrape_request/paginated'
# require 'monkeyshines/scrape_request/raw_json_contents'
# module Wuclan
# module Twitter
# module Scrape
# #
# # TwitterSearchJob for the twitter Search API
# #
# # * Manages a series of paginated requests from first result back to last item in
# #  previous scrape job.
# #
# #
# class TwitterSearchJob < Struct.new(
# :query_term,
# :priority,
# :prev_rate, :prev_items, :prev_span_min, :prev_span_max
# )
#
# end
# end
# end
# end
# Job driving the Twitter Search API scrape for one query term: walks result
# pages and uses the observed item rate to schedule its next visit.
class TwitterSearchJob < Edamame::Job
  # Pagination machinery from the Monkeyshines scrape-request core.
  include Monkeyshines::ScrapeRequestCore::Paginating
  include Monkeyshines::ScrapeRequestCore::PaginatedWithLimit
  include Monkeyshines::ScrapeRequestCore::PaginatedWithRate

  # The search API serves at most this many pages per query.
  self.hard_request_limit = 15

  # Items to aim for on each re-visit. With up to 50 items per page a target
  # of 1000 works out to roughly twenty page requests per visit; Twitter caps
  # a search at 1500 items, so 1000 leaves a safety margin.
  self.target_items_per_job = 1000

  # Builds the paginated request for one results page of this job's query
  # term. +info+ is accepted for interface parity and ignored.
  def request_for_page page, info=nil
    TwitterSearchRequest.new(obj[:key], page).tap do |req|
      req.url << "&rpp=#{req.max_items}"
      req.url << "&max_id=#{sess_span.min - 1}" if sess_span.min
    end
  end
end
1 change: 0 additions & 1 deletion lib/wuclan/twitter/scrape/twitter_search_request.rb
Expand Up @@ -11,7 +11,6 @@ class TwitterSearchRequest < Monkeyshines::ScrapeRequest
include Monkeyshines::RawJsonContents
# Pagination
include Monkeyshines::ScrapeRequestCore::Paginated
# include Monkeyshines::ScrapeRequestCore::PaginatedWithLimit
# API max items per response
self.max_items = 100
# API max pages
Expand Down
49 changes: 0 additions & 49 deletions lib/wuclan/twitter/scrape/twitter_search_request_stream.rb
@@ -1,33 +1,3 @@
# NOTE(review): superseded copy — this commit removes this definition from
# twitter_search_request_stream.rb; the maintained TwitterSearchJob now lives
# in twitter_search_job.rb. Documented only for reference.
class TwitterSearchJob < Edamame::Job
  #
  # Pagination
  #
  include Monkeyshines::ScrapeRequestCore::Paginating
  include Monkeyshines::ScrapeRequestCore::PaginatedWithLimit
  include Monkeyshines::ScrapeRequestCore::PaginatedWithRate
  # API max pages
  self.hard_request_limit = 15
  # Items to get each scrape. 1500 is the API max, so 1200 leaves a margin.
  self.target_items_per_job = 1200

  # Delegate scheduling attributes to the underlying scheduling object.
  def prev_max() self.scheduling.prev_max end
  def prev_max=(val) self.scheduling.prev_max = val end
  def prev_items() self.scheduling.prev_items end
  def prev_items=(val) self.scheduling.prev_items = val end
  def prev_items_rate() self.scheduling.prev_items_rate end
  def prev_items_rate=(val) self.scheduling.prev_items_rate = val end
  def delay() self.scheduling.delay end
  def delay=(val) self.scheduling.delay = val end

  # creates the paginated request
  # (NOTE(review): arity is (page) here vs (page, info=nil) in the new copy.)
  def request_for_page page
    req = TwitterSearchRequest.new(obj[:key], page)
    req.url << "&rpp=#{req.max_items}"
    req.url << "&max_id=#{sess_span.min - 1}" if sess_span.min
    req
  end
end

#
# TwitterSearchJob for the twitter Search API
#
Expand All @@ -49,25 +19,6 @@ def each *args, &block
end



#
# # span of previous scrape
# def prev_span
# @prev_span ||= UnionInterval.new(prev_span_min, prev_max)
# end
# def prev_span= min_max
# self.prev_span_min, self.prev_max = min_max.to_a
# @prev_span = UnionInterval.new(prev_span_min, prev_max)
# end
#
# def key
# query_term
# end
#
#
# def to_hash
# super().merge( 'type' => self.class.to_s, 'key' => query_term )
# end
#
# #
# #
Expand Down

0 comments on commit d2b80dc

Please sign in to comment.