scrape villa names

Ukaza Perdana committed Oct 15, 2019
1 parent 01b7e82 · commit 6204f25

Showing 3 changed files with 52 additions and 39 deletions.
2 changes: 1 addition & 1 deletion Gemfile
@@ -4,7 +4,7 @@
 
 source "https://rubygems.org"
 
-ruby "2.0.0"
+ruby "2.6.3"
 
 gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
 gem "mechanize"

44 changes: 27 additions & 17 deletions Gemfile.lock
@@ -10,38 +10,48 @@ GIT
 GEM
   remote: https://rubygems.org/
   specs:
-    domain_name (0.5.24)
+    connection_pool (2.2.2)
+    domain_name (0.5.20190701)
       unf (>= 0.0.5, < 1.0.0)
-    http-cookie (1.0.2)
+    http-cookie (1.0.3)
       domain_name (~> 0.5)
-    httpclient (2.6.0.1)
-    mechanize (2.7.3)
+    httpclient (2.8.3)
+    mechanize (2.7.6)
       domain_name (~> 0.5, >= 0.5.1)
       http-cookie (~> 1.0)
-      mime-types (~> 2.0)
+      mime-types (>= 1.17.2)
       net-http-digest_auth (~> 1.1, >= 1.1.1)
-      net-http-persistent (~> 2.5, >= 2.5.2)
-      nokogiri (~> 1.4)
+      net-http-persistent (>= 2.5.2)
+      nokogiri (~> 1.6)
       ntlm-http (~> 0.1, >= 0.1.1)
       webrobots (>= 0.0.9, < 0.2)
-    mime-types (2.5)
-    mini_portile (0.6.2)
-    net-http-digest_auth (1.4)
-    net-http-persistent (2.9.4)
-    nokogiri (1.6.6.2)
-      mini_portile (~> 0.6.0)
+    mime-types (3.3)
+      mime-types-data (~> 3.2015)
+    mime-types-data (3.2019.1009)
+    mini_portile2 (2.4.0)
+    net-http-digest_auth (1.4.1)
+    net-http-persistent (3.1.0)
+      connection_pool (~> 2.2)
+    nokogiri (1.10.4)
+      mini_portile2 (~> 2.4.0)
     ntlm-http (0.1.1)
-    sqlite3 (1.3.10)
-    sqlite_magic (0.0.3)
+    sqlite3 (1.4.1)
+    sqlite_magic (0.0.6)
       sqlite3
     unf (0.1.4)
       unf_ext
-    unf_ext (0.0.7.1)
-    webrobots (0.1.1)
+    unf_ext (0.0.7.6)
+    webrobots (0.1.2)
 
 PLATFORMS
   ruby
 
 DEPENDENCIES
   mechanize
   scraperwiki!
+
+RUBY VERSION
+   ruby 2.6.3p62
+
+BUNDLED WITH
+   1.17.2

45 changes: 24 additions & 21 deletions scraper.rb
@@ -1,25 +1,28 @@
 # This is a template for a Ruby scraper on morph.io (https://morph.io)
 # including some code snippets below that you should find helpful
 
-# require 'scraperwiki'
-# require 'mechanize'
-#
-# agent = Mechanize.new
-#
-# # Read in a page
-# page = agent.get("http://foo.com")
-#
-# # Find something on the page using css selectors
-# p page.at('div.content')
-#
-# # Write out to the sqlite database using scraperwiki library
-# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
-#
-# # An arbitrary query against the database
-# ScraperWiki.select("* from data where 'name'='peter'")
+require 'scraperwiki'
+require 'mechanize'
 
-# You don't have to do things with the Mechanize or ScraperWiki libraries.
-# You can use whatever gems you want: https://morph.io/documentation/ruby
-# All that matters is that your final data is written to an SQLite database
-# called "data.sqlite" in the current working directory which has at least a table
-# called "data".
+data = []
+agent = Mechanize.new
+start_url = "https://www.booking.com/searchresults.en-gb.html?aid=304142&label=gen173nr-1FCAEoggI46AdIM1gEaGiIAQGYAQm4AQfIAQzYAQHoAQH4AQuIAgGoAgO4Auv1le0FwAIB&sid=1876e63eb536b37149c5db2c76040480&tmpl=searchresults&ac_click_type=b&ac_position=0&class_interval=1&clear_ht_id=1&dest_id=835&dest_type=region&from_sf=1&group_adults=2&group_children=0&label_click=undef&nflt=ht_id%3D213%3B&no_rooms=1&percent_htype_hotel=1&raw_dest_type=region&room1=A%2CA&sb_price_type=total&search_selected=1&shw_aparth=1&slp_r_match=0&src=index&srpvid=c3523cc0b5a4008b&ss=Bali%2C%20Indonesia&ss_raw=Bali&ssb=empty&top_ufis=1&rows=25"
+
+page = agent.get(start_url)
+
+loop do
+  page.search(".sr_item.sr_item_new").each do |item|
+    hotel_name_link = item.at_css('.hotel_name_link.url')
+    ScraperWiki.save_sqlite(['href'], {
+      'href' => hotel_name_link[:href].strip,
+      'title' => hotel_name_link.at_css('.sr-hotel__name').text.strip,
+      'coordinates' => item.at_css('.bui-link').attr('data-coords').strip,
+      'address' => item.at_css('.bui-link > text()').to_s.strip,
+      'review_score' => item.at_css(".bui-review-score__badge")&.text&.strip
+    })
+  end
+
+  next_button = page.at_css('.bui-pagination__item.bui-pagination__next-arrow:not(.bui-pagination__item--disabled)')
+  break if next_button.nil?
+  page = agent.click(next_button)
+end
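
A note on the new scraper.rb above: ScraperWiki.save_sqlite(['href'], ...) uses the listing URL as the unique key, so re-running the scraper updates existing rows rather than inserting duplicates, and the loop ends once the pagination block no longer exposes an enabled next-arrow element. Rows are written to SQLite as they are scraped; the data = [] array declared at the top is not used afterwards. As a rough sketch (not part of the commit), the saved rows could be read back with the same scraperwiki gem, assuming morph.io's default data.sqlite database and data table; the LIMIT and the printed columns here are arbitrary choices:

require 'scraperwiki'

# Each row should come back as a Hash keyed by column name.
rows = ScraperWiki.select("* from data limit 5")
rows.each do |row|
  puts "#{row['title']} (#{row['review_score']}) #{row['href']}"
end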
