Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit b67373a
Showing
10 changed files
with
1,397 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,4 @@ | |||
rdoc/* | |||
pkg/* | |||
/Gemfile.lock | |||
/.rbenv-version |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,7 @@ | |||
AUTHORS | |||
|
|||
British Broadcasting Corporation | |||
-------------------------------- | |||
|
|||
- Chris Lowis <chris.lowis at bbc.co.uk> | |||
|
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,3 @@ | |||
# Fetch gems from the official RubyGems index over HTTPS. (The old
# `source :rubygems` symbol shortcut is deprecated by Bundler.)
source "https://rubygems.org"

# Runtime and development dependencies are declared in the gemspec.
gemspec
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,27 @@ | |||
= On This Day | |||
|
|||
A simple Ruby parser for Wikipedia's "On this day" box on the | |||
(English-language) homepage. | |||
|
|||
== Usage | |||
|
|||
Install the gem | |||
|
|||
gem install onthisday | |||
|
|||
Then fetch the current "On this day" items | |||
|
|||
@onthisday = OnThisDay::Parser.new | |||
items = @onthisday.items | |||
|
|||
Items have text | |||
|
|||
items.first.text #=> "French Revolution: Meeting on a tennis court near the Palace of Versailles, members of France's Third Estate took the Tennis Court Oath, pledging not to separate until a new constitution was established." | |||
|
|||
A year | |||
|
|||
items.first.year #=> 1789 | |||
|
|||
And associated topics (other related Wikipedia pages) | |||
|
|||
items.first.topics #=> ['French_Revolution', 'Palace_of_Versailles', 'Estates_of_the_realm', 'Tennis_Court_Oath', 'Constitution'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,79 @@ | |||
require "rubygems" | |||
require "rubygems/package_task" | |||
require "rdoc/task" | |||
|
|||
require "rake/testtask"

# Defines the `test` task: adds the test directory to the load path and
# runs every file matching test/**/*_test.rb, verbosely.
Rake::TestTask.new do |test_task|
  test_task.libs.push("test")
  test_task.test_files = FileList["test/**/*_test.rb"]
  test_task.verbose = true
end

# Running plain `rake` runs the test suite.
task :default => [:test]
|
|||
# The gem specification. It is used by the packaging task and is also
# serialised to a static .gemspec file by the `gemspec` task. For the
# meaning of each field see the RubyGems specification reference:
#
#   https://guides.rubygems.org/specification-reference/
#
spec = Gem::Specification.new do |s|
  # Identity and metadata
  s.name     = "onthisday"
  s.version  = "0.1.0"
  s.summary  = "A little parser for Wikipedia's 'On This Day' content block"
  s.author   = "Chris Lowis"
  s.email    = "chris.lowis@gmail.com"
  s.homepage = "http://github.com/bbcrd/onthisday"

  # `has_rdoc=` is intentionally not set: it is deprecated and no longer
  # available in newer RubyGems releases.

  # Files shipped in the gem. The glob also matches directories (for
  # example test/fixtures), which are not packageable entries, so keep
  # regular files only.
  s.files = %w(Gemfile README.rdoc) +
            Dir.glob("{test,lib}/**/*").select { |path| File.file?(path) }
  s.require_paths = ["lib"]

  # Runtime dependencies
  s.add_dependency("nokogiri")
  s.add_dependency("rest-client")

  # Gems needed only to run the tests
  s.add_development_dependency("webmock")
end
|
|||
# Packaging task: builds the .gem from the specification held in `spec`.
Gem::PackageTask.new(spec) do |package_task|
  package_task.gem_spec = spec
end

# Writes the specification out as a static <name>.gemspec file next to
# this Rakefile. A checked-in gemspec is useful when the gem is consumed
# from a git source via Bundler, or built without rake
# (`gem build <name>.gemspec`).
desc "Build the gemspec file #{spec.name}.gemspec"
task :gemspec do
  gemspec_path = "#{File.dirname(__FILE__)}/#{spec.name}.gemspec"
  File.open(gemspec_path, "w") { |io| io << spec.to_ruby }
end

# Regenerate the static gemspec whenever the gem is packaged. Remove this
# line if you do not want the .gemspec file generated.
task :package => :gemspec
|
|||
# Generate RDoc documentation into rdoc/, with the README as the main
# page. The library sources are registered once (a second, duplicate
# include of "lib/**/*.rb" was redundant).
RDoc::Task.new do |rd|
  rd.main = "README.rdoc"
  rd.rdoc_files.include("README.rdoc", "lib/**/*.rb")
  rd.rdoc_dir = "rdoc"
end
|
|||
desc 'Clear out RDoc and generated packages'
task :clean => [:clobber_rdoc, :clobber_package] do
  # rm_f rather than rm: the generated gemspec may not exist yet, and a
  # missing file should not make `rake clean` fail.
  rm_f "#{spec.name}.gemspec"
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,79 @@ | |||
# encoding: utf-8 | |||
|
|||
require 'rest_client' | |||
require 'nokogiri' | |||
|
|||
module OnThisDay | |||
# A single entry of Wikipedia's "On this day" box, wrapping the node it
# was parsed from (the parser passes in <li> nodes).
#
# Construction mutates the wrapped node: the contents of "noprint"
# descendants are removed, and any directly nested link whose title is a
# year is removed after its title has been recorded as the item's year.
class Item
  # Link title that looks like a year, e.g. "1879" or "632".
  YEAR_TITLE = /\A\d{1,4}\z/

  # The " – " separator left at the very start of the content once the
  # year link has been removed.
  LEADING_SEPARATOR = /\A\s*–\s*/

  # Path prefix of links to other articles.
  WIKI_PATH_PREFIX = %r{\A/wiki/}

  # element - the node for one entry; it must respond to xpath,
  #           inner_text and inner_html.
  def initialize(element)
    @element = element
    @year = nil
    remove_noprint_elements!
    set_and_remove_year!
  end

  # Empty every descendant whose class starts with "noprint". This drops
  # the boilerplate (Wikinews, obituaries, etc.). The XPath is relative to
  # this item's element, so nodes elsewhere in the document are untouched.
  def remove_noprint_elements!
    @element.xpath('.//*[starts-with(@class,"noprint")]').each do |node|
      node.children.remove
    end
  end

  # Returns the item's year as an Integer, or 0 when no year link was found.
  def year
    @year.to_i
  end

  # Look at the links that are direct children of the element. When a
  # link's title looks like a year, record it as this item's year and
  # remove the link node. Links without a title attribute are skipped.
  def set_and_remove_year!
    @element.xpath('./a').each do |node|
      title = node['title']
      next unless title && title =~ YEAR_TITLE

      @year = title
      node.remove
    end
  end

  # Plain text of the item without the leading " – " separator. Dashes
  # inside the description itself are preserved.
  def text
    @element.inner_text.sub(LEADING_SEPARATOR, '')
  end

  # Inner HTML of the item without the leading " – " separator.
  def html
    @element.inner_html.sub(LEADING_SEPARATOR, '')
  end

  # Recursively collect every link in this element and return the link
  # targets with a leading "/wiki/" removed. Anchors without an href are
  # ignored.
  def topics
    hrefs = @element.xpath('.//a').map { |a| a.attr('href') }.compact
    hrefs.map { |href| href.sub(WIKI_PATH_PREFIX, '') }
  end
end
|
|||
# Fetches the English Wikipedia main page and extracts the entries of its
# "On this day" box.
class Parser
  def initialize
  end

  # Returns one Item per <li> in the "mp-otd" box.
  def items
    elements = doc.xpath("//div[@id='mp-otd']/ul/li")
    elements.map { |e| Item.new(e) }
  end

  # Parses the page content into a fresh HTML document on every call.
  # Deliberately not memoised: building an Item removes nodes from the
  # document, so a cached document would give different items on a
  # second call.
  def doc
    Nokogiri::HTML(content)
  end

  # URL of the page that is fetched.
  def wikipedia_url
    "http://en.wikipedia.org/wiki/Main_Page"
  end

  # Returns the page body, fetching it over HTTP on first use only.
  def content
    @content ||= fetch_content
  end

  private

  # Performs the HTTP request. The proxy from the http_proxy environment
  # variable is applied only when that variable is set, so a proxy the
  # host application configured on RestClient is not overwritten with nil.
  def fetch_content
    proxy = ENV['http_proxy']
    RestClient.proxy = proxy if proxy && !proxy.empty?
    RestClient.get wikipedia_url
  end
end
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Original file line | Diff line number | Diff line change |
---|---|---|---|
@@ -0,0 +1,34 @@ | |||
# -*- encoding: utf-8 -*- | |||
|
|||
# Static gem specification for onthisday.
#
# NOTE(review): the Rakefile's specification declares version "0.1.0"
# while this file says "0.1.4" — confirm which is intended and regenerate
# this file with `rake gemspec` if the Rakefile is the source of truth.
Gem::Specification.new do |s|
  s.name = "onthisday"
  s.version = "0.1.4"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Chris Lowis"]
  s.date = "2012-06-20"
  s.email = "chris.lowis@gmail.com"
  # Regular files only; the bare "test/fixtures" directory entry is not a
  # packageable file and has been dropped.
  s.files = ["Gemfile", "README.rdoc", "test/fixtures/main_page_20120620.html", "test/onthisday_test.rb", "lib/onthisday.rb"]
  s.homepage = "http://github.com/bbcrd/onthisday"
  s.require_paths = ["lib"]
  s.rubygems_version = "1.8.24"
  # This gem parses the "On This Day" box (the previous summary described
  # the "In the News" box).
  s.summary = "A little parser for Wikipedia's 'On This Day' content block"

  if s.respond_to? :specification_version then
    s.specification_version = 3

    # RubyGems >= 1.2.0 distinguishes runtime from development
    # dependencies; older versions only know plain dependencies.
    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
      s.add_runtime_dependency(%q<rest-client>, [">= 0"])
      s.add_development_dependency(%q<webmock>, [">= 0"])
    else
      s.add_dependency(%q<nokogiri>, [">= 0"])
      s.add_dependency(%q<rest-client>, [">= 0"])
      s.add_dependency(%q<webmock>, [">= 0"])
    end
  else
    s.add_dependency(%q<nokogiri>, [">= 0"])
    s.add_dependency(%q<rest-client>, [">= 0"])
    s.add_dependency(%q<webmock>, [">= 0"])
  end
end
Oops, something went wrong.