Skip to content

Commit

Permalink
use Bundler for gem; require guess_html_encoding for Ruby 1.9.2
Browse files Browse the repository at this point in the history
  • Loading branch information
cantino committed Oct 4, 2011
1 parent 15cb43c commit 975ce96
Show file tree
Hide file tree
Showing 14 changed files with 86 additions and 147 deletions.
28 changes: 6 additions & 22 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,23 +1,7 @@
pkg/*
=======
## MAC OS
.DS_Store

## TEXTMATE
*.tmproj
tmtags

## EMACS
*~
\#*
.\#*

## VIM
*.swp

## PROJECT::GENERAL
coverage
rdoc
pkg

## PROJECT::SPECIFIC
.gem
.bundle
Gemfile.lock
pkg/*
.idea
.rvmrc
4 changes: 4 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
source "http://rubygems.org"

# Specify your gem's dependencies in ruby-readability.gemspec
gemspec
17 changes: 16 additions & 1 deletion README
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,28 @@ Example:
source = open('http://lab.arc90.com/experiments/readability/').read
puts Readability::Document.new(source).content

There is also a command-line tool for testing readability in bin/readability.
Options:

You may provide additional options to Readability::Document.new, including:

:tags - the base whitelist of tags to sanitize, defaults to %w[div p]
:remove_empty_nodes - remove <p> tags that have no text content; this will also remove p tags that contain only images
:attributes - whitelist of allowed attributes
:debug - provide debugging output, defaults false
:encoding - if this page is of a known encoding, you can specify it; if left unspecified, the encoding will be guessed (only in Ruby 1.9.2)
:html_headers - in Ruby 1.9.2 these will be passed to the guess_html_encoding gem to aid with encoding guessing

Readability comes with a command-line tool for experimentation in bin/readability.

Usage: readability [options] URL
-d, --debug Show debug output
-i, --images Keep images and links
-h, --help Show this message

Potential issues:

* If you're on a Mac and are getting segmentation faults, see this discussion https://github.com/tenderlove/nokogiri/issues/404 and consider updating your version of libxml2.

===

This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0
Expand Down
45 changes: 3 additions & 42 deletions Rakefile
Original file line number Diff line number Diff line change
@@ -1,45 +1,6 @@
require 'rubygems'
require 'rake'
require "bundler/gem_tasks"
require 'rspec/core/rake_task'

begin
require 'jeweler'
Jeweler::Tasks.new do |gem|
gem.name = "ruby-readability"
gem.summary = %Q{Port of arc90's readability project to ruby}
gem.description = %Q{Port of arc90's readability project to ruby}
gem.email = "andrew@iterationlabs.com"
gem.homepage = "http://github.com/iterationlabs/ruby-readability"
gem.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
gem.add_development_dependency "rspec", ">= 1.2.9"
gem.add_dependency 'nokogiri', '>= 1.4.2'
end
Jeweler::GemcutterTasks.new
rescue LoadError
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
spec.libs << 'lib' << 'spec'
spec.spec_files = FileList['spec/**/*_spec.rb']
end

Spec::Rake::SpecTask.new(:rcov) do |spec|
spec.libs << 'lib' << 'spec'
spec.pattern = 'spec/**/*_spec.rb'
spec.rcov = true
end

task :spec => :check_dependencies
RSpec::Core::RakeTask.new(:spec)

task :default => :spec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
version = File.exist?('VERSION') ? File.read('VERSION') : ""

rdoc.rdoc_dir = 'rdoc'
rdoc.title = "ruby-readability #{version}"
rdoc.rdoc_files.include('README*')
rdoc.rdoc_files.include('lib/**/*.rb')
end
1 change: 0 additions & 1 deletion VERSION

This file was deleted.

16 changes: 11 additions & 5 deletions lib/readability.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
require 'rubygems'
require 'nokogiri'
require 'guess_html_encoding'

module Readability
class Document
Expand All @@ -9,24 +10,29 @@ class Document
:remove_unlikely_candidates => true,
:weight_classes => true,
:clean_conditionally => true,
:remove_empty_nodes => true,
:encoding => 'UTF-8'
:remove_empty_nodes => true
}.freeze

attr_accessor :options, :html

def initialize(input, options = {})
@input = input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
@options = DEFAULT_OPTIONS.merge(options)
@input = input

if RUBY_VERSION == "1.9.2" && !@options[:encoding]
@input = GuessHtmlEncoding.encode(@input, @options[:html_headers]) unless @options[:do_not_guess_encoding]
@options[:encoding] = @input.encoding.to_s
end

@input = @input.gsub(REGEXES[:replaceBrsRe], '</p><p>').gsub(REGEXES[:replaceFontsRe], '<\1span>')
@remove_unlikely_candidates = @options[:remove_unlikely_candidates]
@weight_classes = @options[:weight_classes]
@clean_conditionally = @options[:clean_conditionally]
@encoding = @options[:encoding]
make_html
end

def make_html
@html = Nokogiri::HTML(@input, nil, @encoding)
@html = Nokogiri::HTML(@input, nil, @options[:encoding])
end

REGEXES = {
Expand Down
1 change: 1 addition & 0 deletions lib/ruby-readability.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
require 'readability'
73 changes: 16 additions & 57 deletions ruby-readability.gemspec
Original file line number Diff line number Diff line change
@@ -1,64 +1,23 @@
# Generated by jeweler
# DO NOT EDIT THIS FILE DIRECTLY
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)

Gem::Specification.new do |s|
s.name = %q{ruby-readability}
s.version = "0.2.4"

s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
s.date = %q{2011-06-09}
s.default_executable = %q{readability}
s.name = "ruby-readability"
s.version = '0.2.5'
s.authors = ["Andrew Cantino", "starrhorne", "libc", "Kyle Maxwell"]
s.email = ["andrew@iterationlabs.com"]
s.homepage = "http://github.com/iterationlabs/ruby-readability"
s.summary = %q{Port of arc90's readability project to ruby}
s.description = %q{Port of arc90's readability project to ruby}
s.email = %q{andrew@iterationlabs.com}
s.executables = ["readability"]
s.extra_rdoc_files = [
"README"
]
s.files = [
".document",
"README",
"Rakefile",
"VERSION",
"bin/readability",
"lib/readability.rb",
"ruby-readability.gemspec",
"spec/fixtures/cant_read.html",
"spec/fixtures/sample.html",
"spec/fixtures/samples/blogpost_with_links-fragments.rb",
"spec/fixtures/samples/blogpost_with_links.html",
"spec/fixtures/samples/channel4-1-fragments.rb",
"spec/fixtures/samples/channel4-1.html",
"spec/fixtures/samples/foxnews-india1-fragments.rb",
"spec/fixtures/samples/foxnews-india1.html",
"spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb",
"spec/fixtures/samples/globemail-ottawa-cuts.html",
"spec/fixtures/should_not_truncate.txt",
"spec/readability_spec.rb",
"spec/spec.opts",
"spec/spec_helper.rb"
]
s.homepage = %q{http://github.com/iterationlabs/ruby-readability}
s.require_paths = ["lib"]
s.rubygems_version = %q{1.3.7}
s.summary = %q{Port of arc90's readability project to ruby}

if s.respond_to? :specification_version then
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
s.specification_version = 3
s.rubyforge_project = "ruby-readability"

if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
s.add_runtime_dependency(%q<nokogiri>, [">= 1.4.2"])
else
s.add_dependency(%q<rspec>, [">= 1.2.9"])
s.add_dependency(%q<nokogiri>, [">= 1.4.2"])
end
else
s.add_dependency(%q<rspec>, [">= 1.2.9"])
s.add_dependency(%q<nokogiri>, [">= 1.4.2"])
end
end
s.files = `git ls-files`.split("\n")
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
s.require_paths = ["lib"]

s.add_development_dependency "rspec", ">= 2.6"
s.add_dependency 'nokogiri', '>= 1.4.2'
s.add_dependency 'guess_html_encoding', '> 0.0.0'
end
1 change: 1 addition & 0 deletions spec/fixtures/samples/blogpost_with_links-fragments.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# encoding: UTF-8
# This sample originally from http://softarhive.net

$required_fragments = [
Expand Down
2 changes: 1 addition & 1 deletion spec/fixtures/samples/channel4-1-fragments.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@

# encoding: UTF-8
# This sample originally from http://www.channel4.com/news/articles/world/judge+rules+briton+can+be+forcefed/3578372

$required_fragments = [
Expand Down
2 changes: 1 addition & 1 deletion spec/fixtures/samples/foxnews-india1-fragments.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@

# encoding: UTF-8
# This sample originally from http://www.foxnews.com/world/2010/05/14/police-killed-bus-touches-high-voltage-wire-central-india/?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed:+foxnews/latest+(Text+-+Latest+Headlines)

$required_fragments = [
Expand Down
2 changes: 1 addition & 1 deletion spec/fixtures/samples/globemail-ottawa-cuts-fragments.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@

# encoding: UTF-8
# Originally from http://www.theglobeandmail.com/news/national/ottawa-cuts-already-vacant-positions/article1494400/

$required_fragments = [
Expand Down
30 changes: 20 additions & 10 deletions spec/readability_spec.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
require File.expand_path(File.join(File.dirname(__FILE__), "spec_helper"))
require 'spec_helper'

describe Readability do
before do
Expand Down Expand Up @@ -115,11 +115,9 @@
b[:content_score] <=> a[:content_score]
}.first[:elem][:id].should == "body"
end
end

describe "score_paragraphs" do
context "when two consequent br tags are used instead of p" do
before :each do
it "should assign the higher score to the first paragraph in this particular example" do
@doc = Readability::Document.new(<<-HTML)
<html>
<head>
Expand All @@ -140,9 +138,6 @@
</html>
HTML
@candidates = @doc.score_paragraphs(0)
end

it "should assign the higher score to the first paragraph in this particular example" do
@candidates.values.sort_by { |a| -a[:content_score] }.first[:elem][:id].should == 'post1'
end
end
Expand Down Expand Up @@ -204,14 +199,13 @@
end

it "should output expected fragments of text" do

checks = 0
@samples.each do |sample|
html = File.read(File.dirname(__FILE__) + "/fixtures/samples/#{sample}.html")
doc = Readability::Document.new(html).content

load "fixtures/samples/#{sample}-fragments.rb"
puts "testing #{sample}..."
#puts "testing #{sample}..."

$required_fragments.each do |required_text|
doc.should include(required_text)
Expand All @@ -223,7 +217,23 @@
checks += 1
end
end
puts "Performed #{checks} checks."
#puts "Performed #{checks} checks."
end
end

describe "encoding guessing" do
context "with ruby 1.9.2" do
it "should correctly guess and enforce HTML encoding" do

end

it "should allow encoding guessing to be skipped" do

end

it "should allow encoding guessing to be overridden" do

end
end
end
end
11 changes: 5 additions & 6 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
$LOAD_PATH.unshift(File.dirname(__FILE__))
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
#$LOAD_PATH.unshift(File.dirname(__FILE__))
#$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
require 'rubygems'
require 'readability'
require 'spec'
require 'spec/autorun'

Spec::Runner.configure do |config|
#require 'spec'
#require 'spec/autorun'

RSpec.configure do |c|
end

0 comments on commit 975ce96

Please sign in to comment.