README updates; doctype and preprocessing fixes

xwmx · Jul 7, 2009 · 8fdf113 · 8fdf113
1 parent 555d017
commit 8fdf113
Show file tree

Hide file tree

Showing 3 changed files with 71 additions and 15 deletions.
diff --git a/README.rdoc b/README.rdoc
@@ -27,7 +27,7 @@ broken too bad) be readable but will be lacking any metadata or TOC.
 
 A few examples:
 
-* Project Gutenberg's THE ADVENTURES OF SHERLOCK HOLMES (with proper table of contents)
+* Project Gutenberg's The Adventures Of Sherlock Holmes (with proper table of contents)
 
     repub -x 'title:div[@class="book"]//h1' \
       -x 'toc://table' \
@@ -69,14 +69,22 @@ For example, if you later decide to regenerate Git Manual ePub without TOC at th
 
 A few more examples:
 
+* Open Packaging Format (OPF) 2.0 (one of the ePub standards, in ePub)
+
+    repub -x 'title://p[@class="Title"]' \
+      -x 'toc://div[@class="TOC"]' \
+      -x 'toc_item:.//p' \
+      -x 'toc_section:.//div[@class="TOCSection"]' \
+      http://www.idpf.org/2007/opf/OPF_2.0_final_spec.html
+
 * GNU Wget Manual
 
     repub -m 'creator:gnu.org' \
       -x 'title://h1' -x 'toc://div[@class="contents"]/ul' -x 'toc_item:li' -x 'toc_section:ul' \
       -X '//div[@class="contents"]' \
       http://www.gnu.org/software/wget/manual/wget.html
 
-* Project Gutenberg's ALICE'S ADVENTURES IN WONDERLAND
+* And finally, the "Hello World" of e-books, Alice's Adventures In Wonderland
 
     repub -x 'title:body/h1' -x 'toc://table' -x 'toc_item://tr' -X '//pre' -X '//hr' -X '//body/h4' \
       http://www.gutenberg.org/files/11/11-h/11-h.htm
@@ -150,7 +158,7 @@ Bugs: probably. If you find any, please report them to dg at invisiblellama dot
 
 == INSTALL:
 
-    gem install repub
+    sudo gem install repub
 
 == LICENSE:
 

diff --git a/lib/repub/app/builder.rb b/lib/repub/app/builder.rb
@@ -138,12 +138,21 @@ def postprocess_file(asset)
             source.gsub!(Regexp.new(pattern), replacement)
           end if @options[:rx]
 
-          # Add doctype if missing
-          if source !~ /\s*<!DOCTYPE/
-            log.debug "-- Adding missing doctype"
-            source = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + source
+          # Remove xml preamble if any
+          preamble_rx = /^\s*<\?xml\s+[^>]+>\s*/mi
+          if source =~ preamble_rx
+            log.debug "-- Removing xml preamble"
+            source.sub!(preamble_rx, '')
           end
-
+
+          # Replace doctype
+          doctype_rx = /^\s*<!DOCTYPE\s+[^>]+>\s*/mi
+          if source =~ doctype_rx
+            source.sub!(doctype_rx, '')
+          end
+          log.debug "-- Replacing doctype"
+          source = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + source
+
           # Save processed file
           File.open(asset, 'w') do |f|
             f.write(source)
@@ -152,7 +161,7 @@ def postprocess_file(asset)
 
         def postprocess_doc(asset)
           doc = Nokogiri::HTML.parse(IO.read(asset), nil, 'UTF-8')
-
+          
           # Set Content-Type charset to UTF-8
           doc.xpath('//head/meta[@http-equiv="Content-Type"]').each do |el|
             el['content'] = 'text/html; charset=utf-8'
@@ -203,6 +212,13 @@ def postprocess_doc(asset)
             log.info "Removing elements \"#{selector}\""
             doc.search(selector).remove
           end if @options[:remove]
+
+          # XXX
+          # doc.xpath('//body/a').each do |a|
+          #   wrapper = Nokogiri::XML::Node.new('p', doc)
+          #   a.add_next_sibling(wrapper)
+          #   wrapper << a
+          # end
 
           # Save processed doc
           File.open(asset, 'w') do |f|

diff --git a/lib/repub/app/fetcher.rb b/lib/repub/app/fetcher.rb
@@ -71,22 +71,54 @@ def fetch
 
         private
 
+        # HACK HACK HACK
+        # ADE seems to have problems following the TOC in content files with an .htm extension.
+        # Rename these files to .html and fix the references inside them.
+        #
         def fix_filenames(cache)
-          # TODO: fix non-alphanum characters in doc filenames
+          # # TODO: fix non-alphanum characters in doc filenames
+          # documents = []
+          # cache.assets[:documents].each do |file_name|
+          #   if file_name =~ /\.htm$/i
+          #     proper_name = file_name.gsub($&, '.html')
+          #     FileUtils.mv(file_name, proper_name)
+          #     s = IO.read(proper_name)
+          #     raise FetcherException, "empty document" unless s
+          #     s.gsub!(file_name, proper_name)
+          #     File.open(proper_name, 'w') { |f| f.write(s) }
+          #     documents << proper_name
+          #   else
+          #     documents << file_name
+          #   end
+          # end
+          # cache.assets[:documents] = documents
+
+          # XXX
+          cache.assets[:documents].each do |file_name|
+            s = IO.read(file_name)
+            m = s.scan(/\s+(?:id|name)\s*?=\s*?['"](\d+[^'"]*)['"]/im)
+            unless m.empty?
+              m.each do |i|
+                s.gsub!(i[0], "a#{i[0]}")
+              end
+              File.open(file_name, 'w') { |f| f.write(s) }
+            end
+          end
+
         end
 
         def fix_encoding(cache, encoding = nil)
-          cache.assets[:documents].each do |doc|
+          cache.assets[:documents].each do |file_name|
             unless encoding
-              log.info "Detecting encoding for #{doc}"
-              s = IO.read(doc)
+              log.info "Detecting encoding for #{file_name}"
+              s = IO.read(file_name)
               raise FetcherException, "empty document" unless s
               encoding = UniversalDetector.chardet(s)['encoding']
             end
             if encoding.downcase != 'utf-8'
               log.info "Source encoding appears to be #{encoding}, converting to UTF-8"
-              s = Iconv.conv('utf-8', encoding, IO.read(doc))
-              File.open(doc, 'w') { |f| f.write(s) }
+              s = Iconv.conv('utf-8', encoding, s)
+              File.open(file_name, 'w') { |f| f.write(s) }
             end
           end
         end