Merge pull request #9 from ageitgey/fix-wikipedia

Fix #8 - text getting dropped in wikipedia articles
ageitgey · Jul 22, 2014 · 27e5f96 · 27e5f96
2 parents 8138584 + 109bab9
commit 27e5f96
Show file tree

Hide file tree

Showing 8 changed files with 1,785 additions and 15 deletions.
diff --git a/fixtures/test_github1.html b/fixtures/test_github1.html
diff --git a/fixtures/test_wikipedia1.html b/fixtures/test_wikipedia1.html
diff --git a/lib/extractor.js b/lib/extractor.js
diff --git a/lib/formatter.js b/lib/formatter.js
diff --git a/package.json b/package.json
@@ -32,7 +32,8 @@
   "dependencies": {
     "cheerio": "~0.16.0",
     "optimist": "~0.6.1",
-    "lodash": "~2.4.1"
+    "lodash": "~2.4.1",
+    "xregexp": "~2.0.0"
   },
   "devDependencies": {
     "coffee-script-redux": "2.0.0-beta7",

diff --git a/src/extractor.coffee b/src/extractor.coffee
@@ -452,7 +452,8 @@ postCleanup = (doc, targetNode, lang) ->
   node.children().each () ->
     e = doc(this)
     eTag = e[0].name
-    if eTag != 'p'
+    if eTag not in ['p', 'a']
       if isHighlinkDensity(doc, e) || isTableAndNoParaExist(doc, e) || !isNodescoreThresholdMet(doc, node, e)
         doc(e).remove()
+
   return node
diff --git a/src/formatter.coffee b/src/formatter.coffee
@@ -1,5 +1,6 @@
 stopwords = require("./stopwords")
 _ = require("lodash")
+{XRegExp} = require('xregexp')
 
 module.exports = formatter = (doc, topNode, language) ->
   removeNegativescoresNodes(doc, topNode)
@@ -20,22 +21,55 @@ replaceWithText = (doc, topNode) ->
   nodes.each () ->
     doc(this).replaceWith(doc(this).text())
 
+cleanParagraphText = (rawText) ->
+  txt = rawText.trim()
+  txt.replace(/[\s\t]+/g, ' ')
+  txt
+
 # Turn an html element (and children) into nicely formatted text
 convertToText = (doc, topNode) ->
   txts = []
-  nodes = topNode.children()
+  nodes = topNode.contents()
+
+  # To hold any text fragments that end up in text nodes outside of
+  # html elements
+  hangingText = ""
+
   nodes.each () ->
     node = doc(this)
+    nodeType = node[0].type
+
+    # Handle top level text nodes by adding them to a running list
+    # and then treating all the hanging nodes as one paragraph tag
+    if nodeType == "text"
+      hangingText += node.text()
+      # Same as 'continue'
+      return true
+
+    # If we hit a real node and still have extra accumulated text,
+    # pop it out as if it was a paragraph tag
+    if hangingText.length > 0
+      txt = cleanParagraphText(hangingText)
+      txts = txts.concat(txt.split(/\r?\n/))
+      hangingText = ""
+
+    txt = cleanParagraphText(node.text())
+    txts = txts.concat(txt.split(/\r?\n/))
 
-    txt = node.text().trim()
-    txt.replace(/[\s\t]+/g, ' ')
+  # Catch any left-over hanging text nodes
+  if hangingText.length > 0
+    txt = cleanParagraphText(hangingText)
     txts = txts.concat(txt.split(/\r?\n/))
 
   txts = _.map txts, (txt) ->
     txt.trim()
 
+  # Make sure each text chunk includes at least one text character or number.
+  # This supports words in multiple languages by using XRegExp to generate
+  # a regex that matches the ranges of Unicode characters used in words.
+  regex = XRegExp('[\\p{Number}\\p{Letter}]')
   txts = _.filter txts, (txt) ->
-    (/[a-zA-Z0-9]/.test(txt))
+    regex.test(txt)
 
   txts.join('\n\n')
 
@@ -69,7 +103,6 @@ removeFewwordsParagraphs = (doc, topNode, language) ->
     stopWords = stopwords(text, language)
     if (tag != 'br' || text != '\\r') && stopWords.stopwordCount < 3 && el.find("object").length == 0 && el.find("embed").length == 0
       doc(el).remove()
-
     else
       trimmed = text.trim()
       if trimmed[0] == "(" && trimmed[trimmed.length - 1] == ")"

diff --git a/test/formatter.coffee b/test/formatter.coffee
@@ -13,3 +13,12 @@ suite 'Formatter', ->
 
     formatter(origDoc, origDoc('body'), 'en')
     eq origDoc("a").length, 0
+
+  test 'doesn\'t drop text nodes accidentally', ->
+    html = fs.readFileSync("./fixtures/test_wikipedia1.html").toString()
+    doc = cheerio.load(html)
+
+    formatter(doc, doc('body'), 'en')
+    html = doc.html()
+    # This text was getting dropped by the formatter
+    ok /is a thirteen episode anime series directed by Akitaro Daichi and written by Hideyuki Kurata/.test(html)