Skip to content

Commit

Permalink
Merge pull request #9 from ageitgey/fix-wikipedia
Browse files Browse the repository at this point in the history
Fix #8 - text getting dropped in wikipedia articles
  • Loading branch information
ageitgey committed Jul 22, 2014
2 parents 8138584 + 109bab9 commit 27e5f96
Show file tree
Hide file tree
Showing 8 changed files with 1,785 additions and 15 deletions.
1,069 changes: 1,069 additions & 0 deletions fixtures/test_github1.html

Large diffs are not rendered by default.

635 changes: 635 additions & 0 deletions fixtures/test_wikipedia1.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lib/extractor.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

36 changes: 29 additions & 7 deletions lib/formatter.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@
"dependencies": {
"cheerio": "~0.16.0",
"optimist": "~0.6.1",
"lodash": "~2.4.1"
"lodash": "~2.4.1",
"xregexp": "~2.0.0"
},
"devDependencies": {
"coffee-script-redux": "2.0.0-beta7",
Expand Down
3 changes: 2 additions & 1 deletion src/extractor.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,8 @@ postCleanup = (doc, targetNode, lang) ->
node.children().each () ->
e = doc(this)
eTag = e[0].name
if eTag != 'p'
if eTag not in ['p', 'a']
if isHighlinkDensity(doc, e) || isTableAndNoParaExist(doc, e) || !isNodescoreThresholdMet(doc, node, e)
doc(e).remove()

return node
43 changes: 38 additions & 5 deletions src/formatter.coffee
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
stopwords = require("./stopwords")
_ = require("lodash")
{XRegExp} = require('xregexp')

module.exports = formatter = (doc, topNode, language) ->
removeNegativescoresNodes(doc, topNode)
Expand All @@ -20,22 +21,55 @@ replaceWithText = (doc, topNode) ->
nodes.each () ->
doc(this).replaceWith(doc(this).text())

# Trim leading and trailing whitespace from a raw text fragment and return it.
# NOTE(review): String.prototype.replace returns a new string and never
# mutates its receiver, so the result of the replace call below is discarded
# and runs of internal whitespace are NOT collapsed. Confirm the intent before
# assigning the result back: \s also matches newlines, so a literal fix would
# defeat the split(/\r?\n/) that convertToText applies to the returned text.
cleanParagraphText = (rawText) ->
txt = rawText.trim()
txt.replace(/[\s\t]+/g, ' ')
txt

# Turn an html element (and children) into nicely formatted text
convertToText = (doc, topNode) ->
txts = []
nodes = topNode.children()
nodes = topNode.contents()

# To hold any text fragments that end up in text nodes outside of
# html elements
hangingText = ""

nodes.each () ->
node = doc(this)
nodeType = node[0].type

# Handle top level text nodes by adding them to a running list
# and then treating all the hanging nodes as one paragraph tag
if nodeType == "text"
hangingText += node.text()
# Same as 'continue'
return true

# If we hit a real node and still have extra accumulated text,
# pop it out as if it were a paragraph tag
if hangingText.length > 0
txt = cleanParagraphText(hangingText)
txts = txts.concat(txt.split(/\r?\n/))
hangingText = ""

txt = cleanParagraphText(node.text())
txts = txts.concat(txt.split(/\r?\n/))

txt = node.text().trim()
txt.replace(/[\s\t]+/g, ' ')
# Catch any left-over hanging text nodes
if hangingText.length > 0
txt = cleanParagraphText(hangingText)
txts = txts.concat(txt.split(/\r?\n/))

txts = _.map txts, (txt) ->
txt.trim()

# Make sure each text chunk includes at least one letter or number.
# This supports words in multiple languages by using XRegExp to build a
# regex that matches the ranges of Unicode characters used in words.
regex = XRegExp('[\\p{Number}\\p{Letter}]')
txts = _.filter txts, (txt) ->
(/[a-zA-Z0-9]/.test(txt))
regex.test(txt)

txts.join('\n\n')

Expand Down Expand Up @@ -69,7 +103,6 @@ removeFewwordsParagraphs = (doc, topNode, language) ->
stopWords = stopwords(text, language)
if (tag != 'br' || text != '\\r') && stopWords.stopwordCount < 3 && el.find("object").length == 0 && el.find("embed").length == 0
doc(el).remove()

else
trimmed = text.trim()
if trimmed[0] == "(" && trimmed[trimmed.length - 1] == ")"
Expand Down
9 changes: 9 additions & 0 deletions test/formatter.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,12 @@ suite 'Formatter', ->

formatter(origDoc, origDoc('body'), 'en')
eq origDoc("a").length, 0

test 'doesn\'t drop text nodes accidentally', ->
html = fs.readFileSync("./fixtures/test_wikipedia1.html").toString()
doc = cheerio.load(html)

formatter(doc, doc('body'), 'en')
html = doc.html()
# This text was getting dropped by the formatter
ok /is a thirteen episode anime series directed by Akitaro Daichi and written by Hideyuki Kurata/.test(html)

0 comments on commit 27e5f96

Please sign in to comment.