Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fix #8 - text getting dropped in wikipedia articles #9

Merged
merged 1 commit into from
Jul 22, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,069 changes: 1,069 additions & 0 deletions fixtures/test_github1.html

Large diffs are not rendered by default.

635 changes: 635 additions & 0 deletions fixtures/test_wikipedia1.html

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lib/extractor.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

36 changes: 29 additions & 7 deletions lib/formatter.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@
"dependencies": {
"cheerio": "~0.16.0",
"optimist": "~0.6.1",
"lodash": "~2.4.1"
"lodash": "~2.4.1",
"xregexp": "~2.0.0"
},
"devDependencies": {
"coffee-script-redux": "2.0.0-beta7",
Expand Down
3 changes: 2 additions & 1 deletion src/extractor.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,8 @@ postCleanup = (doc, targetNode, lang) ->
node.children().each () ->
e = doc(this)
eTag = e[0].name
if eTag != 'p'
if eTag not in ['p', 'a']
if isHighlinkDensity(doc, e) || isTableAndNoParaExist(doc, e) || !isNodescoreThresholdMet(doc, node, e)
doc(e).remove()

return node
43 changes: 38 additions & 5 deletions src/formatter.coffee
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
stopwords = require("./stopwords")
_ = require("lodash")
{XRegExp} = require('xregexp')

module.exports = formatter = (doc, topNode, language) ->
removeNegativescoresNodes(doc, topNode)
Expand All @@ -20,22 +21,55 @@ replaceWithText = (doc, topNode) ->
nodes.each () ->
doc(this).replaceWith(doc(this).text())

# Normalize a raw paragraph fragment: strip leading/trailing whitespace and
# collapse internal runs of spaces/tabs into a single space.
#
# rawText - String of raw text pulled from a text node.
#
# Returns the cleaned String. Line breaks are deliberately preserved so the
# caller can still split paragraphs with txt.split(/\r?\n/).
cleanParagraphText = (rawText) ->
  txt = rawText.trim()
  # NOTE: String::replace returns a new string — the original code discarded
  # this result, so the collapsing never took effect. Also use [ \t] rather
  # than [\s\t] (\s already includes \t AND newlines; matching \n here would
  # defeat the downstream newline split).
  txt.replace(/[ \t]+/g, ' ')

# Turn an html element (and children) into nicely formatted text
convertToText = (doc, topNode) ->
txts = []
nodes = topNode.children()
nodes = topNode.contents()

# To hold any text fragments that end up in text nodes outside of
# html elements
hangingText = ""

nodes.each () ->
node = doc(this)
nodeType = node[0].type

# Handle top level text nodes by adding them to a running list
# and then treating all the hanging nodes as one paragraph tag
if nodeType == "text"
hangingText += node.text()
# Same as 'continue'
return true

# If we hit a real node and still have extra accumulated text,
# pop it out as if it was a paragraph tag
if hangingText.length > 0
txt = cleanParagraphText(hangingText)
txts = txts.concat(txt.split(/\r?\n/))
hangingText = ""

txt = cleanParagraphText(node.text())
txts = txts.concat(txt.split(/\r?\n/))

txt = node.text().trim()
txt.replace(/[\s\t]+/g, ' ')
# Catch any left-over hanging text nodes
if hangingText.length > 0
txt = cleanParagraphText(hangingText)
txts = txts.concat(txt.split(/\r?\n/))

txts = _.map txts, (txt) ->
txt.trim()

# Make sure each text chunk includes at least one text character or number.
# This supports words from multiple languages by using XRegExp to generate
# a regex that matches ranges of Unicode characters used in words.
regex = XRegExp('[\\p{Number}\\p{Letter}]')
txts = _.filter txts, (txt) ->
(/[a-zA-Z0-9]/.test(txt))
regex.test(txt)

txts.join('\n\n')

Expand Down Expand Up @@ -69,7 +103,6 @@ removeFewwordsParagraphs = (doc, topNode, language) ->
stopWords = stopwords(text, language)
if (tag != 'br' || text != '\\r') && stopWords.stopwordCount < 3 && el.find("object").length == 0 && el.find("embed").length == 0
doc(el).remove()

else
trimmed = text.trim()
if trimmed[0] == "(" && trimmed[trimmed.length - 1] == ")"
Expand Down
9 changes: 9 additions & 0 deletions test/formatter.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,12 @@ suite 'Formatter', ->

formatter(origDoc, origDoc('body'), 'en')
eq origDoc("a").length, 0

test 'doesn\'t drop text nodes accidentally', ->
html = fs.readFileSync("./fixtures/test_wikipedia1.html").toString()
doc = cheerio.load(html)

formatter(doc, doc('body'), 'en')
html = doc.html()
# This text was getting dropped by the formatter
ok /is a thirteen episode anime series directed by Akitaro Daichi and written by Hideyuki Kurata/.test(html)