
Commit

Switched from jQuery to cheerio.
bcoe committed Jan 9, 2015
1 parent 799abae commit 7f86f65
Showing 5 changed files with 41 additions and 34 deletions.
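
The substance of this commit is the swap of server-side jQuery for cheerio in lib/wikifetch.js. As a minimal sketch of the API difference the new code relies on (the HTML snippet and variable names here are illustrative, not part of the repository): cheerio.load() parses an HTML string and hands back a jQuery-like selector function, so no browser or DOM shim is needed.

var cheerio = require('cheerio');

// Parse a fragment of HTML; $ behaves like the familiar jQuery selector.
var $ = cheerio.load('<h1 id="firstHeading">Dog</h1>');

// Same call style as the jQuery version it replaces.
console.log($('#firstHeading').text()); // => 'Dog'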
1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
.DS_Store
.npm_modules
node_modules
6 changes: 3 additions & 3 deletions bin/wikifetch.js
@@ -7,14 +7,14 @@ if (!argv.article) {
console.log('usage: wikifetch --article=[article_name]')
} else {
var article = argv.article;

console.log('Fetching article ' + argv.article + '.');

var articleObject = (new WikiFetch()).fetch(article, function(err, articleObject) {
if (err) {
console.log('Could not fetch article: ' + err);
} else {
fs.writeFileSync(article + '.json', JSON.stringify(articleObject));
}
});
}
}
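
Given the usage string above, invoking the script as wikifetch --article=Dog (or node bin/wikifetch.js --article=Dog before the bin entry is linked) should fetch the article and write Dog.json to the current working directory; the output file name simply follows the article argument.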
42 changes: 21 additions & 21 deletions lib/wikifetch.js
@@ -1,4 +1,4 @@
var jQuery = require('jquery'),
var cheerio = require('cheerio'),
request = require('request'),
sexy = require('sexy-args');

@@ -14,7 +14,7 @@ function WikiFetch(params) {

WikiFetch.prototype.fetch = function(articleName, callback) {
var _this = this;

this.loadArticle(articleName, function(err, article) {
if (err) {
callback(err, null);
@@ -29,21 +29,21 @@ WikiFetch.prototype.fetch = function(articleName, callback) {
};

WikiFetch.prototype.parseTitle = function(article, parsedArticle) {
parsedArticle.title = article.find('#firstHeading').text();
parsedArticle.title = article('#firstHeading').text();
};

WikiFetch.prototype.parseLinks = function(article, parsedArticle) {

parsedArticle.links = {};
article.find('#bodyContent p a').each(function() {
var element = jQuery(this),

article('#bodyContent p a').each(function() {
var element = cheerio(this),
href = element.attr('href'),
entityName = href.replace('/wiki/', '');

// Only extract article links.
if ( href.indexOf('/wiki/') < 0 ) return;

// Create or update the link lookup table.
if ( parsedArticle.links[entityName] ) {
parsedArticle.links[entityName].occurrences++;
@@ -54,40 +54,40 @@ WikiFetch.prototype.parseLinks = function(article, parsedArticle) {
text: element.text()
};
}

// Replace the element in the page with a reference to the link.
element.replaceWith('[[' + entityName + ']]');
});
};

WikiFetch.prototype.parseSections = function(article, parsedArticle) {
var currentHeadline = parsedArticle.title;

parsedArticle.sections = {};
article.find('#bodyContent p,h2,h3,img').each(function() {
var element = jQuery(this);

article('#bodyContent p,h2,h3,img').each(function() {
var element = cheerio(this);

// Load new headlines as we observe them.
if (element.is('h2') || element.is('h3')) {
currentHeadline = jQuery.trim( element.text() );
currentHeadline = element.text().trim();
return;
}

// Initialize the object for this section.
if (!parsedArticle.sections[currentHeadline]) {
parsedArticle.sections[currentHeadline] = {
text: '',
images: []
};
}

// Grab images from the section; don't grab spammy ones.
if (element.is('img') && element.attr('width') > 50) {
parsedArticle.sections[currentHeadline].images.push( element.attr('src').replace('//', 'http://') );
return;
}

parsedArticle.sections[currentHeadline].text += element.text();
});
};
@@ -101,8 +101,8 @@ WikiFetch.prototype.loadArticle = function(articleName, callback) {
callback(error, null);
return;
}
callback(null, jQuery(body));
callback(null, cheerio.load(body));
});
};

exports.WikiFetch = WikiFetch;
exports.WikiFetch = WikiFetch;
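
For reference, a small sketch of how the exported class is used after this change, modeled on the test file further down (error handling kept minimal; 'Dog' is simply the article the tests use):

var WikiFetch = require('./lib').WikiFetch;

new WikiFetch().fetch('Dog', function(err, articleObject) {
  if (err) return console.log('Could not fetch article: ' + err);

  // The parsed article exposes a title, a link lookup table, and
  // sections keyed by headline, each with text and images.
  console.log(articleObject.title);
  console.log(Object.keys(articleObject.links).length + ' links');
  console.log(Object.keys(articleObject.sections));
});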
16 changes: 11 additions & 5 deletions package.json
@@ -4,14 +4,17 @@
"lib": "./lib",
"bin": "./bin"
},
"scripts": {
"test": "micro-test"
},
"main": "./lib/index.js",
"bin": "./bin/wikifetch.js",
"version": "0.0.1",
"author": "Ben Coe <bencoe@gmail.com>",
"engines": [
"node"
],
"description": "Uses jQuery to return a structured JSON representation of a Wikipedia article.",
"description": "Uses cheerio to return a structured JSON representation of a Wikipedia article.",
"keywords": [
"crawler",
"twitter",
@@ -22,10 +25,13 @@
"url": "git://github.com/bcoe/wikifetch.git"
},
"dependencies": {
"request": ">=2.9.203",
"jquery": ">=1.6.3",
"sexy-args": ">=1.1.5",
"cheerio": "^0.18.0",
"micro-test": "1.0.0",
"optimist": "0.3.4",
"micro-test": "1.0.0"
"request": ">=2.9.203",
"sexy-args": ">=1.1.5"
},
"devDependencies": {
"micro-test": "^1.0.0"
}
}
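
With the scripts entry shown here, npm test should run micro-test against the suite below.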
10 changes: 5 additions & 5 deletions test/wikifetch-test.js
@@ -2,7 +2,7 @@ var equal = require('assert').equal,
WikiFetch = require('../lib').WikiFetch;

exports.tests = {
'title should be parsed from wikipedia article': function(finished, prefix) {
'title should be parsed from wikipedia article': function(finished, prefix) {
var wikiFetch = new WikiFetch();
wikiFetch.fetch('Dog', function(err, articleObject) {
equal('Dog', articleObject.title, prefix + ' title of article was not parsed.');
@@ -20,23 +20,23 @@ exports.tests = {
'text of article should be split into sections': function(finished, prefix) {
var wikiFetch = new WikiFetch();
wikiFetch.fetch('Dog', function(err, articleObject) {
equal(true, articleObject.sections['Dog'].text.indexOf('domesticated') > 0, prefix + ' sections not parsed.');
equal(true, articleObject.sections['Dog'].text.indexOf('domestic') > 0, prefix + ' sections not parsed.');
equal(true, articleObject.sections['Taxonomy'].text.indexOf('gray wolf') > 0, prefix + ' sections not parsed.');
finished();
});
},
'text contents of article should have links replaced': function(finished, prefix) {
var wikiFetch = new WikiFetch();
wikiFetch.fetch('Dog', function(err, articleObject) {
equal(true, articleObject.sections['Dog'].text.indexOf('[[Police_dog]]') > 0, prefix + ' links not replaced.');
equal(true, articleObject.sections['Dog'].text.indexOf('[[Domestication_of_the_dog]]') > 0, prefix + ' links not replaced.');
finished();
});
},
'images should be extracted from article sections': function(finished, prefix) {
var wikiFetch = new WikiFetch();
wikiFetch.fetch('Dog', function(err, articleObject) {
equal('http://upload.wikimedia.org/wikipedia/commons/thumb/2/26/YellowLabradorLooking_new.jpg/260px-YellowLabradorLooking_new.jpg', articleObject.sections['Dog'].images[0], prefix + ' article image not extracted.');
equal('http://upload.wikimedia.org/wikipedia/commons/thumb/a/a5/Cavalier_King_Charles_Spaniel_trio.jpg/220px-Cavalier_King_Charles_Spaniel_trio.jpg', articleObject.sections['Types and breeds'].images[0], prefix + ' article image not extracted.');
finished();
});
}
};
};
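
Note that these tests fetch the live 'Dog' article from Wikipedia through loadArticle, so running them requires network access, and the expected strings and image URLs depend on the article's current content.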
