arex

node.js实现自动提取文章正文，标题，发布日期。自动生成文章摘要.

Node版本

建议v6.17.0

Http 服务

运行

node server.js

客户端链接

curl -X POST -d '{"url":"https://export.shobserver.com/baijiahao/html/411796.html","size":100,"smooth":false}' http://localhost:3824|jq -r .

#安装

npm install arex

#使用例子:

var arex = require('arex');
//example 1, 给定网址自动抓取，提取正文，生成摘要
arex.get_article('http://finance.sina.com.cn/consume/puguangtai/2016-03-15/doc-ifxqhmve9227502.shtml',120,(err,result)=>{
                //120: 摘要长度为120，如果不需要生成摘要此参数传入false.
		//result: {"title":"...","content":"....", "summary":"...", "pubdate":"..."}
		console.log(result['content']);
});

//example 2, 给html内容，提取正文，生成摘要
result = arex.get_article_sync('<html.........</html>',120);//result: {"title":"...","content":"....", "summary":"...", "pubdate":"..."}

//example 3, 给html内容，生成摘要
//summarize(content, exptd_len=120, shingle=false, min=150, max=350, filter=[], title)
//shingle的意义: 以摘要长度的句子组合为单位计算权重，shingle为false则以自然句为单位计算权重, filter是过滤规则，符合规则的段落都会被过滤不作为摘要
var summary = arex.summarize('<html>.......</html>', 120, true);
var summary = arex.summarize('<html>.......</html>', 0.04, true, 100, 300);//摘要长度比例 4%, 最短 100, 最长 300

#测试

##获取源码

git clone https://github.com/ahkimkoo/arex.git

##测试某个网页的抽取

cd arex
npm install
node test/test.js http://finance.sina.com.cn/consume/puguangtai/2016-03-15/doc-ifxqhmve9227502.shtml 120

120表示期望文摘的长度

##算法说明

正文抽取: 基于行块密度分布来抽取正文，每个行块由若干自然段落组成。
标题抽取: 分别从正文附近抽取h1标签，从title标签取值，取最可能是标题的那一个。
发布日期抽取: 用正则表达式抽取正文附近的日期。（有误差）。
自动文摘: sentence rank算法，参照pagerank算法的实现，可以指定期望的文摘长度。优化点：加入了神经网络模型判断一句话是否适合作为摘要。

arex

node.js article extractor, automatic summarization.

#Install

npm install arex

#Usage:

var arex = require('arex');
//example 1
arex.get_article('http://finance.sina.com.cn/consume/puguangtai/2016-03-15/doc-ifxqhmve9227502.shtml',120,(err,result)=>{
                //120: expected summary length; if you do not need a summary, set it to false.
		//result: {"title":"...","content":"....", "summary":"...", "pubdate":"..."}
		console.log(result['content']);
});

//example 2
result = arex.get_article_sync('<html.........</html>',120);//result: {"title":"...","content":"....", "summary":"...", "pubdate":"..."}

//example 3
//summarize(content, exptd_len=120, shingle=false, min=150, max=350, filter=[], title)
var summary = arex.summarize('<html>.......</html>', 120, true);
var summary = arex.summarize('<html>.......</html>', 0.04, true, 100, 300);//summary ratio 4%, min length 100, max length 300

#Test

##get source

git clone https://github.com/ahkimkoo/arex.git

##test link

cd arex
npm install
node test/test.js http://finance.sina.com.cn/consume/puguangtai/2016-03-15/doc-ifxqhmve9227502.shtml 120

##About algorithm

article extractor: based on the density of article blocks; a block consists of a number of natural lines.
title extractor: h1 tag or title tag, choosing the best one.
pubdate extractor: regex extraction near the beginning of the article.
summarizer: based on sentence rank, similar to PageRank. Optimization: a neural network model determines whether a sentence is suitable as a summary.

Name		Name	Last commit message	Last commit date
Latest commit History 41 Commits
lib		lib
test		test
.gitignore		.gitignore
Dockerfile		Dockerfile
LICENSE		LICENSE
README.md		README.md
package.json		package.json
server.js		server.js

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

lib

lib

test

test

.gitignore

.gitignore

Dockerfile

Dockerfile

LICENSE

LICENSE

README.md

README.md

package.json

package.json

server.js

server.js

Repository files navigation

arex

Node版本

Http 服务

arex

About

Releases

Packages

Languages

License

ahkimkoo/arex

Folders and files

Latest commit

History

Repository files navigation

arex

Node版本

Http 服务

arex

About

Resources

License

Stars

Watchers

Forks

Languages