Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
ZeyuChen committed Jun 15, 2017
0 parents commit f5e2e3a
Show file tree
Hide file tree
Showing 34 changed files with 2,675 additions and 0 deletions.
13 changes: 13 additions & 0 deletions .gitignore
@@ -0,0 +1,13 @@
*.DS_Store
build/

.vscode
.idea
.project
.cproject
.pydevproject
.settings/
.test_env/
third_party/

*~
14 changes: 14 additions & 0 deletions AUTHORS
@@ -0,0 +1,14 @@
# Names should be added to this file like so:
# Name or Organization <email address>

Baidu.com, Inc.

# Initial version authors:
Jiang Di <jiangdi@baidu.com>
Chen Zeyu <chenzeyu01@baidu.com>
Jiang Jiajun <jiangjiajun@baidu.com>
Lian Rongzhong <lianrongzhong@baidu.com>
Li Chen <lichen06@baidu.com>
Bao Siqi <baosiqi@baidu.com>

# Partial list of contributors:
27 changes: 27 additions & 0 deletions LICENSE
@@ -0,0 +1,27 @@
Copyright (c) 2017, Baidu, Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

* Neither the name of the Baidu, Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
76 changes: 76 additions & 0 deletions Makefile
@@ -0,0 +1,76 @@
# Optional user configuration: `make config=<file>` includes <file> first, so
# it can pre-set DEPS_PATH and PROTOC before the defaults below are applied.
ifdef config
include $(config)
endif

# Install prefix of the third-party dependencies (protobuf, gflags, glog).
# $(CURDIR) is make's built-in absolute working directory; unlike
# $(shell pwd) in a recursive variable it does not fork a shell each time
# the variable is expanded.
ifndef DEPS_PATH
DEPS_PATH = $(CURDIR)/third_party
endif

# Protocol Buffers compiler; defaults to the copy installed under DEPS_PATH.
ifndef PROTOC
PROTOC = ${DEPS_PATH}/bin/protoc
endif

# C++ compiler and the flags used for every translation unit and link step.
CXX=g++
CXXFLAGS=-pipe \
  -W \
  -Wall \
  -fPIC \
  -std=c++11 \
  -fno-omit-frame-pointer \
  -fpermissive \
  -O3 \
  -ffast-math \
  -funroll-all-loops

# Header search path. The third-party headers are taken from DEPS_PATH so
# that overriding DEPS_PATH moves headers and libraries (see LDFLAGS_SO)
# together instead of leaving the include path pinned to ./third_party.
INCPATH=-I./include/ \
  -I./include/familia \
  -I$(DEPS_PATH)/include

# Libraries linked into each demo: the local static archive plus the
# third-party libraries installed under DEPS_PATH.
LDFLAGS_SO = -L$(DEPS_PATH)/lib -L./build/ -lfamilia -lprotobuf -lglog -lgflags

# Demo executables produced by the default goal.
DEMOS = inference_demo semantic_matching_demo word_distance_demo topic_word_demo

# Default goal: build the static library and every demo executable.
.PHONY: all
all: familia $(DEMOS)

# Link one demo from its object file and the static library. Each demo is a
# real file target, so it is relinked only when its object or the library
# changes, and the four links can run in parallel under `make -j`.
# -Xlinker "-(" ... "-)" wraps the libraries in a linker group so that
# mutual references between the archives are resolved.
$(DEMOS): %: build/demo/%.o build/libfamilia.a
	$(CXX) $(CXXFLAGS) $(INCPATH) $< -Xlinker "-(" $(LDFLAGS_SO) -Xlinker "-)" -o $@

# Rules that download and install the third-party dependencies.
include depends.mk

# Delete the four demo executables and the build tree, then sweep any protoc
# outputs (*.pb.h / *.pb.cc and similar) still lying under src/.
.PHONY: clean
clean:
	rm -rf inference_demo semantic_matching_demo word_distance_demo topic_word_demo build
	find src -name "*.pb.[ch]*" -delete

# Third-party dependencies. Each prerequisite is a header file installed by
# the matching rule in depends.mk, so a package whose header already exists
# is not rebuilt.
# gflags is listed before glog because glog is configured with
# --with-gflags=$(DEPS_PATH) and a serial make builds prerequisites in order.
# Declared phony so that a file or directory named `deps` cannot mask it.
.PHONY: deps
deps: ${GFLAGS} ${GLOGS} ${PROTOBUF}
	@echo "dependency installed!"

# `familia` is a convenience alias for the static library.
.PHONY: familia
familia: build/libfamilia.a

# Files generated from proto/config.proto by the protoc rule at the bottom of
# this section. The header lives in include/familia/, which is where the
# recipe has always moved it.
PROTO_HDR = include/familia/config.pb.h
PROTO_SRC = src/config.cpp

# Objects archived into libfamilia.a. The demo objects are part of this list.
# NOTE(review): each demo object presumably defines main(); confirm that
# archiving them alongside the library objects is intended.
OBJS = $(addprefix build/, vose_alias.o inference_engine.o model.o vocab.o document.o sampler.o config.o util.o semantic_matching.o tokenizer.o \
  demo/inference_demo.o \
  demo/semantic_matching_demo.o \
  demo/word_distance_demo.o \
  demo/topic_word_demo.o)

# $? holds only the objects newer than the archive, so `ar` replaces just the
# members that changed (or adds all of them when the archive does not exist).
build/libfamilia.a: $(OBJS)
	@echo Target $@;
	ar crv $@ $(filter %.o, $?)

# The generated header must exist before any object is compiled, otherwise a
# parallel build can start compiling sources that include it too early.
# Order-only: the header's timestamp alone never forces a recompile; real
# header dependencies come from the .d files included below.
$(OBJS): | $(PROTO_HDR)

# Naming the generated source explicitly keeps make from treating it as an
# intermediate file and deleting it after build/config.o has been compiled.
build/config.o: $(PROTO_SRC)

# Compile one source file. -MMD -MP writes build/<stem>.d next to the object
# as a side effect of compilation; -MP adds a phony target per header so a
# deleted header does not break the next build.
build/%.o: src/%.cpp
	@mkdir -p $(@D)
	$(CXX) $(INCPATH) $(CXXFLAGS) -MMD -MP -MF build/$*.d -c $< -o $@

# build proto
# A pattern rule with several targets tells make that ONE run of the recipe
# produces both files, so protoc is not started twice under `make -j`.
include/familia/%.pb.h src/%.cpp: proto/%.proto
	@mkdir -p include/familia
	$(PROTOC) --cpp_out=./src --proto_path=./proto $<
	mv src/$*.pb.h include/familia/$*.pb.h
	mv src/$*.pb.cc src/$*.cpp

# Pull in the compiler-generated header dependencies; the leading `-` keeps
# the first build (no .d files yet) silent.
-include $(OBJS:.o=.d)
114 changes: 114 additions & 0 deletions README.md
@@ -0,0 +1,114 @@
# Familia

# 代码编译
第三方依赖包括gflags、glog、protobuf,要求编译器支持C++11,如 g++ >= 4.8。
默认情况下会自动获取依赖并安装。

git clone ssh://g@gitlab.baidu.com:8022/chenzeyu01/familia.git
sh build.sh # 包含了拉取并安装第三方依赖

# 模型下载

cd model
sh download_model.sh

# 注意事项
* 若出现找不到libglog.so、libgflags.so等动态库的错误,请将third_party/lib添加至环境变量LD_LIBRARY_PATH中。


export LD_LIBRARY_PATH=./third_party/lib:$LD_LIBRARY_PATH


# 运行DEMO
## 文档主题推断

sh run_inference_demo.sh # 运行文档主题推断的demo

执行程序后,通过标准流方式输入文档,每行为一个文档,程序会返回每个文档的主题分布。如下所示

请输入需要推断主题分布的文档:
百度又一次展示了自动驾驶领域领导者的大气风范,发布了一项名为“Apollo(阿波罗)”的新计划,向汽车行业及自动驾驶领域的合作伙伴提供一个开放、完整、安全的软件平台,帮助他们结合车辆和硬件系统,快速搭建一套属于自己的完整的自动驾驶系统。

文档主题分布:
9159:0.103704 4296:0.072840 7486:0.058025 1378:0.037037 1073:0.037037 2414:0.037037 3935:0.034568 5921:0.032099 7380:0.032099 8643:0.032099 4757:0.030864 6808:0.025926 7185:0.022222 4091:0.019753 1167:0.017284 8843:0.016049 5292:0.014815 2507:0.014815 9914:0.013580 2520:0.011111 7658:0.011111 249:0.011111 2017:0.009877 2995:0.008642 4021:0.008642 7163:0.008642 9336:0.007407 1438:0.007407 136:0.007407 7095:0.007407 2313:0.007407 4309:0.007407 1314:0.006173 3573:0.006173 9529:0.006173 477:0.004938 6446:0.004938 281:0.004938 4072:0.004938 9082:0.004938 847:0.004938 27:0.004938 5872:0.004938 2720:0.004938 1322:0.004938 8848:0.003704 7765:0.003704 7838:0.003704 7891:0.003704 7918:0.003704 1592:0.003704 7107:0.003704 1766:0.003704 1812:0.003704 6726:0.003704 6513:0.003704 5660:0.003704 8996:0.003704 1434:0.003704 3407:0.003704 2285:0.003704 500:0.003704 3615:0.003704 3766:0.003704 4704:0.002469 1449:0.002469 9599:0.002469 7779:0.002469 2565:0.002469 7425:0.002469 1665:0.002469 9473:0.002469 9395:0.002469 872:0.002469 8411:0.002469 8606:0.002469 4490:0.002469 8722:0.002469 386:0.002469 4817:0.002469 8826:0.002469 1219:0.002469 75:0.002469 8859:0.002469 7716:0.001235 9280:0.001235 1399:0.001235 9304:0.001235 1:0.001235 9536:0.001235 8099:0.001235 8266:0.001235 1175:0.001235 91:0.001235 5809:0.001235 3087:0.001235 3265:0.001235 3752:0.001235 3832:0.001235 3908:0.001235 2515:0.001235 1046:0.001235 804:0.001235 1953:0.001235 5263:0.001235 428:0.001235 5514:0.001235 5624:0.001235 7696:0.001235 5826:0.001235 5906:0.001235 6196:0.001235 6240:0.001235 6378:0.001235 1896:0.001235 6875:0.001235 6917:0.001235 244:0.001235 7469:0.001235 1516:0.001235 7488:0.001235

其中,冒号前为主题ID,冒号后为该主题的概率,按照主题概率从大到小的方式排序。
可通过更改脚本中--work_dir和--conf_file的配置选择其他模型,如

--work_dir="./model/webpage/" --conf_file="lda.conf" # 选用网页LDA主题模型
--work_dir="./model/webpage/" --conf_file="slda.conf" # 选用网页SentenceLDA主题模型

## 语义匹配计算

sh run_semantic_matching_demo.sh # 运行语义匹配计算的demo

默认为计算短文本与长文本语义匹配模式,运行结果如下所示

请输入短文本:
百度宣布阿波罗计划 开放自动驾驶技术有望改变汽车产业
请输入长文本:
百度又一次展示了自动驾驶领域领导者的大气风范,发布了一项名为“Apollo(阿波罗)”的新计划,向汽车行业及自动驾驶领域的合作伙伴提供一个开放、完整、安全的软件平台,帮助他们结合车辆和硬件系统,快速搭建一套属于自己的完整的自动驾驶系统。
LDA sim = 0.0133234 TWE sim = 0.128288

将脚本中的--mode参数修改为1,则为长文本语义相似度模式, 运行结果如下所示

请输入文档1:
在人工智能发展得最为系统化的硅谷,AI工程师们的薪水远高于其他领域的同行。随着人工智能概念的不断深入人心,人工智能的人才愈发的紧俏,时至今日,大学刚毕业的博士也能坐拥八九十万的年薪,与资深的硅谷工程师相媲美。
请输入文档2:
在国内,部分企业早已瞄准人才的短板,走在了业界的前面。百度是最早进行AI的人才培养布局的,他们同国内诸多高校开展合作,共建工程实验室,在数据开放和资源共享上进行各种合作。这种方式类似美国在人工智能教育领域推行的“硅谷-斯坦福”校企联动模式,一方面斯坦福大学为硅谷提供了人才和科研成果,另一方面硅谷为斯坦福大学提供资金支持和大数据,以助力他们的科研能有更大的突破。
Jensen Shannon Divergence = 1.13397
Hellinger Distance = 0.889616

## 邻近词查询

sh run_word_distance_demo.sh # 运行邻近词查询的demo

执行程序后,通过标准流方式输入词,每行为一个词,程序会返回每个词的最邻近的K个词。如下所示

请输入词语: 篮球
Word Cosine distance
--------------------------------
足球 0.903682
网球 0.842661
羽毛球 0.836915
足球比赛 0.809366
五人制足球 0.799211
美式足球 0.791207
中国足球 0.788995
乒乓球 0.788278
五人制 0.784913
足球新闻 0.783203

其中,每一行为一个词,数字表示该词与输入词的cosine相似度,按照从大到小的顺序排序。可通过更改脚本中--work_dir和--conf_file的配置选择其他模型,--top_k配置展现词的个数,如

--work_dir="./model/webpage/" --conf_file="lda.conf" --top_k=10 # 选用网页LDA主题模型,展现距离最近的前10个词
--work_dir="./model/webpage/" --conf_file="slda.conf" --top_k=20 # 选用网页SentenceLDA主题模型,展现距离最近的前20个词

## 主题词查询
在TWE模型中,通过计算主题向量与词向量的cosine相似度可以衡量主题与每个词的相关性,从而得到每个主题下最邻近的K个词。同理,在LDA模型中,也可以得到每个主题下每个词的产生概率。主题词查询demo展示这两个模型的主题词结果。

sh run_topic_word_demo.sh # 运行主题词查询的demo

执行程序后,通过标准流方式输入主题id,每行为一个id,程序会返回每个主题在TWE跟主题模型下最邻近的K个词的结果。如下所示

请输入主题编号(0-10000): 105
TWE result LDA result
------------------------------------
卫生检疫 国家
检验检疫 出入境
上海口岸 外籍
外经贸部 检验检疫
正式批准 检验检疫局
认监委 国外
卫生注册证书 互认
检验检疫局 要闻
资格认可 奖励旅游
许可制度 公布

其中,每一行有两个词,第一个词为TWE召回结果,第二个词为主题模型召回结果,按照相关性从大到小的顺序排序。可通过更改脚本中--work_dir和--emb_file的配置选择其他TWE模型,--topic_words_file配置主题模型的主题结果,如

--work_dir="./model/webpage/" --emb_file="webpage_twe_lda.model" --topic_words_file="topic_words.lda.txt" # 选用网页LDA主题模型训练得到TWE模型以及对应的主题展现结果

# 注意事项
* 代码中内置简易的FMM分词工具,只针对主题模型中出现的词表进行正向匹配。该工具仅用于Demo示例使用,若对分词和语义准确度有更高要求,建议使用开源的分词工具, 并使用自定义词表的功能导入主题模型中的词表。

33 changes: 33 additions & 0 deletions benchmark.sh
@@ -0,0 +1,33 @@
# Benchmark script: fetch the news_t1000 example model on first use, run the
# inference binary over a sample input, and score the result with jsd.py.
export LD_LIBRARY_PATH=./third_party/lib:$LD_LIBRARY_PATH

if [ -d news_t1000 ]; then
    echo "model file downloaded already"
else
    rm -rf news_t1000
    mkdir news_t1000 || exit 1

    echo "get example model..."
    # Download inside a subshell so the working directory of the script never
    # changes. On failure the partial directory is removed; otherwise the
    # next run would report the model as already downloaded.
    # The URL is quoted so the trailing * reaches wget instead of the shell.
    if ! (cd news_t1000 && wget "ftp://nj03-rp-m22nlp062.nj03.baidu.com/home/disk0/chenzeyu01/public/infer_data/news_t1000/*"); then
        echo "failed to download example model" >&2
        rm -rf news_t1000
        exit 1
    fi

    rm -rf input.sample

    echo "get input data..."
    # NOTE(review): this saves ./input.sample, but the run below reads
    # example/input.sample -- confirm which file is intended. A failed
    # download is therefore only reported, not treated as fatal.
    wget ftp://nj03-rp-m22nlp062.nj03.baidu.com/home/disk0/chenzeyu01/public/infer_data/input.sample ||
        echo "warning: failed to download input.sample" >&2
fi

echo "running infer program..."

# ./lda-infer model_path, lda-infer.conf, #burn_in_iter #total_iter
# cat input.merge.sample | ./lda-infer ./news_t1000 lda_infer.conf > infer.result
#cat example/example.txt | ./inference_demo --work_dir="./news_t1000" --conf_file="model.conf" > infer.result
# NOTE(review): no rule visible in the Makefile produces ./test -- confirm
# where this binary comes from.
cat example/input.sample | ./test --work_dir="./news_t1000" --conf_file="model.conf" > infer.result
#head input.sample | ./lda-infer ./news_t1000 lda_infer.conf > infer.result

echo "infer result store in infer.result"
python scripts/jsd.py infer.result news_t1000/doc_topic.txt 1000
#python tools/jsd.py infer.slda.mh.result infer.slda.gs.result 1000
#python tools/jsd.py infer.result infer.slda.gs.result 1000
echo "new alias_table jsd / new random / fix_random_seed = 0.135918819452 / 0.146278813358 / 0.135296716886"
echo "original time cost 4.22496 sec"
3 changes: 3 additions & 0 deletions build.sh
@@ -0,0 +1,3 @@
# Install the third-party dependencies, then rebuild everything from scratch.
# The steps are chained with && so that a failure (for example a dependency
# that could not be downloaded) stops the script and becomes its exit status,
# instead of continuing into a build that cannot succeed.
mkdir -p third_party &&
make deps &&
make clean &&
make -j4
38 changes: 38 additions & 0 deletions depends.mk
@@ -0,0 +1,38 @@
# Install dependencies

# Base URL of the dependency tarballs. It has no trailing slash because every
# recipe appends "/$(FILE)"; a trailing slash produced "//" in the final URL.
# https is used directly rather than relying on an http redirect.
URL=https://raw.githubusercontent.com/ZeyuChen/third_party/master/package

# Download command; override with `make WGET=...`.
# NOTE(review): --no-check-certificate disables TLS certificate verification
# and the tarballs are not checksummed -- consider tightening this.
ifndef WGET
WGET = wget --no-check-certificate
endif

# protobuf 2.5.0, configured with --disable-shared and compiled with -fPIC.
# The installed message.h is the rule's target, so the package is skipped
# once that header exists under DEPS_PATH.
PROTOBUF = ${DEPS_PATH}/include/google/protobuf/message.h
PROTOBUF_DIR = protobuf-2.5.0
PROTOBUF_FILE = $(PROTOBUF_DIR).tar.gz

${PROTOBUF}:
	rm -rf $(PROTOBUF_FILE) $(PROTOBUF_DIR)
	$(WGET) $(URL)/$(PROTOBUF_FILE) && tar -zxf $(PROTOBUF_FILE)
	cd $(PROTOBUF_DIR) && export CFLAGS=-fPIC && export CXXFLAGS=-fPIC && ./configure --disable-shared -prefix=$(DEPS_PATH) && $(MAKE) && $(MAKE) install
	rm -rf $(PROTOBUF_FILE) $(PROTOBUF_DIR)

# Short alias: `make protobuf` installs the package if its header is missing.
protobuf: | ${PROTOBUF}

# gflags 2.0, compiled with -fPIC and installed under DEPS_PATH.
# The installed gflags.h is the rule's target, so the package is skipped once
# that header exists.
GFLAGS = ${DEPS_PATH}/include/google/gflags.h
GFLAGS_DIR = gflags-2.0
GFLAGS_FILE = gflags-2.0-no-svn-files.tar.gz

${GFLAGS}:
	rm -rf $(GFLAGS_FILE) $(GFLAGS_DIR)
	$(WGET) $(URL)/$(GFLAGS_FILE) && tar -zxf $(GFLAGS_FILE)
	cd $(GFLAGS_DIR) && export CFLAGS=-fPIC && export CXXFLAGS=-fPIC && ./configure -prefix=$(DEPS_PATH) && $(MAKE) && $(MAKE) install
	rm -rf $(GFLAGS_FILE) $(GFLAGS_DIR)

# Short alias: `make gflags` installs the package if its header is missing.
gflags: | ${GFLAGS}

# glog 0.3.3, compiled with -fPIC and installed under DEPS_PATH.
# The installed logging.h is the rule's target, so the package is skipped
# once that header exists.
GLOGS = ${DEPS_PATH}/include/glog/logging.h
GLOG_DIR = glog-0.3.3
GLOG_FILE = $(GLOG_DIR).tar.gz

# configure is pointed at gflags via --with-gflags=$(DEPS_PATH), so gflags
# has to be installed first. The order-only prerequisite enforces that even
# under `make -j` without rebuilding glog whenever the gflags header changes.
${GLOGS}: | ${GFLAGS}
	rm -rf $(GLOG_FILE) $(GLOG_DIR)
	$(WGET) $(URL)/$(GLOG_FILE) && tar -zxf $(GLOG_FILE)
	cd $(GLOG_DIR) && export CFLAGS=-fPIC && export CXXFLAGS=-fPIC && ./configure -prefix=$(DEPS_PATH) --with-gflags=$(DEPS_PATH) && $(MAKE) && $(MAKE) install
	rm -rf $(GLOG_FILE) $(GLOG_DIR)

# Short alias: `make glog` installs the package if its header is missing.
.PHONY: glog
glog: | ${GLOGS}

0 comments on commit f5e2e3a

Please sign in to comment.