Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

PDFファイルをgrep対象にする #94

Merged
merged 5 commits into from Apr 18, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -8,6 +8,7 @@ www-data/*.jp
www-data/wget-log*
*.jp
grep_*
!grep_test.sh
*.swp
*.tmp
*.log
Expand Down
3 changes: 2 additions & 1 deletion Dockerfile
Expand Up @@ -11,6 +11,7 @@ RUN apt-get update && \
fcgiwrap \
squid \
redis-tools \
poppler-utils \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
Expand All @@ -27,4 +28,4 @@ COPY docker-entrypoint.sh /usr/local/sbin/docker-entrypoint.sh
ENTRYPOINT [ "docker-entrypoint.sh" ]

COPY docker/crawler/bin/redis-cli /usr/local/sbin/redis-cli
EXPOSE 80
EXPOSE 80
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -26,7 +26,7 @@ cp .wgetrc ~/
```

## Setup for macOS
- `brew install wget jq nginx fcgiwrap squid`
- `brew install wget jq nginx fcgiwrap squid poppler`

### Install GNU xargs in macOS

Expand Down
86 changes: 61 additions & 25 deletions crawler/grep.sh
Expand Up @@ -6,11 +6,11 @@ set -e
### ./www-dataに収集した全サイトから新型コロナウイルスに関連するHTMLファイルの一覧を取得するスクリプト
###
### Usage
### ./grep.sh
### ./crawler/grep.sh ${DATA_PATH}
###

# 配列初期化
words=`cat <<EOM
export WORDS=`cat <<EOM
助成
補助
給付
Expand Down Expand Up @@ -42,30 +42,66 @@ words=`cat <<EOM
EOM
`

# Shared intermediate file: collects every コロナ grep hit (HTML and PDF text);
# export_keyword_files later splits its contents per keyword.
export INTERMEDIATE_FILE_PATH="./tmp/grep_コロナ.txt.tmp"

rm -f www-data/index.html
remove_exist_index() {
    # Drop any index page left over from a previous run.
    # -f: succeed silently when the file does not exist.
    rm -f -- www-data/index.html
}

set +e
# www-data内の全HTMLファイルをコロナでgrepして中間ファイルに出力
grep -r コロナ --include="*.html" ./www-data |\
# 長過ぎる行は無視
sed '/^.\{1,200\}$/!d' |\
# 半角スペース除去
sed 's/ //g' |\
# 全角スペース除去
sed 's/ //g' |\
# タブ除去
sed 's/[ \t]*//g' |\
# HTMLタグ除去
sed -e 's/<[^>]*>//g' >\
./tmp/grep_コロナ.txt.tmp
set -e
init_intermediate_file() {
    # Create (or truncate) the shared intermediate file so each run starts
    # from a clean slate. The original `echo "" >` seeded a stray blank line
    # into the file; `:` truncates without writing anything. Also make sure
    # the parent directory (./tmp) exists before writing into it.
    mkdir -p -- "$(dirname -- "$INTERMEDIATE_FILE_PATH")"
    : > "$INTERMEDIATE_FILE_PATH"
}

sanitize_grep_result() {
    # Normalize raw grep output read from stdin:
    #   * drop lines not 1..200 characters long (over-long junk, empty lines)
    #   * strip ASCII spaces, full-width (U+3000) spaces, and tabs
    #   * strip HTML tags
    # One sed process with several -e scripts applies them in order per line,
    # exactly like a pipeline of separate sed invocations would.
    sed -e '/^.\{1,200\}$/!d' \
        -e 's/ //g' \
        -e 's/　//g' \
        -e 's/[ \t]*//g' \
        -e 's/<[^>]*>//g'
}

set +e
for word in ${words}; do
echo $word
# 中間ファイルを各キーワードでgrepして結果を出力
grep $word ./tmp/grep_コロナ.txt.tmp > ./tmp/grep_コロナ_$word.txt.tmp
done
set -e
export_corona_files() {
    # Grep every HTML file under ./www-data for コロナ and append the
    # sanitized hits to the intermediate file.
    # grep exits non-zero when nothing matches; tolerate that under `set -e`.
    set +e
    grep -r コロナ --include="*.html" ./www-data |
        sanitize_grep_result >> "$INTERMEDIATE_FILE_PATH"
    set -e
}

export_pdf_corona_files() {
    # Convert every PDF under ./www-data to text (foo.pdf -> foo.pdf.txt),
    # then grep the generated text files for コロナ and append the sanitized
    # hits to the intermediate file, mapping names back to the .pdf source.
    # -print0 / -0 keeps filenames containing whitespace intact; with -I,
    # xargs does not run pdftotext at all when no PDF is found.
    find ./www-data/ -name '*.pdf' -print0 | xargs -0 -n1 -I@ pdftotext @ @.txt
    # grep exits non-zero when nothing matches; tolerate that under `set -e`.
    set +e
    grep -r コロナ --include="*.pdf.txt" ./www-data |
        sanitize_grep_result |
        sed 's/\.pdf\.txt:/\.pdf:/' >> "$INTERMEDIATE_FILE_PATH"
    set -e
}

export_keyword_files() {
    # For each keyword in WORDS, grep the intermediate file and write the
    # matching lines to ./tmp/grep_コロナ_<keyword>.txt.tmp.
    # NOTE: ${WORDS} is intentionally left unquoted so it word-splits into
    # the individual keywords (one per line in the heredoc above).
    set +e
    for word in ${WORDS}; do
        echo "$word"
        # grep exits 1 when a keyword has no hits; `set +e` keeps the loop going.
        # `--` protects against a keyword ever starting with a dash.
        grep -- "$word" "$INTERMEDIATE_FILE_PATH" > "./tmp/grep_コロナ_${word}.txt.tmp"
    done
    set -e
}

main() {
    # Entry point: rebuild the keyword grep outputs from scratch.
    # Order matters: the intermediate file must be (re)initialized before the
    # HTML and PDF collectors append to it, and keyword splitting reads it last.
    remove_exist_index
    init_intermediate_file
    export_corona_files
    export_pdf_corona_files
    export_keyword_files
}


# Run main only when this file is executed directly; when sourced (e.g. by
# test/grep_test.sh) only the function definitions are loaded.
# "$@" (quoted) forwards each argument as its own word; the original bare $@
# would have word-split and glob-expanded arguments containing spaces.
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
    main "$@"
fi
44 changes: 44 additions & 0 deletions test/grep_test.sh
@@ -0,0 +1,44 @@
#!/bin/bash
set -e

. ./crawler/grep.sh
takano32 marked this conversation as resolved.
Show resolved Hide resolved

# test sanitize_grep_result
## remove whitespace chars (ASCII and full-width spaces must be stripped)
input="foo.html:あいうえお かきくけこ さしすせそ たちつてと"
# Quote every expansion and use printf: unquoted `echo $input` word-splits
# the value and mangles whitespace before it ever reaches the function.
result=$(printf '%s\n' "$input" | sanitize_grep_result)
echo "$result"
expect="foo.html:あいうえおかきくけこさしすせそたちつてと"
echo "$expect"
if [ "$result" = "$expect" ]; then
  echo "passed"
else
  echo "failed"
  exit 1
fi

## remove too long line
input="foo.html:$(seq 100 | xargs)"
result=$(echo $input | sanitize_grep_result)
echo $result
expect=""
echo $expect
if [ "$result" = "$expect" ]; then
echo "passed"
else
echo "failed"
exit 1
fi

## sanitize HTML tags (tags removed, text content preserved)
input="foo.html:<p><a href=\"bar.html\">テキストテキスト<br>テキスト</a></p>"
result=$(printf '%s\n' "$input" | sanitize_grep_result)
echo "$result"
expect="foo.html:テキストテキストテキスト"
echo "$expect"
if [ "$result" = "$expect" ]; then
  echo "passed"
else
  echo "failed"
  exit 1
fi