-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_proc.sh
executable file
·37 lines (27 loc) · 1021 Bytes
/
run_proc.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/bin/bash
#
# run_proc.sh - run the PROCESS stage: five Python steps executed in order,
# each receiving the postgres parameter file as its first argument.
#
# All paths below are relative, so the script must be started from the
# directory that contains ./process and has ../blitstore/blitshare alongside.
#
# The run aborts at the first failing step and exits non-zero; the
# "Processing complete." message is printed only if every step succeeded.

# Strict mode: stop on errors, on unset variables and on failed pipelines.
set -euo pipefail

# Report which command failed (on stderr) before the script exits.
trap 'printf "Processing failed at line %s: %s\n" "$LINENO" "$BASH_COMMAND" >&2' ERR

# data paths
# NOTE: wileypdf, wileyhtml, tmppath and reportpath are not referenced by the
# steps in this script; they are kept so the path layout stays documented here.
azurepath='../blitstore/blitshare'
wileypdf="$azurepath/data/wiley/pdf"
wileyhtml="$azurepath/data/wiley/html"
tmppath="$azurepath/data/tmp"
reportpath="$azurepath/reports/scraper"

# postgres
pgpath="$azurepath/pg"
pgfile="$pgpath/param.txt"

# taxonomy and model files
birdfile="$azurepath/data/BirdLife_species_list_Jan_2022.xlsx"
blimodelfile="$azurepath/data/bli_model_bow_11107.json"

########################
# PROCESS STAGE
# Arguments are quoted so paths containing spaces or glob characters are
# passed to each script as a single, unmodified argument.

# (1) date correction from (CrossRef) 'dois' table and normalisation
python3 ./process/fix_dates.py "$pgfile"

# (2) language detection using SpaCy
python3 ./process/detect_language.py "$pgfile"

# (3) find species references in all text
python3 ./process/find_species.py "$pgfile" "$birdfile"

# (4) pass non-English text to Azure for translation
python3 ./process/translate_to_english.py "$pgfile"

# (5) score title/abstract (not pdftext at this stage) on BLI text model
python3 ./process/score_for_topic.py "$pgfile" "$blimodelfile"

# report (reached only when all steps above exited successfully)
echo "Processing complete."