run_scan.sh
#!/bin/bash
# today's date
today=$(date +'%Y-%m-%d')
# day of month mod 10; force base 10 so days 08 and 09 aren't parsed as octal
date_mod_10=$(( 10#$(date +'%d') % 10 ))
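# For example, on the 20th of the month 20 % 10 == 0, so the journal scan
# below runs; on the 21st (21 % 10 == 1) it is skipped.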
# data paths
azurepath='../blitstore/blitshare'
wileypdf="$azurepath/data/wiley/pdf"
wileyhtml="$azurepath/data/wiley/html"
tmppath="$azurepath/data/tmp"
reportpath="$azurepath/reports/scraper"
pdfpath="$azurepath/data/pdf"
wwwpath="./webapp/www/upload"
# postgres
pgpath="$azurepath/pg"
pgfile="$pgpath/param.txt"
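# param.txt supplies the Postgres connection parameters read by the scrape
# scripts; its exact format is defined by those scripts, not here. A purely
# hypothetical key=value sketch:
#   host=localhost
#   port=5432
#   dbname=blitz
#   user=blitz_user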
########################
# open access to the Azure file share (macOS)
# open -g "$AZURE_VOLUME"
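# AZURE_VOLUME is assumed to be exported in the environment; a hypothetical
# value: export AZURE_VOLUME=/Volumes/blitshare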
########################
# SCRAPE STAGE
# Scrape I: collect URLs
# (1) Bing custom search
# python3 ./scrape/custom_search_bing.py "$pgfile"
# (2) run searches for vulnerable genera against archives (bioRxiv, J-Stage, etc.)
# - maintain source list for this step
Rscript ./scrape/archive_indexes.R "$pgfile"
# (3) scan OpenAlex for individual species
Rscript ./scrape/scan_openalex.R "$pgfile"
if [ "$date_mod_10" -eq 0 ]
then
    # (4) scan relevant journals via OAI (currently under BioOne)
    # - maintain source list for this step
    Rscript ./scrape/scan_oai_sources.R "$pgfile"
    # (5) directed (bespoke per-journal) search where permitted
    # - maintain source list for this step
    Rscript ./scrape/journal_indexes.R "$pgfile"
    # (5a) ... including Wiley ConBio
    # DOIs are extracted directly here
    Rscript ./scrape/scan_conbio.R "$pgfile" "$wileyhtml"
else
    echo "Skipping journal scan ..."
fi
# (6) extract (other) DOIs from article URLs
python3 ./scrape/find_link_dois.py "$pgfile"
########################
# Scrape II: collect text
# (1) web scraping against links where text has not already been obtained
Rscript ./scrape/get_html_text.R "$pgfile"
# (2) update DOI database from CrossRef
# (3) get Wiley PDFs
# both steps run locally in 'run_app_update.sh'
# (4a) scan manually uploaded PDFs
python3 ./scrape/read_pdf_uploads.py "$pgfile" "$pdfpath" "$wwwpath"
# (4b) scan Wiley PDFs and extract their text
python3 ./scrape/read_wiley_pdf.py "$pgfile" "$wileypdf"
# (5) ... and follow PDF links for other domains
python3 ./scrape/get_pdf_text.py "$pgfile" "$tmppath"
# (6) remove duplicate records,
# i.e. different links for the same title/abstract and
# different DOI records with the same DOI
./scrape/remove_duplicates.sh "$pgfile"
# report
echo "Scan complete."