akflora/dataflow/PAF at master · camwebb/akflora

History

Name		Name	Last commit message	Last commit date
parent directory ..
LICENSE		LICENSE
README.sh		README.sh
create_syn_lookup.awk		create_syn_lookup.awk
p1.patch		p1.patch
paf		paf
paf.sql		paf.sql
paf_ak		paf_ak
parse_syns.awk		parse_syns.awk
syn_parse_tests		syn_parse_tests

README.sh

# Generate an Alaska names list from the Panarctic Flora

# See LICENSE file for data licensing

# Data history: In 2018, with permission of the PAF editors, Christian
#   Svindseth made the sqlite3 file running http://panarcticflora.org/
#   available to Cam Webb. Webb converted the encoding to UTF-8 and
#   loaded the data into a mysql/mariaDB DB. In Dec 2018 Reidar Elven
#   authorized the sharing of the raw data. The `paf.sql` file in this
#   github repo is the first public uploading of these data.

# paf.sql made with:
#   mysqldump -R -u ... -p... paf | sed 's/),(/),\n (/g' > paf.sql
# (The breaking of lines allows diff to find updates)

source ../ENV.sh


# 1. Load PAF data into a mysql/mariaDB database

echo "Loading PAF data into DB" > "/dev/stderr"
mysql -u $AKFLORA_DBUSER -p$AKFLORA_DBPASSWORD --show-warnings < paf.sql

# 2. Query the db

# this is needed to allow the connection to close - some sort of bug
#  --connect-timeout=1 does not help, exec <&- does not help
echo "(waiting 10 seconds)"
sleep 10
echo "(back again)"

# the entries.notes field has the references for the synonymy
echo "SELECT paf_id, ranking, name, label, subspecies, variety, 
             author, original_author, syns , specifier, affinis
      FROM entries 
      LEFT JOIN 
        ( SELECT entry_id , GROUP_CONCAT(\`raw\` SEPARATOR '^') AS syns 
          FROM \`references\` 
          GROUP BY entry_id ) AS b 
        ON entries.id = b.entry_id 
      LEFT JOIN 
        ( SELECT DISTINCT entry_id AS inak FROM entries 
            LEFT JOIN distributions ON entries.id = distributions.entry_id
            WHERE ((area = 'AN' OR area = 'AW') AND 
                    distribution != '-') OR geography LIKE '%ALA%') AS c
        ON entries.id = c.inak
      WHERE ( ranking = 'species' OR ranking = 'subspecies' ) AND
        c.inak IS NOT NULL;" | mysql -N -u $AKFLORA_DBUSER \
        -p$AKFLORA_DBPASSWORD --show-warnings paf | sed 's/NULL//g' > paf.1


# 3. Parse the synonyms out of the PAF text

# NB: The PAF synonyms are in the references table.
# I used the file `syn_parse_tests` to help construct the logic of the
# parse_syns.awk script.

gawk -f parse_syns.awk paf.1 > paf.2

# 4. Manual check (use datacheck-mode.el to assist) and build a patch

# cp paf.2 paf.3
# emacs paf.3
# diff paf.2 paf.3 > p1.patch

# NB: some duplicates were present:
# cat paf.3 | sort | uniq -d
#  100101   | Lastrea dilatata var. alpina Moore
#  3309044  | Carex capillaris var. robustior Drejer ex Lange
#  340602   | Hordeum boreale Scribn. & J.G. Sm.
#  420408   | Stellaria biflora L.
#  671611   | Cardamine digitata Richardson
# also:
# Warning: paf-340906, Hierochloë pauciflora R. Br. is both acc and syn
# Warning: paf-671611, Cardamine digitata Richardson is both acc and syn

echo "Manual checking of PAF data skipped..."

# 5. Run the patch to automate the manual editing

patch -o paf.3 paf.2 p1.patch
if [ $? -ne 0 ] ; then exit ; fi


# 6. Add a unique code to syn and create lookup

gawk -i "taxon-tools.awk" -f create_syn_lookup.awk > paf

# NB: Allow this:
# Warning: paf-343001, Dupontia fisheri R. Br. is both acc and syn
#  (paf-343001b-s1)

# 7. Make the ref table (could have done this earlier, in the first query)

echo "SELECT paf_id, notes FROM entries" | mysql -N -u $AKFLORA_DBUSER \
        -p$AKFLORA_DBPASSWORD --show-warnings paf | sed 's/NULL//g' | sed 's/\t/|/g' > paf.4

gawk 'BEGIN{FS=OFS="|"; while ((getline < "paf.4") > 0) notes[$1]=$2; while ((getline < "paf") > 0) { if ($9 == "accepted") id = $1 ; else id = $9; split(id,ix,"-"); print $1, $9, notes[ix[2]] }}' > paf_refs

# 8. Clean up
rm -f paf.1 paf.2 paf.3 paf.4



# Notes:

# This script is useful for accessing PAF info:
# echo "select paf_id,ranking,name,label,subspecies,variety,author,\
#   original_author, syns from entries left join ( select entry_id , \
#   GROUP_CONCAT(\`raw\` SEPARATOR '^') as syns from \`references\` \
#   group by entry_id ) as b on entries.id = b.entry_id where \
#   ( ranking = 'species' or ranking = 'subspecies' ) \
#   AND entries.paf_id = '$1';" | mysql -N -u cam -ptesttest paf | \
#     sed 's/NULL//g'

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

PAF

PAF

LICENSE

LICENSE

README.sh

README.sh

create_syn_lookup.awk

create_syn_lookup.awk

p1.patch

p1.patch

paf

paf

paf.sql

paf.sql

paf_ak

paf_ak

parse_syns.awk

parse_syns.awk

syn_parse_tests

syn_parse_tests

README.sh

Files

PAF

Directory actions

More options

Directory actions

More options

Latest commit

History

PAF

Folders and files

parent directory