# Output list of records exported as a workbook

Author: José R. Ferrer-Paris

We want to:
- Read information from the database, and
- Create a workbook with:
    - Authoring information and instructions
    - Table with records for each species (with links to DB?)
    - Trait codes and descriptions
    - Vocabularies
    - List of references


## Setup

This section covers the basic setup for the project.

### Import modules

In [1]:
# work with paths in operating system
from pathlib import Path
import os

# datetime support
import datetime

# work with xlsx workbooks
import openpyxl
from openpyxl import Workbook
from openpyxl.worksheet.table import Table, TableStyleInfo
from openpyxl.styles import Alignment, PatternFill, Border, Font # Side, Alignment, Protection,
from openpyxl.formatting import Rule
from openpyxl.styles.differential import DifferentialStyle
from openpyxl.worksheet.datavalidation import DataValidation

from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.utils import get_column_letter

# For database connection
from configparser import ConfigParser
import psycopg2
from psycopg2.extras import DictCursor

# Pandas for calculations
import pandas as pd


### Define paths for input and output

In [2]:
# Repository root, given relative to the current working directory.
repodir = Path("../../")
# Folder with the report workbooks; the final workbook is saved here as well.
inputdir = repodir.joinpath("data", "output-report")
# List what is currently in that folder.
os.listdir(inputdir)

['fireveg-trait-records-model.xlsx',
 'fireveg-trait-records-curation.xlsx',
 'fireveg-field-report-model.xlsx']

### Database connection

Function to parse connection parameters from a file

In [3]:
def read_dbparams(filename,section="postgresql"):
    """Return the connection parameters stored in one section of an INI file.

    Args:
        filename: path of the configuration file to parse.
        section: name of the section to read; defaults to "postgresql".

    Returns:
        dict mapping every option name found in the section to its value
        (values are the strings read from the file).

    Raises:
        Exception: if the requested section is not present. A file that
            cannot be read ends up here too, because ConfigParser.read
            silently skips files it cannot open.
    """
    config = ConfigParser()
    config.read(filename)

    # Guard clause: nothing to return when the section does not exist.
    if not config.has_section(section):
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))

    return {option: value for option, value in config.items(section)}


Reading the default parameters for this session:

In [4]:
# Connection settings are kept out of the notebook, in secrets/database.ini.
filename = repodir.joinpath('secrets', 'database.ini')
# Read the parameters stored under the [aws-lght-sl] section of that file.
dbparams = read_dbparams(filename, section='aws-lght-sl')

### Helper functions

## Read information from database

Connect to the database:

In [5]:
# Re-running this cell must not open a second connection: connect only when
# there is no usable `conn` yet (psycopg2 reports an open connection as closed == 0).
if not ("conn" in globals() and conn.closed == 0):
    print('Connecting to the PostgreSQL database...')
    conn = psycopg2.connect(**dbparams)
# Same idea for the cursor; DictCursor rows can be read by position or by column name.
if not ("cur" in globals() and not cur.closed):
    cur = conn.cursor(cursor_factory=DictCursor)

Connecting to the PostgreSQL database...


Now we can perform queries to the database and save the results for use later.

The table with trait information can be requested with a simple query:

In [6]:
# One row per trait, ordered by trait code. The SELECT is split over several
# literals for readability only; the statement sent to the server is unchanged.
trait_query = (
    "SELECT code,name,description,value_type,life_stage,"
    "life_history_process,priority,method_vocabulary "
    "FROM litrev.trait_info ORDER BY code"
)
cur.execute(trait_query)
trait_info = cur.fetchall()

In [7]:
# Show the first row to check the order of the columns returned by the query.
trait_info[0]

['disp1',
 'Propagule dispersal mode',
 'Propagule dispersal mode',
 'categorical',
 'Seed',
 'Dispersal',
 'needed for VA group',
 'method_disp1_vocabulary']

In [8]:
# Lookup table keyed by trait code. Fields are read by position
# (0 = code, 1 = name, 3 = value_type), except method_vocabulary, which is
# read by column name; 'method' is True when the trait has such a vocabulary.
traitnames = {
    rec[0]: {
        'name': rec[1],
        'type': rec[3],
        'method': rec["method_vocabulary"] is not None,
    }
    for rec in trait_info
}
# Quick check of one entry.
traitnames['disp1']

{'name': 'Propagule dispersal mode', 'type': 'categorical', 'method': True}

Now we use a general query string to retrieve selected columns for categorical or numerical traits:

In [9]:
# Query template shared by all trait tables. It is filled in two steps:
#   1. the two %s slots receive the SQL fragment for the value column and the
#      fragment for the method column (done at the end of this cell);
#   2. {trait} and {traitname} are substituted with str.format right before
#      the query is executed.
# NOTE(review): both steps splice text directly into the SQL statement, so a
# trait name containing a single quote would break the query.
qry= """
SELECT "currentScientificName" as spp, "currentScientificNameCode" as sppcode,
    species, species_code,
    '{trait}' as trait_code,
    '{traitname}' as trait_name,
    %s
    %s
    weight as w,
    main_source as refs,
    original_sources as orefs,
    record_id
FROM litrev.{trait} 
LEFT JOIN species.caps
ON species_code="speciesCode_Synonym"
WHERE "currentScientificName" is not NULL AND weight>0
ORDER BY spp;
"""

# Value column: a single text value for categorical traits, or a
# [best, lower, upper] array for numerical traits.
norm_cat = "norm_value::text as val,"
norm_num = "ARRAY[best,lower,upper] as val,"
# Method column: the real column when selected, otherwise a NULL placeholder
# so that every query returns the same set of columns.
method = "method_of_estimation,"
nomethod = "NULL as method_of_estimation,"

# The four combinations of value column x method column.
qryCat, qryNum = (qry % (value_sql, nomethod) for value_sql in (norm_cat, norm_num))
qryCatMet, qryNumMet = (qry % (value_sql, method) for value_sql in (norm_cat, norm_num))


Next, we loop over the selected categorical and numerical traits, run the matching query for each one, and accumulate all rows in a single list (using `extend`). The trait codes are also the names of the tables in the `litrev.` schema.

In [10]:
# All rows from all trait tables end up in this single list.
records = []
#traits = ['repr2','rect2','repr3']
traits = ['surv1','surv4','surv5','repr2','rect2','disp1','germ1','germ8','repr3','repr3a','repr4',]
# Column labels for the data frame, in the same order as the SELECT list of
# the query template.
colnames = ['scientific name','current code (BioNET)','original name','CAPS code',
            'trait code','trait name','norm value','method','weight','source ref','other ref','recordid']
for trait in traits:
    meta = traitnames[trait]
    # Pick the template first; anything that is not categorical, or is not a
    # numeric trait with a method vocabulary, uses the plain numerical query.
    if meta['type'] == 'categorical':
        template = qryCatMet if meta['method'] else qryCat
    elif meta['type'] == 'numeric' and meta['method']:
        template = qryNumMet
    else:
        template = qryNum
    cur.execute(template.format(trait=trait, traitname=meta['name']))
    records.extend(cur.fetchall())


Now we can create a dataframe `df` using pandas:

In [11]:
# One data frame row per database record; columns are labelled with `colnames`.
df = pd.DataFrame(records,columns=colnames)
# Display the data frame.
df

Unnamed: 0,scientific name,current code (BioNET),original name,CAPS code,trait code,trait name,norm value,method,weight,source ref,other ref,recordid
0,Abelmoschus moschatus subsp. moschatus,9878,Abelmoschus moschatus,3624,surv1,Resprouting - full canopy scorch,All,,1,austraits-3.0.2,[Clarke Lawes Murphy Russell-Smith Nano Bradst...,24487
1,Abrotanella nivigena,1246,Abrotanella nivigena,1246,surv1,Resprouting - full canopy scorch,All,,1,austraits-3.0.2,[White Sinclair Frood 2020],33536
2,Abrotanella spp.,ABRO,Abrotanella sp.,ABRO,surv1,Resprouting - full canopy scorch,All,,1,austraits-3.0.2,[White Sinclair Frood 2020],33537
3,Abutilon fraseri,3627,Abutilon fraseri,3627,surv1,Resprouting - full canopy scorch,,,1,austraits-3.0.2,[White Sinclair Frood 2020],33539
4,Abutilon grandifolium,3628,Abutilon grandifolium,3628,surv1,Resprouting - full canopy scorch,,,1,austraits-3.0.2,[Clarke Lawes Murphy Russell-Smith Nano Bradst...,24488
...,...,...,...,...,...,...,...,...,...,...,...,...
43933,Platysace linearifolia,1145,Platysace linearifolia,1145,repr4,Maturation age,"[4, None, None]",,1,NSWFFRDv2.1,[Benson 1985],42
43934,Pomaderris discolor,5577,Pomaderris discolor,5577,repr4,Maturation age,"[None, None, 5]",,1,NSWFFRDv2.1,[Benson McDougall Ecology Sydney Plant Species...,43
43935,Pomaderris ferruginea,5579,Pomaderris ferruginea,5579,repr4,Maturation age,"[5, None, None]",,1,NSWFFRDv2.1,[Benson McDougall Ecology Sydney Plant Species...,44
43936,Pomaderris intermedia,7979,Pomaderris intermedia,7979,repr4,Maturation age,"[5, None, None]",,1,NSWFFRDv2.1,[Benson McDougall Ecology Sydney Plant Species...,45


In [12]:
# List the distinct values found in the 'method' column.
df['method'].unique()

array([None, 'Unspecified methods',
       'Inferred from plant / organ / growth stage morphology',
       'Inferred from related taxa'], dtype=object)

And now we can extract a list of valid references from this data frame:

In [13]:
# Start with the distinct main sources ...
flat_list = df['source ref'].unique().tolist()
# ... then add every code from the per-record lists of other references
# (records without such a list hold None and are skipped).
flat_list.extend(
    ref_code
    for cited in df['other ref'].tolist()
    if cited is not None
    for ref_code in cited
)

# De-duplicate; a tuple is what the IN clause of the next query is given.
valid_refs = tuple(set(flat_list))


Now we query the database for the citation details of only the references in that list:

In [14]:
# The whole tuple of codes is passed as the single query parameter; psycopg2
# adapts a Python tuple to the parenthesised list expected by IN.
ref_query = "SELECT ref_code,ref_cite FROM litrev.ref_list WHERE ref_code IN %s ORDER BY ref_code"
cur.execute(ref_query, (valid_refs,))
ref_info = cur.fetchall()

In [15]:
# Number of references retrieved.
len(ref_info)

268

Now that we have everything we need from the database, we can close the connection:

In [16]:
# Close the cursor first, then the connection.
cur.close()
        
if conn is not None:
    conn.close()
    print('Database connection closed.')

Database connection closed.


## Create workbook

### Styles
Define styles to be used across the workbook

In [17]:
# Cell alignments: centred without wrapping, and top-left with wrapping.
cent_align = Alignment(horizontal='center', vertical='center', wrap_text=False)
wrap_align = Alignment(horizontal='left', vertical='top', wrap_text=True)

# Smaller font, applied below to the longer text columns.
fontSmall = Font(size="9")

# Tab colours (hex strings), looked up by the "tabColor" key of each worksheet.
sheet_colors = {
    "intro": "1072BA",
    "summary": "5AFF5A",
    "default": "505050",
    "addentry": "20CA82",
}

# One TableStyleInfo per kind of table. Only the built-in style name, the
# first-column highlight and the row stripes vary; last-column highlight and
# column stripes are always off.
table_style = {
    kind: TableStyleInfo(
        name=style_name,
        showFirstColumn=first_column,
        showLastColumn=False,
        showRowStripes=row_stripes,
        showColumnStripes=False,
    )
    for kind, style_name, first_column, row_stripes in (
        ("Instructions", "TableStyleMedium9", True, True),
        ("Contributor", "TableStyleMedium18", True, False),
        ("Lists", "TableStyleMedium14", True, False),
        ("Info", "TableStyleMedium14", True, False),
        ("Vocabularies", "TableStyleMedium14", True, False),
        ("Entry", "TableStyleMedium18", False, False),
    )
}




### Initialise workbook with worksheets

In [18]:
# New, empty workbook; the worksheets are set up in the next cell.
wb = Workbook()

In [19]:
# Worksheet layout: title, column widths and tab colour of each sheet.
# "colWidths" is a list of (columns, width) pairs, where `columns` is either a
# single column letter or a tuple of letters sharing that width.
wsheets = (
    {
        "title": "About",
        "colWidths": [("A", 90), ("B", 40)],
        "tabColor": "intro",
        "active": True,
    },
    {
        "title": "Summary",
        "colWidths": [
            (("A", "C", "N"), 45),
            (("B", "D", "E", "H", "I", "J", "L"), 12),
            (("F", "G", "K", "M", "O"), 30),
        ],
        "tabColor": "summary",
    },
    {
        "title": "References",
        "colWidths": [("A", 25), ("B", 80)],
        "tabColor": "addentry",
    },
    {
        "title": "Trait description",
        "colWidths": [(("A", "D", "E", "F"), 12), (("B", "G"), 30), ("C", 70)],
        "tabColor": "default",
    },
)
for spec in wsheets:
    # The entry carrying an "active" key reuses the workbook's active sheet;
    # every other entry gets a newly created sheet.
    if "active" in spec:
        ws = wb.active
        ws.title = spec['title']
    else:
        ws = wb.create_sheet(spec['title'])
    for columns, width in spec['colWidths']:
        # Iterating a one-letter string yields that letter, so single columns
        # and tuples of columns are handled by the same loop.
        for letter in columns:
            ws.column_dimensions[letter].width = width
    ws.sheet_properties.tabColor = sheet_colors[spec["tabColor"]]


### `About` worksheet

In [20]:
ws = wb["About"]

# Header lines of the sheet, written one per row in column A.
export_date = datetime.date.today().strftime('%d %b %Y')
info = ("Fire Ecology Traits for Plants",
        "Version 1.00 (April 2022)",
        "This data export reflects the status of the database on the %s" % export_date,
        "Developed by  José R. Ferrer-Paris and David Keith",
        "Centre for Ecosystem Science / University of New South Wales",
        "Please cite this work as:",
        "Ferrer-Paris, J. R. and Keith, D. A. (2022) Fire Ecology Traits for Plants: A database for fire research and management. Version 1.00. Centre for Ecosystem Science, University of New South Wales, Sydney, Australia.", 
        "DISCLAIMER:",
        "DATA IS NOT READY FOR FINAL USE OR CRITICAL APPLICATIONS AND YOU SHOULD NOT DISTRIBUTE THIS DATA."
        )

for row_no, text in enumerate(info, start=1):
    line = ws.cell(row_no, 1, value=text)
    line.alignment = wrap_align

# Special formatting for individual header rows: title, affiliation link.
ws.cell(1,1).style = 'Title'
ws.cell(5,1).hyperlink = 'https://www.unsw.edu.au/research/ecosystem'
ws.cell(5,1).style = 'Hyperlink'

# Disclaimer (rows 8 and 9) in red.
ws.cell(8,1).font = Font(color="FF0000", bold=True, italic=False)
ws.cell(9,1).font = Font(color="FF0000", italic=True)


supporters = ({'institution':"University of New South Wales",'url':"https://www.unsw.edu.au/"},
              {'institution':"NSW Bushfire Research Hub",'url':"https://www.bushfirehub.org/"},
              {'institution':"NESP Threatened Species Recovery Hub",'url':"https://www.nespthreatenedspecies.edu.au/"},
              {'institution':"NSW Department of Planning & Environment",'url':"https://www.planning.nsw.gov.au/"})

# Leave one empty row after the header lines, then the heading and one
# hyperlinked row per supporting institution.
support_heading = len(info) + 2
ws.cell(support_heading, 1, value="This work has been supported by:")
for row_no, supporter in enumerate(supporters, start=support_heading + 1):
    link = ws.cell(row_no, 1)
    link.value = supporter['institution']
    link.hyperlink = supporter['url']
    link.style = "Hyperlink"

description = (
              "Taxonomic nomenclature following BioNET (data export from February 2022)",
              "Data in the report is summarised based on BioNET fields 'currentScientificName' and 'currentScientificNameCode'",
              "For general description of the traits, please refer to the 'Trait description' sheet",
              "Vocabularies for categorical traits are available in the 'Vocabularies' sheet",
              "For categorical traits the values in the 'Summary' sheet show the different values reported in the literature records separated by slashes.",
               "If more than one category has been reported, the values are ordered from higher to lower 'weight', categories receiving less than 10% weight are in round brackets, categories with less than 5% in square brackets",
              "The default weight is calculated by multiplying the number of times a value is reported (nr. of records) with the weight given to each record (default to 1), and divided by the weight of all records for a given species.",
              "Default weights  overridden by expert advice to the administrator will be marked, with justification given in the Notes column of the output.",
              "An asterisk (*) in a trait cell indicates a potential data entry error or uncertainty in the assignment of a trait category or value.",
              "'Import/Entry sources' refer to references that were imported directly using automated scripts or manual entry. These include: 1) Primary observations of traits from published research or reports; and 2) Compilations of data (e.g. databases, spreadsheets, published reviews) that include two or more sources of primary observations.",
              "'Indirect sources' refer to references that were cited in Import/Entry sources, where the latter are compilations of multiple primary sources (see Import/Entry sources). Information from indirect sources may have been modified when it was incorporated into those compilations. The original source of primary trait observations has not yet been verified prior to import into this database. When the primary source is reviewed and the trait values are verified, these records will be attributed to the primary source as 'Import/Entry sources'.",
              "Some sheets are protected to avoid accidental changes, but they are not password protected. If you need to filter and reorder entries in the table, please unprotect the sheet first.",
              )

# The explanatory notes start two empty rows below the last supporter.
notes_start = support_heading + 1 + len(supporters) + 2
for row_no, text in enumerate(description, start=notes_start):
    note = ws.cell(row_no, 1, value=text)
    note.alignment = wrap_align

# Protect the sheet (no password is set) against accidental edits.
ws.protection.sheet = True

### `Trait description` worksheet

In [21]:
ws = wb["Trait description"]

# Introductory notes go in column C; the two empty strings still create
# cells, so the table header appended below lands on the row after them.
description = ("The following table gives a general description of the traits used in the 'Summary' sheet",
               "This sheet is protected to avoid accidental changes, but it is not password protected. If you need to filter and reorder entries in the table, please unprotect the sheet first.",
              "Vocabularies for categorical traits are available in the 'Vocabularies' sheet","","")

for row_no, text in enumerate(description, start=1):
    note = ws.cell(row_no, 3, value=text)
    note.alignment = wrap_align

# Row of the table header (first row after the notes).
tab_begin = len(description) + 1
ws.append(["Trait Code", "Trait Name", "Description", "Type", "Life stage", "Life history process", "Data migration"])

# One table row per trait, columns in the same order as the header above.
trait_fields = ("code", "name", "description", "value_type", "life_stage", "life_history_process", "priority")
for row_no, trait in enumerate(trait_info, start=tab_begin + 1):
    for col_no, field in enumerate(trait_fields, start=1):
        ws.cell(row=row_no, column=col_no, value=trait[field])

# Wrap the (long) description column over the whole table, header included.
for row_no in range(tab_begin, ws.max_row + 1):
    ws.cell(row_no, 3).alignment = wrap_align

tab = Table(displayName="TraitInformation", ref="A{}:G{}".format(tab_begin, ws.max_row))

tab.tableStyleInfo = table_style["Info"]
ws.add_table(tab)
# Protect the sheet (no password is set) against accidental edits.
ws.protection.sheet = True

### `Summary` worksheet

In [22]:

ws = wb["Summary"]

# Header row of the sheet; the position in this list is the column number
# used in the ws.cell(...) calls below (1 = 'scientific name', ...).
colnames = ['scientific name','current code (BioNET)',
            'original name (as entered)','CAPS code (old)',
            'trait code','trait name','norm value',
            'best','lower','upper',
            'method of estimation',
            'weight','source ref','other ref','DB link']
ws.append(colnames)

rows = df.sort_values(by =['scientific name','trait code']).to_dict(orient="records")


def _format_norm_value(value):
    """Return what is shown in the 'norm value' column for one record.

    `value` is a string for categorical traits, a (best, lower, upper)
    triplet for numerical traits, or None when nothing was recorded.
    """
    if isinstance(value, str):
        return value
    if value is None:
        return "(data input ERROR)"
    best, lower, upper = value
    if best is not None:
        if lower is None and upper is None:
            return best
        # NOTE(review): when only one bound is missing it is printed as "None".
        return "%s (%s -- %s)" % (best, lower, upper)
    if lower is None and upper is None:
        return "(data input ERROR)"
    if lower is None:
        return "<%s" % upper
    if upper is None:
        return ">%s" % lower
    return "(%s -- %s)" % (lower, upper)


for r_idx, row in enumerate(rows, 2):

    ws.cell(row=r_idx, column=1, value=row['scientific name'])
    ws.cell(r_idx,1).font = Font(italic=True)

    ws.cell(row=r_idx, column=2, value=row['current code (BioNET)'])
    # Original name and CAPS code are only shown when the record was entered
    # under a name different from the current scientific name.
    if row['original name'] != row['scientific name']:
        ws.cell(row=r_idx, column=3, value=row['original name'])
        ws.cell(r_idx,3).font = Font(italic=True, color="110000")
        ws.cell(row=r_idx, column=4, value=row['CAPS code'])
    ws.cell(row=r_idx, column=5, value=row['trait code'])
    ws.cell(row=r_idx, column=6, value=row['trait name'])

    ws.cell(row=r_idx, column=11, value=row['method'])
    ws.cell(row=r_idx, column=12, value=row['weight'])
    ws.cell(row=r_idx, column=13, value=row['source ref'])
    if row['other ref'] is not None:
        ws.cell(row=r_idx, column=14, value="; ".join(row['other ref']))

    norm_value = row['norm value']
    # Numerical traits: best / lower / upper also go in columns 8 to 10.
    if norm_value is not None and not isinstance(norm_value, str):
        for col_no, bound in enumerate(norm_value, start=8):
            if bound is not None:
                ws.cell(row=r_idx, column=col_no, value=bound)
    ws.cell(row=r_idx, column=7, value=_format_norm_value(norm_value))

    # Link back to the record in the database web interface.
    link_text = "trait:%s / sp code:%s / record id:%s" % (row['trait code'],row['CAPS code'],row['recordid'])
    url = "http://13.54.3.205/traits/%s/%s" % (row['trait code'],row['CAPS code'])
    cell = ws.cell(row=r_idx, column=15, value=link_text)
    cell.hyperlink = url
    cell.style = 'Hyperlink'

    for col_no in (2,4,5,7,12):
        ws.cell(r_idx,col_no).alignment = cent_align
    for col_no in (11,13,14,15):
        ws.cell(r_idx,col_no).font = fontSmall
        ws.cell(r_idx,col_no).alignment = wrap_align


# The table range is taken from the sheet itself (as in the other sheets)
# rather than from the loop variable, which is undefined, or left over from a
# previous run, when there are no records. Without records the table would
# consist of the header row only, so it is not added at all.
if rows:
    tab = Table(displayName="Summary",
                ref="A1:{}{}".format(get_column_letter(len(colnames)), ws.max_row))
    tab.tableStyleInfo = table_style["Lists"]
    ws.add_table(tab)


### `References` worksheet

In [23]:
ws = wb["References"]

# Introductory notes go in column B; the two empty strings still create
# cells, so the table header appended below lands on the row after them.
description = ("The following table includes bibliographical information for the sources referenced in the 'Summary' sheet",
               "This sheet is protected to avoid accidental changes, but it is not password protected. If you need to filter and reorder entries in the table, please unprotect the sheet first.",
              "","")

for row_no, text in enumerate(description, start=1):
    note = ws.cell(row_no, 2, value=text)
    note.alignment = wrap_align

# Row of the table header (first row after the notes).
header_row = len(description) + 1
ws.append(["Reference code", "Reference information"])

# One table row per reference: code and citation text.
for ref in ref_info:
    ws.append(ref)

# Wrap the citation text and use the small font on every data row.
for row_no in range(header_row + 1, ws.max_row + 1):
    citation = ws.cell(row_no, 2)
    citation.alignment = wrap_align
    citation.font = fontSmall

tab = Table(displayName="ReferenceInformation", ref="A{}:B{}".format(header_row, ws.max_row))

tab.tableStyleInfo = table_style["Lists"]
ws.add_table(tab)
# Protect the sheet (no password is set) against accidental edits.
ws.protection.sheet = True

### Save workbook

In [24]:
# Write the workbook into the report folder (`inputdir` is also the output location).
wb.save(inputdir / "fireveg-trait-records-model.xlsx")