# Read files summarising field work and update database
These Excel workbooks were imported in February 2022.

The scripts documented here have been created to:

- Read data from spreadsheets with field-work data
- Create records for data import into the database
- Insert or update records in the database


## Set-up
Load libraries 

In [1]:
import openpyxl
from pathlib import Path
import os
from datetime import datetime
from configparser import ConfigParser
import psycopg2
from psycopg2.extras import DictCursor
from psycopg2.extensions import AsIs
#import postgis

Define path to workbooks

In [2]:
# Root of the repository, given relative to the directory the notebook runs in
repodir = Path("../../") 
# Folder holding the field-form workbooks read in the cells that follow
inputdir = repodir / "data" / "field-form"

 ### DB connection parameters and helper functions
 
 Database credentials are stored in a database.ini file

In [3]:
# Credentials file and the section within it that holds the connection settings
filename = repodir / 'secrets' / 'database.ini'
section = 'aws-lght-sl'

# Parse the ini file
parser = ConfigParser()
parser.read(filename)

# Stop early when the requested section is not available
if not parser.has_section(section):
    raise Exception('Section {0} not found in the {1} file'.format(section, filename))

# Copy every option of the section into a plain dictionary
db = {option: value for option, value in parser.items(section)}

# Keyword arguments handed to psycopg2.connect in later cells
params = db

Get updated vocabularies from database

In [4]:
# Query returning the labels of one PostgreSQL enum type; the type name is
# passed as a bound parameter instead of being written into the SQL text.
enum_label_query = "SELECT enumlabel FROM pg_enum e LEFT JOIN pg_type t ON e.enumtypid=t.oid where typname=%s;"

# connect to the PostgreSQL server
print('Connecting to the PostgreSQL database...')
conn = psycopg2.connect(**params)
try:
    cur = conn.cursor()
    try:
        # Vocabularies are read from the enum types so they always match the database
        cur.execute(enum_label_query, ('resprout_organ_vocabulary',))
        valid_organ_list = cur.fetchall()

        cur.execute(enum_label_query, ('seedbank_vocabulary',))
        valid_seedbank_list = cur.fetchall()
    finally:
        # Release the cursor even when one of the queries fails
        cur.close()
finally:
    # Always close the connection, otherwise a failed query would leak it
    conn.close()
    print('Database connection closed.')

# fetchall() returns one-element tuples; flatten them into plain lists of labels
organ_vocab = [item for t in valid_organ_list for item in t]
seedbank_vocab = [item for t in valid_seedbank_list for item in t]

Connecting to the PostgreSQL database...
Database connection closed.


Define a function to batch process insert or update queries:

In [5]:
def batch_upsert(params,table,records,keycol,idx, execute=False,useconn=None):
    """Insert a batch of records into a table, optionally updating on conflict.

    One ``INSERT`` statement is built per record. All statements run in a
    single transaction that is committed at the end; if anything fails the
    transaction is rolled back and the exception is re-raised.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``psycopg2.connect``; only used when
        ``useconn`` is None.
    table : str
        Target table name. Spliced into the SQL unquoted.
    records : iterable of dict
        Each dict maps column names to values. The value stored under the
        key ``'geom'`` is treated as a SQL expression and spliced unquoted.
        Records are not modified.
    keycol : sequence of str
        Key columns. They are left out of the ``DO UPDATE SET`` list, and
        records with no more columns than ``keycol`` are skipped.
    idx : str or None
        Name of the constraint used in ``ON CONFLICT ON CONSTRAINT``. When
        None the statement uses ``ON CONFLICT DO NOTHING`` instead.
    execute : bool
        When False (default) the statements are printed, not executed.
    useconn : connection or None
        Existing connection to reuse. It is never closed here.

    Notes
    -----
    ``table``, the column names, ``idx`` and ``geom`` values are inserted
    into the SQL text verbatim, so they must come from trusted code, never
    from user-supplied data.
    """
    owns_connection = useconn is None
    if owns_connection:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**params)
    else:
        conn = useconn
    cur = conn.cursor()
    updated_rows=0

    try:
        for record in records:
            # Nothing to write when the record only carries key columns
            if len(record.keys()) <= len(keycol):
                continue

            columns = list(record.keys())
            # geom holds a SQL expression (e.g. ST_GeomFromText(...)); wrapping it
            # in AsIs splices it unquoted without touching the caller's record.
            values = tuple(AsIs(v) if k == 'geom' else v for k, v in record.items())

            if idx is not None:
                qrystr = "INSERT INTO %s (%s) values %s ON CONFLICT ON CONSTRAINT %s DO UPDATE SET %s"
                upd = ["{col}=EXCLUDED.{col}".format(col=k) for k in columns if k not in keycol]
                qry = cur.mogrify(qrystr, (AsIs(table),
                                AsIs(','.join(columns)),
                                values,
                                AsIs(idx),
                                AsIs(','.join(upd))
                               ))
            else:
                qrystr = "INSERT INTO %s (%s) values %s ON CONFLICT DO NOTHING"
                qry = cur.mogrify(qrystr, (AsIs(table),
                                AsIs(','.join(columns)),
                                values
                               ))

            if execute:
                cur.execute(qry)
                if cur.rowcount > 0:
                    updated_rows = updated_rows + cur.rowcount
            else:
                # mogrify returns bytes; decode so dry runs always print readable text
                print(qry.decode('utf-8', errors='replace') if isinstance(qry, bytes) else qry)

        conn.commit()
        print("%s rows updated" % (updated_rows))
    except Exception:
        # Leave the connection usable: discard the partially applied batch
        conn.rollback()
        raise
    finally:
        cur.close()
        # Only close connections opened by this function
        if owns_connection:
            conn.close()
            print('Database connection closed.')

        

Just a test with random data. Use `execute=False` to print the query instead of running it:

In [6]:
# Dry run: with execute=False the generated statement is printed, not executed.
# A constraint name is given, so the statement ends in DO UPDATE SET ...
record={'site_label':'test','geom':"ST_GeomFromText('POINT(1 2)', 4326)"}
batch_upsert(params,"form.field_site",(record,),keycol=('site_label',), idx='field_site_pkey1',execute=False)

Connecting to the PostgreSQL database...
INSERT INTO form.field_site (site_label,geom) values ('test', ST_GeomFromText('POINT(1 2)', 4326)) ON CONFLICT ON CONSTRAINT field_site_pkey1 DO UPDATE SET geom=EXCLUDED.geom
0 rows updated
Database connection closed.


In [7]:
# Same dry run without a constraint name: the statement ends in ON CONFLICT DO NOTHING
batch_upsert(params,"form.field_site",(record,),keycol=('site_label',), idx=None,execute=False)

Connecting to the PostgreSQL database...
INSERT INTO form.field_site (site_label,geom) values ('test', ST_GeomFromText('POINT(1 2)', 4326)) ON CONFLICT DO NOTHING
0 rows updated
Database connection closed.


## Read workbooks
Each spreadsheet has a slightly different structure, so these scripts have to be adapted for each case.

### List of workbooks/spreadsheets in directory

In [8]:
# Unfiltered listing of the input directory, used to choose the workbooks to import
os.listdir(inputdir)

['~$Fire response quadrat survey Newnes Nov2020_DK_revised IDs+AllNovData.xlsm',
 'SthnNSWRF_data_bionet2.xlsx',
 '~$UNSW_VegFireResponse_DataEntry_Yatteyattah all +DK +Milton.xlsx',
 'UNSWFireVegResponse_UplandBasalt_AlexThomsen+DK.xlsx',
 'UNSW_VegFireResponse_RMK_reformat_Sep2021a.xlsx',
 'UNSW_VegFireResponse_DataEntry_Yatteyattah all +DK +Milton.xlsx',
 '~$UNSW_VegFireResponse_RMK_reformat_Sep2021a.xlsx',
 'UNSW_VegFireResponse_KNP AlpAsh.xlsx',
 'UNSW_VegFireResponse_AlpineBogs_reformat_Sep2021.xlsx',
 'RobertsonRF_data_bionet2.xlsx',
 'Fire response quadrat survey Newnes Nov2020_DK_revised IDs+AllNovData.xlsm']

In [9]:
# Workbooks to process. Entries of the directory listing that start with '~$'
# are left out (presumably temporary lock files created by Excel -- confirm).
valid_files = ['SthnNSWRF_data_bionet2.xlsx',
               'UNSWFireVegResponse_UplandBasalt_AlexThomsen+DK.xlsx',
               'UNSW_VegFireResponse_RMK_reformat_Sep2021a.xlsx',
               'UNSW_VegFireResponse_DataEntry_Yatteyattah all +DK +Milton.xlsx',
               'UNSW_VegFireResponse_KNP AlpAsh.xlsx',
               'UNSW_VegFireResponse_AlpineBogs_reformat_Sep2021.xlsx',
               'RobertsonRF_data_bionet2.xlsx',
               'Fire response quadrat survey Newnes Nov2020_DK_revised IDs+AllNovData.xlsm']

Here we create an index of worksheets and column headers for each file

In [10]:
# Index of every workbook: {workbook name: {worksheet title: [row 1 values, row 2 values]}}
wbindex=dict()
for workbook_name in valid_files:
    inputfile=inputdir / workbook_name
    # using data_only=True to get the calculated cell values
    wb = openpyxl.load_workbook(inputfile,data_only=True)
    sheet_index=dict()
    for ws in wb.worksheets:
        # openpyxl columns are 1-based and max_column is the last used column,
        # so the range must run to max_column + 1 to include it.
        column_numbers = range(1, ws.max_column + 1)
        # ws.title is the public accessor for the worksheet name
        sheet_index[ws.title]=[
            [ws.cell(row=1,column=k).value for k in column_numbers],
            [ws.cell(row=2,column=k).value for k in column_numbers],
        ]
    wbindex[workbook_name]=sheet_index

In [11]:
# Workbooks that were indexed
wbindex.keys()

dict_keys(['SthnNSWRF_data_bionet2.xlsx', 'UNSWFireVegResponse_UplandBasalt_AlexThomsen+DK.xlsx', 'UNSW_VegFireResponse_RMK_reformat_Sep2021a.xlsx', 'UNSW_VegFireResponse_DataEntry_Yatteyattah all +DK +Milton.xlsx', 'UNSW_VegFireResponse_KNP AlpAsh.xlsx', 'UNSW_VegFireResponse_AlpineBogs_reformat_Sep2021.xlsx', 'RobertsonRF_data_bionet2.xlsx', 'Fire response quadrat survey Newnes Nov2020_DK_revised IDs+AllNovData.xlsm'])

In [13]:
# Worksheet titles found in this workbook
wbindex['SthnNSWRF_data_bionet2.xlsx'].keys()

dict_keys(['Site', 'Fire', 'Structure', 'Floristics', 'Reference', 'Info', 'Sheet1'])

In [19]:
# First 11 values of row 1 (element [0] of the index entry) in the 'Fire' worksheet
wbindex['SthnNSWRF_data_bionet2.xlsx']['Fire'][0][0:11]

['Site',
 'Replicate',
 'Date of last fire dd/mm/yyyy',
 'Date of penultimate fire',
 'Date of earlier fire',
 'How date inferred1',
 'How date inferred2',
 'How date inferred3',
 'Ignition cause1',
 'Ignition cause2',
 'Ignition cause3']

In [21]:
# Select the workbook and worksheet to import; `filename` and `worksheet`
# are reused by later cells.
filename='SthnNSWRF_data_bionet2.xlsx'
worksheet='Fire'
# data_only=True returns calculated cell values rather than formulas
wb = openpyxl.load_workbook(inputdir / filename,data_only=True)
ws = wb[worksheet]

In [74]:
# One column map per fire event stored on a row (last, penultimate, earlier):
# each maps a database field to the 0-based position of the cell in the row.
col_dicts=[{'site_label':0,'fire_date':2,'how_inferred':5,'cause_of_ignition':8},
    {'site_label':0,'fire_date':3,'how_inferred':6,'cause_of_ignition':9},
    {'site_label':0,'fire_date':4,'how_inferred':7,'cause_of_ignition':10}]

records = []
for column_map in col_dicts:
    # Data rows start at row 2 of the worksheet
    for row_number in range(2, ws.max_row + 1):
        row_cells = ws[row_number]
        record = {}
        comms = []
        for field, position in column_map.items():
            cell_value = row_cells[position].value
            if cell_value is None:
                continue
            if field != 'fire_date':
                record[field] = cell_value
            elif isinstance(cell_value, datetime):
                # A real date: store its text form and use it for both bounds
                record['fire_date'] = str(cell_value.date())
                record['earliest_date'] = cell_value.date()
                record['latest_date'] = cell_value.date()
            else:
                # Anything else is stored as text and flagged in the notes
                record['fire_date'] = str(cell_value)
                comms.append('Fire date given as: %s' % cell_value)
        if comms:
            record['notes'] = comms
        # Keep the record only when it holds more than a single field
        if len(record) > 1:
            records.append(record)



In [75]:
# Inspect one of the generated records
records[10]

{'site_label': 'DeuaRF',
 'fire_date': '>1975-76',
 'how_inferred': 'NPWS fire records prescribed burn but unlikely to have reached site',
 'cause_of_ignition': 'prescribed',
 'notes': ['Fire date given as: >1975-76']}

In [79]:
# Write to the database; idx=None makes conflicting rows be skipped (ON CONFLICT DO NOTHING)
batch_upsert(params,"form.fire_history",records,keycol=('site_label','fire_date'), idx=None,execute=True)

Connecting to the PostgreSQL database...
10 rows updated
Database connection closed.


#### Create fire history records

This is a lower-level function that creates fire history records from an `item` (a row in the spreadsheet), using the list of column dictionaries or "switches" in `col_dicts`:

In [113]:
def create_fire_history_record(item,col_dicts):
    """Build fire history records from one worksheet row.

    Parameters
    ----------
    item : sequence of cells
        The row; each element exposes the cell content as ``.value``.
    col_dicts : list of dict
        One dict per record to extract, mapping a field name to the position
        of the cell in ``item``. Each dict must contain ``'site_label'``.

    Returns
    -------
    list of dict
        One record per column map that yields more than one field. Column
        maps whose site cell reads ``'Site'`` are ignored.
    """
    collected = []
    for column_map in col_dicts:
        # A site cell reading 'Site' is treated as a header: nothing to import
        if item[column_map['site_label']].value == 'Site':
            continue
        entry = {}
        remarks = []
        for field, position in column_map.items():
            cell_value = item[position].value
            if cell_value is None:
                continue
            if field != 'fire_date':
                entry[field] = cell_value
            elif isinstance(cell_value, datetime):
                # A real date: store its text form and use it for both bounds
                exact_day = cell_value.date()
                entry['fire_date'] = str(exact_day)
                entry['earliest_date'] = exact_day
                entry['latest_date'] = exact_day
            else:
                # Anything else is stored as text and flagged in the notes
                entry['fire_date'] = str(cell_value)
                remarks.append('Fire date given as: %s' % cell_value)
        if remarks:
            entry['notes'] = remarks
        # Keep the record only when it holds more than a single field
        if len(entry) > 1:
            collected.append(entry)
    return collected

In [114]:
def import_records_from_workbook(filepath, workbook, worksheet, col_dictionary, create_record_function, **kwargs):
    """Collect records from every row of a worksheet, starting at row 2.

    Parameters
    ----------
    filepath : path-like supporting ``/``
        Directory containing the workbook.
    workbook : str
        File name of the workbook.
    worksheet : str
        Title of the worksheet to read.
    col_dictionary : object
        Column mapping passed unchanged to ``create_record_function``.
    create_record_function : callable
        Called as ``create_record_function(row, col_dictionary, **kwargs)``.
        A returned list is merged into the result, a returned dict is added
        as a single record, and any other return value is ignored.
    **kwargs
        Extra keyword arguments forwarded to ``create_record_function``.

    Returns
    -------
    list
        All records gathered from the worksheet, in row order.
    """
    # data_only=True returns calculated cell values rather than formulas
    sheet = openpyxl.load_workbook(filepath / workbook, data_only=True)[worksheet]
    collected = []
    # Row 1 is never passed to the callback (presumably the header row)
    for row_number in range(2, sheet.max_row + 1):
        outcome = create_record_function(sheet[row_number], col_dictionary, **kwargs)
        if type(outcome) is list:
            collected.extend(outcome)
        elif type(outcome) is dict:
            collected.append(outcome)
    return collected


In [115]:
# One column map per fire event on a row (0-based cell positions), then the same
# extraction as above done through the reusable helpers. `filename` and
# `worksheet` come from an earlier cell.
col_dicts=[{'site_label':0,'fire_date':2,'how_inferred':5,'cause_of_ignition':8},
    {'site_label':0,'fire_date':3,'how_inferred':6,'cause_of_ignition':9},
    {'site_label':0,'fire_date':4,'how_inferred':7,'cause_of_ignition':10}]
records = import_records_from_workbook(inputdir, filename, worksheet, col_dicts, create_record_function=create_fire_history_record)

In [116]:
# Number of records extracted from the worksheet
len(records)

66

In [90]:
# Write to the database; idx=None makes conflicting rows be skipped (ON CONFLICT DO NOTHING)
batch_upsert(params,"form.fire_history",records,keycol=('site_label','fire_date'), idx=None,execute=True)

Connecting to the PostgreSQL database...
0 rows updated
Database connection closed.


In [94]:
# Switch to the next workbook and check the first 11 header cells of the same worksheet
filename='UNSWFireVegResponse_UplandBasalt_AlexThomsen+DK.xlsx'
wbindex[filename][worksheet][0][0:11]

['Site',
 'Replicate',
 'Date of last fire dd/mm/yyyy',
 'Date of penultimate fire',
 'Date of earlier fire',
 'How date inferred1',
 'How date inferred2',
 'How date inferred3',
 'Ignition cause1',
 'Ignition cause2',
 'Ignition cause3']

In [117]:
# The headers match the previous workbook, so the same column maps apply
col_dicts=[{'site_label':0,'fire_date':2,'how_inferred':5,'cause_of_ignition':8},
    {'site_label':0,'fire_date':3,'how_inferred':6,'cause_of_ignition':9},
    {'site_label':0,'fire_date':4,'how_inferred':7,'cause_of_ignition':10}]
records = import_records_from_workbook(inputdir, filename, worksheet, col_dicts, create_record_function=create_fire_history_record)

In [118]:
# Number of records extracted from the worksheet
len(records)

66

In [119]:
# Preview the first three records
records[0:3]

[{'site_label': 'CRC09B7UVH',
  'fire_date': '2020-02-10',
  'earliest_date': datetime.date(2020, 2, 10),
  'latest_date': datetime.date(2020, 2, 10),
  'how_inferred': 'NPWS',
  'cause_of_ignition': 'lightning'},
 {'site_label': 'CRC09B7UVH',
  'fire_date': '2006-11-24',
  'earliest_date': datetime.date(2006, 11, 24),
  'latest_date': datetime.date(2006, 11, 24),
  'how_inferred': 'NPWS',
  'cause_of_ignition': 'Wildfire'},
 {'site_label': 'CRC09B7UVH',
  'fire_date': '1986',
  'how_inferred': 'NPWS',
  'cause_of_ignition': 'Wildfire',
  'notes': ['Fire date given as: 1986']}]

In [121]:
# Write to the database; idx=None makes conflicting rows be skipped (ON CONFLICT DO NOTHING)
batch_upsert(params,"form.fire_history",records,keycol=('site_label','fire_date'), idx=None,execute=True)

Connecting to the PostgreSQL database...
0 rows updated
Database connection closed.


In [122]:
# Switch to the next workbook and check the first 11 header cells of the same worksheet
filename='UNSW_VegFireResponse_RMK_reformat_Sep2021a.xlsx'
wbindex[filename][worksheet][0][0:11]

['Site',
 'Replicate',
 'Date of last fire dd/mm/yyyy',
 'Date of penultimate fire',
 'Date of earlier fire',
 'How date inferred1',
 'How date inferred2',
 'How date inferred3',
 'Ignition cause1',
 'Ignition cause2',
 'Ignition cause3']

In [123]:
# The headers match the previous workbooks, so the same column maps apply
col_dicts=[{'site_label':0,'fire_date':2,'how_inferred':5,'cause_of_ignition':8},
    {'site_label':0,'fire_date':3,'how_inferred':6,'cause_of_ignition':9},
    {'site_label':0,'fire_date':4,'how_inferred':7,'cause_of_ignition':10}]
records = import_records_from_workbook(inputdir, filename, worksheet, col_dicts, create_record_function=create_fire_history_record)
# NOTE: the recorded run of this call stopped with a ForeignKeyViolation because
# site_label 'BC_2_UNSW' is not present in table "field_site".
batch_upsert(params,"form.fire_history",records,keycol=('site_label','fire_date'), idx=None,execute=True)

Connecting to the PostgreSQL database...


ForeignKeyViolation: insert or update on table "fire_history" violates foreign key constraint "fire_history_site_label_fkey"
DETAIL:  Key (site_label)=(BC_2_UNSW) is not present in table "field_site".


In [131]:
# Insert only the first 48 records: records[48:] all belong to site 'BC_2_UNSW',
# which the foreign key on site_label rejects.
batch_upsert(params,"form.fire_history",records[0:48],keycol=('site_label','fire_date'), idx=None,execute=True)

Connecting to the PostgreSQL database...
8 rows updated
Database connection closed.


In [134]:
records[48:] # records left out of the insert: their site is missing from the field_site table

[{'site_label': 'BC_2_UNSW',
  'fire_date': 'Jan. 2020',
  'how_inferred': 'NPWS',
  'cause_of_ignition': 'wildfire',
  'notes': ['Fire date given as: Jan. 2020']},
 {'site_label': 'BC_2_UNSW',
  'fire_date': '0',
  'how_inferred': '?',
  'cause_of_ignition': '?',
  'notes': ['Fire date given as: 0']},
 {'site_label': 'BC_2_UNSW',
  'fire_date': '?',
  'how_inferred': '?',
  'cause_of_ignition': 'N/A',
  'notes': ['Fire date given as: ?']}]