# Read NSWFRD 2014
Read the spreadsheet from NSW Flora Fire response database and extract hyperlinks that point to references.
We will use the openpyxl library

In [2]:
import openpyxl
from pathlib import Path
import os

In [3]:
repodir = Path("../") 
inputdir = repodir / "data/"

## Open the workbook and read main spreadsheet
Here we will load the workbook:

In [4]:
wb = openpyxl.load_workbook(inputdir / "NSWFFRDv2.1.xlsx")

Use the sheet name to read data

In [5]:
ws = wb['SpeciesData']

Let's look at all the values in the second column (Species code)

In [6]:
ws['B2'].value

'Species Code'

We want to count the number of unique values:

In [7]:
row_count = ws.max_row
column_count = ws.max_column
j=2
unique_list = list()
unique_items = 0
for i in range(1, row_count + 1):
    item = ws.cell(row=i, column=j).value
    if item not in unique_list and item is not None:
        unique_list.append(item)
        unique_items += 1
print(unique_items)

3000


Which columns include information on traits? this will print out the names in the second row...:

In [8]:
for j in range(1, column_count + 1):
    print(ws.cell(row=2, column=j).value)


Current Scientific Name
Species Code
Legal Status
Exotic
2010 Update
Notes on Name / Synonym as used in source reference
Family
Group
Life form
Fireresponse
Comments on regeneration
Resprout location
Seed storage
Seed dispersal mechanism
Seed dispersal distance
Seed weight / size
Seed viability
Dormancy
Germination cue
Fecundity
Seed predation
Post-fire recruitment
Establishment
Post-fire flowering
Flowering time
Primary juvenile period
Secondary juvenile period
Seed set
Seed-bank developed
Fire tolerance
Life span
Seed-bank longevity
"Maturity" (from source)
"Extinction" (from source)
"Rec. min fire interval" (from source)
"Rec. max fire interval" (from source)
NC
CC
SC
NT
CT
ST
NWS
CWS
SWS
NWP
SWP
NFWP
SFWP
Distribution: extra NSW
Vegetation
Rainforest
Wet Sclerophyll Forest (Shrubby)
Wet Sclerophyll Forest (Grassy)
Grassy Woodland
Grassland
Dry Sclerophyll Forest (Shrub/Grass)
Dry Sclerophyll Forest (Shrubby)
Heathland
Alpine Complex
Freshwater Wetland
Forested Wetlands
Saline Wetla

## Dealing with hyperlinks

The cell Q6 has a hyperlink. We can use cell rows and columns or cell name:

In [8]:
type(ws.cell(row=6, column=17).hyperlink)
# same as 
type(ws['Q6'].hyperlink)

openpyxl.worksheet.hyperlink.Hyperlink

If the cell is a hyperlink it will have a value to "display" and will point to a "location" within the workbook: 

In [9]:
ws.cell(row=6, column=17).hyperlink.display

'viability average-very good'

In [10]:
# This will fail if there is no hyperlink 
print(ws.cell(row=6, column=17).hyperlink.location)

References!C94


Let's see the value of this reference:

In [11]:
hlink = ws.cell(row=6, column=17).hyperlink.location
hlink = hlink.split("!")

This gives the name of the target sheet and the corresponding cell. We need to read the cell to its right side (add one to the column number) to get the information we need.

In [12]:
ref = wb[hlink[0]]
print("Cell value is :: " + str(ref[hlink[1]].value))
nlink = ref.cell(row=ref[hlink[1]].row,column=ref[hlink[1]].col_idx + 1)

print("Reference data is :: " + nlink.value) 


Cell value is :: 93
Reference data is :: Mortlock, W. & Lloyd, MV (Eds) (2001) Floradata - A guide to collection, storage and propogation of Australian native plant seed. AUsttralian Centre for Mining Environmental Research, Brisbane; Australian National Botanic Gardens, CSIRO Forestry and Forest Products and Greening Australia Limited, Canberra. Searchable Database February 2001. a=survey data, b=test data


If there is no hyperlink, it will result in NoneType

In [13]:
type(ws.cell(row=5, column=17).hyperlink)

NoneType

In [14]:
type(ws.cell(row=5, column=17))


openpyxl.cell.cell.Cell

In [16]:
ws.cell(row=5, column=17)


<Cell 'SpeciesData'.Q5>

## Read data from a column
For a selected variable (column), we can query data for the list of species.


In [9]:
ws['Q6']

<Cell 'SpeciesData'.Q6>

In [10]:
ws.cell(row=2, column=17).value

'Seed viability'

In [11]:
ws.cell(row=6, column=17).value

'average-very good'

Example loop for querying values from one variable for all species in a range of cells:

In [12]:
# for i in 3:3088
# for j in 17
i=3
j=17
varname=ws.cell(row=2, column=j).value
print(varname)
for i in range(13,40):
    spname=ws.cell(row=i, column=1).value
    spcode=ws.cell(row=i, column=2).value
    varvalue=ws.cell(row=i, column=j).value
    varref=ws.cell(row=i, column=j).hyperlink
    if varvalue is not None:
        if varref is not None:
            print("%s: %s / %s / %s" % (spcode,spname,varvalue, varref.location))
        else:
            print("%s: %s / %s " % (spcode,spname,varvalue))


Seed viability
3710: Acacia baileyana / 0.96 / References!C94
3723: Acacia brownii / good / References!C94
3725: Acacia burbidgeae / 0.76 / References!C94
8242: Acacia burkittii / good 
3727: Acacia buxifolia / good / References!C94
3743: Acacia colletioides / 0.837 


## Read data for a row
We can now do the same for a single species (row) and query values of each variable in a range. For example:

In [13]:
# for j in 17
i=18
spname=ws.cell(row=i, column=1).value
spcode=ws.cell(row=i, column=2).value
print("%s: %s" %(spcode,spname))
    
for j in range(3,30):
    varname=ws.cell(row=2, column=j).value
    varvalue=ws.cell(row=i, column=j).value
    varref=ws.cell(row=i, column=j).hyperlink
    if varvalue is not None:
        if varref is not None:
            print("%s: %s / %s / %s" % (j,varname,varvalue, varref.location))
        else:
            print("%s: %s / %s " % (j,varname,varvalue))


3716: Acacia binervata
7: Family / Fabaceae: Mimosoideae 
8: Group / D 
9: Life form / T 
10: Fireresponse / Sr 
11: Comments on regeneration / Resprouting form in northern tablelands (134) 
12: Resprout location / basal buds 
13: Seed storage / persistent soil 
14: Seed dispersal mechanism / a-ant / References!C56
18: Dormancy / hard seed coat 
19: Germination cue / heat / References!C94
22: Post-fire recruitment / prolific / References!C106
23: Establishment / I / References!A94
25: Flowering time / Aug-Nov 
26: Primary juvenile period / 5 / References!C36
27: Secondary juvenile period / References!C36 / References!C36
28: Seed set / 6 / References!C36
29: Seed-bank developed / 10 / References!C36


## Search for a species code
Here we try to locate the species code and then return the values for that row:

In [14]:
for cell in ws['A']:
    if(cell.value is not None): #We need to check that the cell is not empty.
        if 'Actinotus helianthi' in cell.value: #Check if the value of the cell contains the text 'Table'
            print('Found header with name: {} at row: {} and column: {}. In cell {}'.format(cell.value,cell.row,cell.column,cell))

Found header with name: Actinotus helianthi at row: 179 and column: 1. In cell <Cell 'SpeciesData'.A179>


In [15]:
# for j in 17
i=17
spname=ws.cell(row=i, column=1).value
spcode=ws.cell(row=i, column=2).value
print("%s: %s" %(spcode,spname))
    
for j in range(3,50):
    varname=ws.cell(row=2, column=j).value
    varvalue=ws.cell(row=i, column=j).value
    varref=ws.cell(row=i, column=j).hyperlink
    if varvalue is not None:
        if varref is not None:
            print("%s: %s / %s / %s" % (j,varname,varvalue, varref.location))
        else:
            print("%s: %s / %s " % (j,varname,varvalue))


3715: Acacia betchei
7: Family / Fabaceae: Mimosoideae 
8: Group / D 
9: Life form / S 
10: Fireresponse / S 
13: Seed storage / persistent soil 
18: Dormancy / hard seed coat 
19: Germination cue / scarification / References!C94
25: Flowering time / Nov-Feb 
37: NC / - 
38: CC / - 
39: SC / - 
40: NT / 1 
41: CT / - 
42: ST / - 
43: NWS / 1 
44: CWS / - 
45: SWS / - 
46: NWP / - 
47: SWP / - 
48: NFWP / - 
49: SFWP / - 


## Populate the 'blue table'
Select one column, translate the values to the accepted range and update entries into the table

In [16]:
from configparser import ConfigParser

filename = repodir / 'secrets' / 'database.ini'
section = 'aws-lght-sl'

# create a parser
parser = ConfigParser()
# read config file
parser.read(filename)

# get section, default to postgresql
db = {}
if parser.has_section(section):
    params = parser.items(section)
    for param in params:
        db[param[0]] = param[1]
else:
    raise Exception('Section {0} not found in the {1} file'.format(section, filename))

In [17]:
import psycopg2
params = db

In [23]:
# for i in 3:3088
# for j in 10
row_count = ws.max_row
i=3
j=10
varname=ws.cell(row=2, column=j).value
print(varname)

switcher={
    "S": "none",
    "Sr": "few",
    "S/R": "half",
    "Rs": "most",
    "R": "all"
}

# connect to the PostgreSQL server
print('Connecting to the PostgreSQL database...')
conn = psycopg2.connect(**params)


qrystr= "INSERT INTO litrev.traits(species,species_code,resprouting) values('%s',%s,'%s') ON CONFLICT (species,species_code) DO UPDATE SET resprouting=EXCLUDED.resprouting"
for i in range(3,row_count):
    spname=ws.cell(row=i, column=1).value
    spcode=ws.cell(row=i, column=2).value
    varvalue=ws.cell(row=i, column=j).value
    varref=ws.cell(row=i, column=j).hyperlink
    if varvalue is not None:
        transvalue=switcher.get(varvalue, "unknown")
        if (isinstance(spcode, str) and spcode.isnumeric()) or isinstance(spcode,int):
            cur = conn.cursor()
            cur.execute(qrystr % (spname,spcode,transvalue))
            updated_rows = cur.rowcount
            if updated_rows > 0:
                if varref is not None:
                    print("%s: %s / %s / %s" % (spcode,spname,transvalue, varref.location))
                else:
                    print("%s: %s / %s " % (spcode,spname,transvalue))
                print("%s rows updated" % (updated_rows))
            conn.commit()
            cur.close()

if conn is not None:
    conn.close()
    print('Database connection closed.')

        

Fireresponse
Connecting to the PostgreSQL database...
7184: Abutilon otocarpum / none 
1 rows updated
3632: Abutilon oxycarpum / none 
1 rows updated
3698: Acacia acanthoclada / none 
1 rows updated
3699: Acacia acinacea / none 
1 rows updated
3700: Acacia aculeatissima / none 
1 rows updated
3701: Acacia adunca / none 
1 rows updated
3702: Acacia alpina / none 
1 rows updated
3705: Acacia aneura / few 
1 rows updated
7581: Acacia aulacocarpa / all 
1 rows updated
3708: Acacia ausfeldii / none 
1 rows updated
3710: Acacia baileyana / none 
1 rows updated
3712: Acacia barringtonensis / all 
1 rows updated
7060: Acacia baueri subsp. baueri / none 
1 rows updated
11039: Acacia beadleana / all 
1 rows updated
3715: Acacia betchei / none 
1 rows updated
3716: Acacia binervata / few 
1 rows updated
3717: Acacia binervia / none 
1 rows updated
10788: Acacia blakei subsp. diphylla / none 
1 rows updated
10788: Acacia blakei subsp. diphylla / none 
1 rows updated
8381: Acacia blayana / none 
1 

SyntaxError: syntax error at or near "helmsii"
LINE 1: ...ecies_code,resprouting) values('Senna form taxon 'helmsii'',...
                                                             ^


In [35]:
int(spcode)

6354

In [36]:
j=12
varname=ws.cell(row=2, column=j).value
print(varname)


switcher={
    "epicormic": "epicormic",
    "stem buds": "epicormic",
    "apical": "apical",
    "lignotuber": "lignotuber", 
    "rootstock": "lignotuber", 
    "root stock": "lignotuber",
    "basal": "basal",
    "basal buds": "basal",
    "coppice": "basal",
    "tuber": "tuber",
    "taproot": "tuber",
    "tussock": "tussock",
    "rhizome": "short rhizome",
    "root sucker": "long rhizome or root sucker",
    "root suckers": "long rhizome or root sucker",
    "rootsucker": "long rhizome or root sucker",
    "root buds": "long rhizome or root sucker",
    "stolon": "stolon",
    "stolons": "stolon"
}
# connect to the PostgreSQL server
print('Connecting to the PostgreSQL database...')
conn = psycopg2.connect(**params)
qrystr= "INSERT INTO litrev.traits(species,species_code,regenerative_organ) values(%s,%s,%s) ON CONFLICT (species,species_code) DO UPDATE SET regenerative_organ=EXCLUDED.regenerative_organ"
updated_rows=0
for i in range(3,row_count):
    spname=ws.cell(row=i, column=1).value
    spcode=ws.cell(row=i, column=2).value
    varvalue=ws.cell(row=i, column=j).value
    varref=ws.cell(row=i, column=j).hyperlink
    if varvalue is not None:
        transvalue=switcher.get(varvalue, "other")
        #if varref is not None:
        #    print("%s: %s / (%s) %s / %s" % (spcode,spname,varvalue,transvalue, varref.location))
        #else:
        #    print("%s: %s / (%s) %s " % (spcode,spname,varvalue,transvalue))
        if spcode.isnumeric():
            cur = conn.cursor()
            cur.execute(qrystr, (spname,int(spcode),transvalue))
            updated_rows = updated_rows + cur.rowcount
        #    print("%s rows updated" % (updated_rows))
            conn.commit()
            cur.close()
print("Total of %s rows updated" % (updated_rows))
        
if conn is not None:
    conn.close()
    print('Database connection closed.')

        

Resprout location
Connecting to the PostgreSQL database...
Total of 1242 rows updated
Database connection closed.


In [39]:
j=13
varname=ws.cell(row=2, column=j).value
print(varname)


switcher={
    "canopy":"canopy",
    "canopy - transient":"canopy",
    "persistent":"soil-persistent",
    "peristent":"soil-persistent",
    "persistent soil":"soil-persistent", 
    "a-persistent soil":"soil-persistent", 
    "soil":"soil-persistent",
    "transient":"transient", 
    "none":"transient",
    "shed at maturity":"transient", 
    "viviparous":"transient",
    "canopy / released at maturity":"transient",
    "canopy / regularly without fire":"transient",
    "not canopy":"non-canopy",
}

# connect to the PostgreSQL server
print('Connecting to the PostgreSQL database...')
conn = psycopg2.connect(**params)
qrystr= "INSERT INTO litrev.traits(species,species_code,seedbank_type) values(%s,%s,%s) ON CONFLICT (species,species_code) DO UPDATE SET seedbank_type=EXCLUDED.seedbank_type"
updated_rows = 0

for i in range(1300,2800):
    spname=ws.cell(row=i, column=1).value
    spcode=ws.cell(row=i, column=2).value
    varvalue=ws.cell(row=i, column=j).value
    varref=ws.cell(row=i, column=j).hyperlink
    if varvalue is not None:
        transvalue=switcher.get(varvalue, "other")
        #if varref is not None:
        #    print("%s: %s / (%s) %s / %s" % (spcode,spname,varvalue,transvalue, varref.location))
        #else:
        #    print("%s: %s / (%s) %s " % (spcode,spname,varvalue,transvalue))
        if spcode.isnumeric():
            cur = conn.cursor()
            cur.execute(qrystr, (spname,int(spcode),transvalue))
            updated_rows = updated_rows + cur.rowcount
            conn.commit()
            cur.close()
print("Total of %s rows updated" % (updated_rows))


if conn is not None:
    conn.close()
    print('Database connection closed.')
            

Seed storage
Connecting to the PostgreSQL database...
Total of 2660 rows updated
Database connection closed.


In [42]:
j=22
varname=ws.cell(row=2, column=j).value
print(varname)

switcher={
    "prolific":"abundant",
    "abundant":"abundant",
    "common":"abundant",
    "high":"abundant",
    "substantial":"abundant",
    "vigourous":"abundant",
    "1-4 seedlings/m2":"abundant",
    "no":"absent",
    "none observed":"absent"
}

# connect to the PostgreSQL server
print('Connecting to the PostgreSQL database...')
conn = psycopg2.connect(**params)
qrystr= "INSERT INTO litrev.traits(species,species_code,postfire_seedling_recruitment) values(%s,%s,%s) ON CONFLICT (species,species_code) DO UPDATE SET postfire_seedling_recruitment=EXCLUDED.postfire_seedling_recruitment"
cur = conn.cursor()
updated_rows = 0
for i in range(3,3000):
    spname=ws.cell(row=i, column=1).value
    spcode=ws.cell(row=i, column=2).value
    varvalue=ws.cell(row=i, column=j).value
    varref=ws.cell(row=i, column=j).hyperlink
    if varvalue is not None:
        transvalue=switcher.get(varvalue, "present")
        #if varref is not None:
        #    print("%s: %s / (%s) %s / %s" % (spcode,spname,varvalue,transvalue, varref.location))
        #else:
        #    print("%s: %s / (%s) %s " % (spcode,spname,varvalue,transvalue))
        if spcode.isnumeric():
            cur.execute(qrystr, (spname,int(spcode),transvalue))
            updated_rows = updated_rows + cur.rowcount
            
conn.commit()
cur.close()            
print("Total of %s rows updated" % (updated_rows))

if conn is not None:
    conn.close()
    print('Database connection closed.')
        

Post-fire recruitment
Connecting to the PostgreSQL database...
Total of 590 rows updated
Database connection closed.
