In [127]:
import openpyxl
from pathlib import Path
import os
from datetime import datetime
from configparser import ConfigParser
import psycopg2
from psycopg2.extras import DictCursor
from psycopg2.extensions import AsIs
import re
import pandas as pd
import copy
import numpy as np

In [159]:
# Locate the repository root relative to this notebook and load the database
# connection parameters from one section of secrets/database.ini.
repodir = Path("../../")

filename = repodir / 'secrets' / 'database.ini'
section = 'aws-lght-sl'

parser = ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed; check it so a wrong path is not reported as a missing section.
if not parser.read(filename):
    raise FileNotFoundError('Database configuration file {0} could not be read'.format(filename))
if not parser.has_section(section):
    raise Exception('Section {0} not found in the {1} file'.format(section, filename))

# Option name -> value for the chosen section; later cells pass this to
# psycopg2.connect(**dbparams).
dbparams = dict(parser.items(section))

In [181]:
# List the contents of the repository's data directory.
inputdir = repodir / "data/"
os.listdir(inputdir)
# NOTE(review): the line below is the only place in this notebook section where
# BioNET is defined, and it is commented out; match_spcode() reads BioNET, so it
# needs this table loaded first. The file name is not in the listing shown here.
#BioNET = pd.read_excel(inputdir / 'vis-survey-datasheet-6000.PowerQuery.20210708.xlsx')

['NSWmap_v3_key3.xlsx',
 'output-report',
 '.DS_Store',
 'austraits',
 'NSWFFRDv2.1.xlsx',
 'trait-method-vocabulary+DK.xlsx',
 'field-form',
 '.ipynb_checkpoints']

In [None]:

def match_spcode(row, bionet=None):
    """Look up the species code for one record in the BioNET names table.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'taxon_name' (current name) and 'original_name'
        (name as originally reported).
    bionet : pandas.DataFrame, optional
        Lookup table with 'scientificName' and 'speciesCode_Synonym' columns.
        Defaults to the notebook-level ``BioNET`` table.

    Returns
    -------
    dict
        Always contains 'species'. Contains 'original_notes' when the two
        names differ, and 'species_code' when exactly one table row matches
        and that row has a non-null code. The current name is tried first,
        then the original name.
    """
    taxa = BioNET if bionet is None else bionet

    def _unique_code(name):
        # Only an unambiguous (single-row) match with a non-null code counts.
        # The previous `Series is not None` test was always true, so null
        # codes were accepted and the original-name fallback never ran.
        hits = taxa.loc[taxa['scientificName'] == name, 'speciesCode_Synonym']
        if len(hits) != 1:
            return None
        code = hits.values[0]
        return None if pd.isna(code) else code

    spname = row['taxon_name']
    altname = row['original_name']
    result = {'species': spname}
    names_differ = altname != spname
    if names_differ:
        result['original_notes'] = ['original_name:', altname]

    spcode = _unique_code(spname)
    if spcode is None and names_differ:
        spcode = _unique_code(altname)
        if spcode is not None:
            result['original_notes'].append('original name used to match with BioNET names')

    if spcode is not None:
        result['species_code'] = spcode
    return result


## Seed dormancy data

### Add references

In [161]:
# Reference key -> citation text, inserted into litrev.ref_list below.
# The keys are reused later in the notebook to tag each record's original source.
refs={'Kenny 2003':'Kenny 2003 PhD Thesis UTS',
      'Hoyle 1998':'Hoyle 1998',
      'Copete etal 2011':'Copete etal 2011 Seed Science Research',
      '{Australian National Botanic Gardens} 2018':'ANBG website',
      'Seeds of South Australia website':'Seeds of South Australia website',
     'Baskin & Baskin 2014':'Baskin, C. and Baskin, J.M. (2014) Seeds: Ecology, Biogeography, and Evolution of Dormancy and Germination. Academic Press, San Diego, 150-162.',
     'Vening etal 2017':'Vening etal 2017 Aust J Bot',
     'Myerscough 1998':'Myerscough 1998 Cunninghamia',
     'Clarke et al 2000':'Clarke et al 2000',
     'Davila & Wardle 2002':'Davila & Wardle 2002 Aust J Bot'}

ins="""INSERT INTO litrev.ref_list(ref_code,ref_cite) values (%s,%s) ON CONFLICT DO NOTHING"""

print('Connecting to the PostgreSQL database...')
conn = psycopg2.connect(**dbparams)
affected_rows=0
try:
    # The cursor context manager closes the cursor; the finally clause closes
    # the connection even when an INSERT fails, so no connection is leaked.
    with conn.cursor() as cur:
        for ref_code, ref_cite in refs.items():
            cur.execute(ins,(ref_code,ref_cite))
            # rowcount is 0 when ON CONFLICT skipped an existing reference.
            affected_rows = affected_rows+cur.rowcount
    conn.commit()
    print("total number of lines updated: %s" % affected_rows)
finally:
    conn.close()
    print('Database connection closed.')


Connecting to the PostgreSQL database...
total number of lines updated: 0
Database connection closed.


### Import data from the `austraits.build` repo

In [162]:
# Local checkout of the austraits.build repository. The default is the path
# used when this notebook was written; set the AUSTRAITS_BUILD_DIR environment
# variable to run the notebook against a checkout somewhere else.
aus_repodir = Path(os.environ.get("AUSTRAITS_BUILD_DIR", "/Users/jferrer/proyectos/externos/austraits.build"))
inputdir = aus_repodir / "data" / "Ooi_2007" / "raw"
os.listdir(inputdir)

['Ooi_coordinates.csv', 'Ooi_seed weight_dorm type_All spp for AusTraits.csv']

In [163]:
# Read the raw Ooi_2007 seed table (one row per weighed seed).
ooi_csv = inputdir / 'Ooi_seed weight_dorm type_All spp for AusTraits.csv'
df = pd.read_csv(ooi_csv, encoding='windows-1252')

# Collapse every "Baskin & Baskin <digits>" mention to the single 2014 citation.
source_col = 'Source dormancy type'
df[source_col] = df[source_col].str.replace(
    'Baskin & Baskin ([0-9]+)', 'Baskin & Baskin 2014', regex=True
)
df

Unnamed: 0,FAMILY,SPECIES,SITE,DATE OF COLLECTION,DATE OF TRIAL,SEED NUMBER,SEED WEIGHT mg,DORM_TYPE,Source dormancy type
0,Fabaceae,Acacia binervata,Fredericktown,21/11/1978,2/02/1979,1,17.600,PY,
1,Fabaceae,Acacia binervata,Fredericktown,21/11/1978,2/02/1979,2,13.240,PY,
2,Fabaceae,Acacia binervata,Fredericktown,21/11/1978,2/02/1979,3,13.400,PY,
3,Fabaceae,Acacia binervata,Fredericktown,21/11/1978,2/02/1979,4,10.240,PY,
4,Fabaceae,Acacia binervata,Fredericktown,21/11/1978,2/02/1979,5,18.460,PY,
...,...,...,...,...,...,...,...,...,...
14742,Xanthorrhoeaceae,Xanthorrhoea media,Awaba,30/07/1978,22/08/1978,96,9.852,ND/PD,inferred Ooi expert knowledge
14743,Xanthorrhoeaceae,Xanthorrhoea media,Awaba,30/07/1978,22/08/1978,97,8.664,ND/PD,inferred Ooi expert knowledge
14744,Xanthorrhoeaceae,Xanthorrhoea media,Awaba,30/07/1978,22/08/1978,98,9.400,ND/PD,inferred Ooi expert knowledge
14745,Xanthorrhoeaceae,Xanthorrhoea media,Awaba,30/07/1978,22/08/1978,99,13.580,ND/PD,inferred Ooi expert knowledge


In [164]:
# Number of distinct SEED NUMBER values recorded for each SPECIES / DORM_TYPE pair.
df.groupby(['SPECIES','DORM_TYPE'], as_index=False).agg({'SEED NUMBER':pd.Series.nunique})

Unnamed: 0,SPECIES,DORM_TYPE,SEED NUMBER
0,Acacia binervata,PY,100
1,Acacia caesiella,PY,100
2,Acacia elata,PY,100
3,Acacia elongata,PY,100
4,Acacia falcata,PY,100
...,...,...,...
174,Trachymene incisa var incisa,MPD,100
175,Vachellia farnesiana,PY,100
176,Westringia fruticosa,PD,21
177,Woollsia pungens,PD,100


In [165]:
# Number of distinct species assigned to each dormancy type.
df.groupby(['DORM_TYPE'], as_index=False).agg({'SPECIES':pd.Series.nunique})

Unnamed: 0,DORM_TYPE,SPECIES
0,MPD,11
1,ND,119
2,ND/PD,3
3,PD,12
4,PY,34


In [166]:
# Number of distinct species for each free-text dormancy-type source description.
df.groupby(['Source dormancy type'], as_index=False).agg({'SPECIES':pd.Series.nunique})

Unnamed: 0,Source dormancy type,SPECIES
0,ANBG website,1
1,Baskin & Baskin 2014 inferred,1
2,Baskin & Baskin 2014 inferred from other studies,1
3,Seeds of South Australia website,3
4,Seeds of South Australia website/Baskin & Bask...,1
5,Vening etal 2017 Aust J Bot,1
6,inferred Myerscough 1998 Cunninghamia,3
7,inferred Ooi - needs after-ripening,1
8,inferred Ooi expert knowledge,6
9,inferred by Copete etal 2011 Seed Science Rese...,1


In [167]:
# One row per distinct (family, species, dormancy type, source description),
# plus the constant columns used when building database records.
df1 = (
    df[['FAMILY','SPECIES','DORM_TYPE','Source dormancy type']]
    .drop_duplicates()
    .assign(
        main_ref='Ooi et al. 2007',
        # np.nan, not np.NaN: the NaN alias was removed in NumPy 2.0.
        original_sources=np.nan,
        methods='Unspecified methods',
    )
    # Reference keys (strings) are written into this column afterwards; an
    # object column avoids pandas' incompatible-dtype upcast on assignment.
    .astype({'original_sources': object})
)



In [168]:
# Tag each row with the reference key that occurs in its free-text source
# description. Keys are matched as literal substrings (regex=False) because
# some contain regex metacharacters such as "{...}"; na=False makes rows
# without a source description simply not match.
source_text = df1['Source dormancy type']
for ref_code in refs:
    mentions_ref = source_text.str.contains(ref_code, regex=False, na=False)
    df1.loc[mentions_ref,'original_sources']=ref_code
# NOTE(review): a description mentioning several keys keeps only the last one
# in `refs` order. The data shows 'ANBG website', which is a citation text in
# `refs` rather than a key, so those rows are not tagged here — confirm intent.


In [169]:
# Phrase found in the source description -> method of estimation.
# Entries are applied in order, so a later, more specific phrase overrides an
# earlier one when both occur in the same description.
methods={'inferred':'Inferred from plant / organ / growth stage morphology',
         'related species':'Inferred from related taxa',
        'inferred from related':'Inferred from related taxa'}
source_text = df1['Source dormancy type']
for phrase, method in methods.items():
    # Literal substring match; rows with no source description never match
    # and keep the default method.
    mentions_phrase = source_text.str.contains(phrase, regex=False, na=False)
    df1.loc[mentions_phrase,'methods']=method


In [170]:
# Display df1 to inspect the original_sources and methods columns.
df1

Unnamed: 0,FAMILY,SPECIES,DORM_TYPE,Source dormancy type,main_ref,original_sources,methods
0,Fabaceae,Acacia binervata,PY,,Ooi et al. 2007,,Unspecified methods
100,Fabaceae,Acacia caesiella,PY,,Ooi et al. 2007,,Unspecified methods
200,Fabaceae,Acacia elata,PY,,Ooi et al. 2007,,Unspecified methods
300,Fabaceae,Acacia elongata,PY,,Ooi et al. 2007,,Unspecified methods
400,Fabaceae,Acacia falcata,PY,,Ooi et al. 2007,,Unspecified methods
...,...,...,...,...,...,...,...
14399,Apiaceae,Trachymene incisa var incisa,MPD,inferred from Davila & Wardle 2002 Aust J Bot,Ooi et al. 2007,Davila & Wardle 2002,Inferred from plant / organ / growth stage mor...
14499,Myrtaceae,Tristaniopsis laurina,,,Ooi et al. 2007,,Unspecified methods
14526,Lamiaceae,Westringia fruticosa,PD,inferred Ooi expert knowledge,Ooi et al. 2007,,Inferred from plant / organ / growth stage mor...
14547,Ericaceae,Woollsia pungens,PD,inferred Ooi expert knowledge,Ooi et al. 2007,,Inferred from plant / organ / growth stage mor...


In [182]:
def _is_missing(value):
    """Return True for None/NaN and for the "nan" placeholder left by fillna("nan")."""
    if isinstance(value, str):
        return value == "nan"
    return value is None or bool(pd.isna(value))


def create_record(row, norm_values=None):
    """Build the litrev records for one row of the Ooi et al. 2007 table.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'SPECIES', 'DORM_TYPE', 'Source dormancy type',
        'original_sources' and 'methods'. Missing fields may be real
        NaN/None or the string "nan"; both are treated the same way.
    norm_values : dict, optional
        Maps a raw dormancy code to its normalised value. Defaults to the
        notebook-level ``switcher`` dict.

    Returns
    -------
    list of dict
        One record per "/"-separated code in DORM_TYPE (e.g. "ND/PD" gives
        two). 'norm_value' is set only for codes found in the mapping.
    """
    lookup = switcher if norm_values is None else norm_values

    raw = row['DORM_TYPE']
    # A missing dormancy type is kept as the "nan" placeholder so the result is
    # the same whether or not the caller already ran fillna("nan").
    val = "nan" if _is_missing(raw) else str(raw)

    record={'main_source': 'Ooi Myerscough Auld 2007',
            'species':row['SPECIES'],
            'additional_notes': ['Direct match of categories',
                                'Automatic extraction with python script'],
            'raw_value': ['DORM_TYPE',val]}
    if not _is_missing(row['original_sources']):
        record['original_sources']=[row['original_sources'],]
    if not _is_missing(row['methods']):
        record['method_of_estimation']=row['methods']

    # Every imported record gets the same default weight.
    record['weight']=1
    record['weight_notes'] = ["python-script import","default of 1"]

    source_note = row['Source dormancy type']
    if not _is_missing(source_note):
        record['original_notes']=[source_note]

    records=[]
    for part in val.split("/"):
        code=part.strip(" ")
        # Deep copy so the per-code additions to raw_value do not leak between records.
        indrecord=copy.deepcopy(record)
        if code != val:
            # Record which part of a combined value this record stands for.
            indrecord['raw_value'].extend(['->',code])
        transvalue=lookup.get(code)
        if transvalue is not None:
            indrecord["norm_value"]=transvalue
        records.append(indrecord)
    return records

In [189]:
# create_record() tests fields against the literal string "nan", so every
# missing value is replaced with that placeholder first.
df1.fillna("nan", inplace=True)
target = df1  # disabled alternative input: ATtraits[ss]
reflist = []

# Raw dormancy code -> normalised value (an identity mapping for this dataset).
switcher = dict(ND="ND", PD="PD", MPD="MPD", PY="PY")

# Each row can yield several records (one per "/"-separated dormancy code).
records = []
for _, source_row in target.iterrows():
    records += create_record(source_row)

len(records)

194

In [190]:
# Spot-check one of the prepared records.
records[19]

{'main_source': 'Ooi Myerscough Auld 2007',
 'species': 'Angophora hispida',
 'additional_notes': ['Direct match of categories',
  'Automatic extraction with python script'],
 'raw_value': ['DORM_TYPE', 'ND'],
 'method_of_estimation': 'Unspecified methods',
 'weight': 1,
 'weight_notes': ['python-script import', 'default of 1'],
 'norm_value': 'ND'}

In [191]:
# Column list and value tuple are both built from each record's keys/values,
# so records with different optional fields can share one statement template.
insert_statement = 'insert into litrev.germ8 (%s) values %s ON CONFLICT DO NOTHING'

print('Connecting to the PostgreSQL database...')
conn = psycopg2.connect(**dbparams)
affected_rows=0
print("total of %s records prepared" % len(records))
try:
    with conn.cursor() as cur:
        for record in records:
            # AsIs injects the column names unquoted; they come from the
            # record dicts built in this notebook, not from external input.
            columns = AsIs(','.join(record.keys()))
            cur.execute(insert_statement, (columns, tuple(record.values())))
            affected_rows = affected_rows+cur.rowcount
    conn.commit()
    # Clear only after a successful commit, so a failed run keeps the prepared
    # records; clearing also stops a re-run from inserting them twice.
    records.clear()
    print("total number of lines updated: %s" % affected_rows)
finally:
    # Always release the connection, even when an INSERT or the commit fails.
    conn.close()
    print('Database connection closed.')

Connecting to the PostgreSQL database...
total of 194 records prepared
total number of lines updated: 194
Database connection closed.


In [192]:
# For every litrev.germ8 row with a species name, fetch the matching name and
# code from species.caps; the LEFT JOIN yields NULLs where no name matches.
qry = """
SELECT "scientificName","speciesCode_Synonym",species 
FROM litrev.germ8 g 
LEFT JOIN species.caps a 
ON g.species=a."scientificName" 
WHERE species IS NOT NULL;
"""
conn = psycopg2.connect(**dbparams)
try:
    with conn.cursor() as cur:
        cur.execute(qry)
        # Rows of (scientificName, speciesCode_Synonym, species), used by the next cell.
        res=cur.fetchall()
finally:
    # Close the connection even if the query fails.
    conn.close()
    print('Database connection closed.')

Database connection closed.


In [196]:
upd="UPDATE litrev.germ8 SET species_code=%s::int WHERE species=%s"

# `res` has one row per germ8 record, so the same species can appear many
# times. Each UPDATE already touches every row of a species, so one statement
# per species is enough; as before, the last code seen for a species wins.
code_by_species = {
    species: code
    for sci_name, code, species in res
    if sci_name is not None and sci_name == species
}

conn = psycopg2.connect(**dbparams)
try:
    with conn.cursor() as cur:
        for species, code in code_by_species.items():
            cur.execute(upd, (code, species))
    conn.commit()
finally:
    # Close the connection even if an UPDATE or the commit fails.
    conn.close()
    print('Database connection closed.')

Database connection closed.


## Victoria VA database

In [197]:
# Point inputdir at the raw White_2020 files in the austraits.build checkout
# and list them. This rebinds inputdir, which earlier pointed at other folders.
inputdir = aus_repodir / "data" / "White_2020" / "raw"
os.listdir(inputdir)

['White_2020_raw_data.csv',
 'taxa_without_substitutions.csv',
 'White_2020 custom R code.R',
 'White_2020_taxonomic_changes_part3.csv',
 'White_2020_taxonomic_changes_part1.csv',
 'White_2020_taxonomic_changes_part2_genera.csv']

In [199]:
# Load the raw White_2020 table.
# NOTE: this rebinds `df`, which previously held the Ooi_2007 seed data.
df = pd.read_csv(inputdir / 'White_2020_raw_data.csv', encoding='windows-1252')
df

Unnamed: 0,AttributeSppId,Vic_Sp_No,NSW_Sp_No,SA_Sp_No,Species_Name,Family,Genus,NonIndigenous,Mat<1,Mat1-5,...,LL_TRF,LL_TTG,MM_F,MM_S,MM_EUCT,MM_N_EUC,MM_TG,MM_HG,MM_GF,MM_GRAM
0,504703,4703.0,0,0.0,Lobelia gibbosa sensu Walsh,Campanulaceae,Lobelia,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,500001,1.0,0,0.0,Abrotanella nivigena,Asteraceae,Abrotanella,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-311,0.0,0,0.0,Abrotanella spp.,Asteraceae,Abrotanella,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,603625,0.0,0,0.0,Abutilon calliphyllum,Malvaceae,Abutilon,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,603626,,,,Abutilon cryptopetalum,Malvaceae,Abutilon,,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8632,702863,0.0,0,2863.0,Zygophyllum prismatothecum,Zygophyllaceae,Zygophyllum,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8633,710359,,,,Zygophyllum reticulatum,Zygophyllaceae,Zygophyllum,,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8634,504116,4116.0,0,0.0,Zygophyllum simile,Zygophyllaceae,Zygophyllum,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8635,507479,7479.0,0,0.0,Zygophyllum sp. aff. ammophilum,Zygophyllaceae,Zygophyllum,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [206]:
# List every column name of the White_2020 table.
df.columns.values.tolist()


['AttributeSppId',
 'Vic_Sp_No',
 'NSW_Sp_No',
 'SA_Sp_No',
 'Species_Name',
 'Family',
 'Genus',
 'NonIndigenous',
 'Mat<1',
 'Mat1-5',
 'Mat5-20',
 'Mat>20',
 'Long<1',
 'Long1-10',
 'Long10-50',
 'Long50-500',
 'Long>500',
 'Fire_not_rel',
 'Fire_K',
 'Fire_NK',
 'SeedS_ser',
 'SeedS_mat',
 'SeedS_grad',
 'Seed<2',
 'Seed>2',
 'Geophyte',
 'Proliferate',
 'Resprout',
 'Snow_K',
 'Snow_intol',
 'Snow_day',
 'Snow_week',
 'Snow_month',
 'Aquat',
 'WL K ',
 'WL_<1',
 'WL_1-6',
 'WL_>6',
 'Inun_not_confronted',
 'Inun_<1',
 'Inun_1-6',
 'Inun_>6_ni',
 'Inun_perm',
 'Median_palat',
 'C1',
 'C2',
 'C3',
 'D1',
 'D2',
 'D3',
 'G1',
 'G2',
 'G3',
 'M1',
 'M2',
 'M3',
 'R1',
 'R2',
 'R3',
 'S1',
 'S2',
 'S3',
 'B1',
 'B2',
 'Max_palat',
 'Poll_rare',
 'Poll_Abiotic',
 'Poll_invert',
 'Poll_Vert',
 'Disp_Pass',
 'Disp_Water',
 'Disp_Wind',
 'Disp_Mam(int)',
 'Disp_Mam(ext)',
 'Disp_Bird(int)',
 'Disp_Bird(ext)',
 'Disp_Rep',
 'Disp_Invert(ext)',
 'Disp_Vege',
 'Mesic_herb',
 'Inch_High',
 'Ph

In [210]:
# Distinct values present in the Resprout column (including missing values).
df['Resprout'].unique()

array([ 0.,  1., nan])