In [None]:
import numpy as np
import pandas as pd
import pyodbc
import csv
import pyworms

# DEBUG: raise pandas' console display limits so that long / wide frames
# are printed without being truncated.
for _option, _limit in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _limit)

# Silence every UserWarning, whatever its message text.
from warnings import filterwarnings
filterwarnings(action="ignore", message='.*', category=UserWarning)

# Output file path (directory, with trailing separator, that all products are written to)
outdir = "D:\\00-GCOOS\\00-MBON\\CAGES\\FL\\data\\"

# THE ORIGINAL CAGES DATABASE FILE is served via WAF but cannot be queried from there
# -> download to local HD
# Remote copy: https://gcoos4.geos.tamu.edu/WAF/MBON/CAGES/CAGES.accdb
# NOTE: every backslash is escaped. The previous literal contained a bare "\C",
# which is an invalid escape sequence that newer Python versions warn about.
db_file = "D:\\00-GCOOS\\00-MBON\\CAGES_ORIG\\CAGES.accdb"
user = 'user'
password = 'pw'
# Open the database connection through the Microsoft Access ODBC driver.
# The doubled braces in the template are literal braces around the driver name.
cnxn = pyodbc.connect(
    'DRIVER={{Microsoft Access Driver (*.mdb, *.accdb)}};DBQ={};Uid={};Pwd={};'.format(db_file, user, password)
)

### GET DATA TABLES

# Get joined Stations and Physical tables
# --
# One row per station, extended with the physical (tow) attributes where a
# matching "Florida Physical" record exists (LEFT JOIN on Station Code).
query = (
    'SELECT st.*, ph.[Start Depth], ph.[End Depth], ph.[Distance Towed], ph.[Secchi Depth], ph.[Soak Time] '
    'FROM "Florida Stations Fixed" AS st LEFT JOIN "Florida Physical" AS ph ON '
    'st.[Station Code] = ph.[Station Code]'
)
st_ph = pd.read_sql(query, cnxn)
# Soak time got converted to datetime with year and all... Source just had hours:minutes.
# -> First keep only the lines where a soak time is given
st_ph = st_ph.loc[st_ph['Soak Time'].notna()].copy()
# -> Express it as whole minutes. The hour component is folded in (hours * 60)
#    so that a soak time of an hour or more is not silently truncated; for the
#    usual case of zero hours this is just the minute value.
soak_time = st_ph.pop('Soak Time')  # pop() also removes the raw column
st_ph['TowTime'] = (soak_time.dt.hour * 60 + soak_time.dt.minute).astype('Int64')

# Get Hydrological data
# --
query = 'SELECT * FROM "Florida Hydrological"'
hydro = pd.read_sql(query, cnxn)

# Get lowest depth (surface) and deepest depth (bottom) values per station.
# The query has no ORDER BY, so the row order is not guaranteed: sort by depth
# explicitly (stable, so equal depths keep their source order) before taking
# the first / last value of each group.
# NOTE: groupby first()/last() return the first/last NON-NULL value of each
# column independently.
hydro_value_cols = ['Depth', 'Temperature', 'Conductivity', 'pH', 'Salinity', 'DO']
hydro_by_station = hydro.sort_values('Depth', kind='stable').groupby('Station Code')[hydro_value_cols]

# Suffix the value columns for merging: '1' = lowest depth / surface,
# '2' = deepest depth / bottom. 'Station Code' is the group key (index) at this
# point, so it is not renamed.
hydro_surface = hydro_by_station.first().add_suffix('1').reset_index()
hydro_bottom = hydro_by_station.last().add_suffix('2').reset_index()

# MERGE surface and bottom values ...
merged_hydro = pd.merge(hydro_surface, hydro_bottom, on=['Station Code'])

# Merge the hydro data to the stations + physical data
# (left join: stations without hydrological records are kept)
st_ph = pd.merge(st_ph, merged_hydro, on=['Station Code'], how='left')
# Harmonise the key name used by the later merge with the CPUE table
st_ph.rename(columns={'Bay code': 'Bay Code'}, inplace=True)

# Get joined CPUE and Species tables
query = (
    'SELECT cp.*, sp.[Scientific Name], sp.[Common Name] '
    'FROM "Florida CPUE Fixed" AS cp LEFT JOIN "Florida Species" AS sp ON '
    'cp.[Species Code] = sp.[Species Code]'
)
cpue_specs = pd.read_sql(query, cnxn)

# Drop the rows where the scientific name is not known (missing in the source files)
cpue_specs = cpue_specs.loc[cpue_specs['Scientific Name'].notna()].copy()
# Replace 'Scientific Name' by a plain-string 'ScientificName' column
cpue_specs['ScientificName'] = cpue_specs.pop('Scientific Name').astype('str')
# Convert Species Code from float to (nullable) integer.
# Assign the column directly: `frame.loc[:, col] = ...` writes the values into
# the EXISTING column and, in recent pandas versions, keeps its old dtype, so
# the conversion would have no effect.
cpue_specs['Species Code'] = cpue_specs['Species Code'].astype('Int64')

# merge cpue_specs and stations + physical + hydrological
cpue_specs = pd.merge(cpue_specs, st_ph, on=['Station Code', 'YYYY', 'MM', 'DD', 'Bay Code'], how='left')

# Trawls and Lengths
# --
query = (
    'SELECT tr.[Station Code], tr.[Species Code], tr.[Number] AS TotalNumber, '
    'len.[Number] AS NumberMeasured, len.[Length] '
    'FROM "Florida Trawls" AS tr LEFT JOIN "Florida Lengths" AS len '
    'ON tr.[Station Code] = len.[Station Code] AND tr.[Species Code] = len.[Species]'
)
tr_len = pd.read_sql(query, cnxn)

# Same nullable-integer conversion (and same direct-assignment reasoning) for
# the trawl/length columns; 'Species Code' must match the dtype used above
# because it is a merge key below.
intcols = ['Species Code', 'NumberMeasured', 'Length']
tr_len = tr_len.astype(dict.fromkeys(intcols, 'Int64'))

# MERGE cpue_specs and tr_len
df = pd.merge(cpue_specs, tr_len, on=['Station Code', 'Species Code'], how='left')


# Get rid of spaces in column names
df.rename(columns=lambda name: name.replace(" ", ""), inplace=True)

# Convert StationCode to pandas' nullable integer dtype.
# Assign the column directly: `df.loc[:, col] = ...` writes into the existing
# column and, in recent pandas versions, leaves its previous dtype in place.
df['StationCode'] = df['StationCode'].astype('Int64')

# Generate Datetime field (no time of day in the source -> fixed to 12:00, UTC)
df['datestr'] = df['YYYY'].astype(str) + '-' + df['MM'].astype(str) + '-' + df['DD'].astype(str) + ' 12:00'
df['Datetime'] = pd.to_datetime(df['datestr'], utc=True)

# drop columns that are not needed for output
df.drop(['datestr', 'YYYY', 'MM', 'DD'], axis=1, inplace=True)

# sort by date (while Datetime is still a real datetime, not yet a string)
df.sort_values(['Datetime'], axis=0, ascending=True, inplace=True, ignore_index=True)
# Format datetime string
df['Datetime'] = df['Datetime'].dt.strftime('%Y-%m-%dT%H:%MZ')

# Write merged data out to a .csv file
# cols_out fixes both the selection and the order of the output columns.
cols_out = [
    'Datetime', 'Latitude', 'Longitude', 'StationCode', 'BayCode', 'SpeciesCode', 'cpue',
    'ScientificName', 'CommonName', 'ReferenceCode', 'TotalNumber', 'NumberMeasured', 'Length',
    'StartDepth', 'EndDepth', 'DistanceTowed', 'TowTime', 'SecchiDepth',
    'Depth1', 'Temperature1', 'Conductivity1', 'pH1', 'Salinity1', 'DO1',
    'Depth2', 'Temperature2', 'Conductivity2', 'pH2', 'Salinity2', 'DO2',
]
df[cols_out].to_csv(outdir + "CAGES_CPUE_FL.csv", encoding='utf-8', index=False)

# Close the database connection
cnxn.close()

In [None]:
### Generate ERDDAP datasets .XML -snippet using templates according to variable type

## Paths
# output
xml_output = outdir + "CAGES_FL_XML.txt"
# template files
xml_template_dir = "D:\\00-GCOOS\\00-MBON\\CAGES\\erdxml\\"
xml_header_temp = xml_template_dir + "cages_header_template_FL.txt"
xml_int_temp = xml_template_dir + "int_xml_template.txt"
xml_float_temp = xml_template_dir + "float_xml_template.txt"
xml_string_temp = xml_template_dir + "string_xml_template.txt"

# get types: {column name: dtype}
dt = df.dtypes.to_dict()
int_cols = []
float_cols = []
string_cols = []

# list types
# Classify with pandas' dtype predicates rather than by matching substrings of
# the dtype name, so that integer columns of any width (e.g. int32) and both
# the NumPy and the nullable extension dtypes are recognised.
# Columns of any other dtype end up in none of the three lists.
for akey, adtype in dt.items():
    if pd.api.types.is_integer_dtype(adtype):
        int_cols.append(akey)
    elif pd.api.types.is_float_dtype(adtype):
        float_cols.append(akey)
    elif pd.api.types.is_object_dtype(adtype) or pd.api.types.is_string_dtype(adtype):
        string_cols.append(akey)

# These belong to the header snippet (no need to generate separately):
header_vars = ['Datetime', 'Latitude', 'Longitude']

# START concatenating the template snippets together
# --

def _float_var_replacements(varname):
    """Return the placeholder substitutions for a float (physical) variable.

    The units, category and destination name are guessed from keywords in
    the column name. The result maps template placeholders to their values,
    in the order in which they are applied.

    For names that match no keyword only ``_DESTNAME_`` is mapped (to the
    column name itself).
    NOTE(review): in that case ``_UNITS_`` and ``_CATEGORY_`` are not
    substituted, so if the float template contains them they stay in the
    output as literal placeholders -- confirm this is intended.
    """
    lowered = varname.lower()

    if 'temperature' in lowered:
        if 'air' in lowered:
            destname = 'air_temperature'
        elif 'surface' in lowered:
            destname = 'sea_surface_temperature'
        elif 'bottom' in lowered:
            destname = 'sea_water_temperature_at_sea_floor'
        else:
            destname = 'sea_water_temperature'
        # 'degree_Celsius' (previously misspelled 'degree_Celcius')
        return {'_UNITS_': 'degree_Celsius', '_CATEGORY_': 'Temperature', '_DESTNAME_': destname}

    if 'turbidity' in lowered:
        return {'_UNITS_': 'NTU', '_CATEGORY_': 'Turbidity', '_DESTNAME_': 'sea_water_turbidity'}

    if 'salinity' in lowered:
        if 'surface' in lowered:
            destname = 'sea_surface_salinity'
        elif 'bottom' in lowered:
            destname = 'sea_water_salinity_at_sea_floor'
        else:
            destname = 'sea_water_salinity'
        return {'_UNITS_': 'PSU', '_CATEGORY_': 'Salinity', '_DESTNAME_': destname}

    if 'conductivity' in lowered:
        return {'_UNITS_': 'S m-1', '_CATEGORY_': 'Conductivity',
                '_DESTNAME_': 'sea_water_electrical_conductivity'}

    # 'DO' is matched case-sensitively against the original column name
    if 'DO' in varname or 'dissolved' in lowered or 'oxygen' in lowered:
        return {'_UNITS_': 'mg l-1', '_CATEGORY_': 'Dissolved O2',
                '_DESTNAME_': 'mass_concentration_of_oxygen_in_sea_water'}

    return {'_DESTNAME_': varname}


# Template text by path, read on first use so that each template file is
# opened at most once (and only if a column actually needs it).
_xml_templates = {}

# The output file is opened once and written front to back:
# header, one snippet per variable, closing tag.
with open(xml_output, "w") as output_file:
    # 1st, write the header to the output file
    with open(xml_header_temp, "r") as header_file:
        output_file.write(header_file.read())

    # Loop through the rest
    for acol in cols_out:
        # Header variables are already covered by the header template
        if acol in header_vars:
            continue

        # Pick the template according to the column type; only float columns
        # are treated as physical measurements with units. Anything that is
        # neither int nor float uses the string template.
        isPhysicalMeasurement = False
        if acol in int_cols:
            template_file = xml_int_temp
        elif acol in float_cols:
            template_file = xml_float_temp
            isPhysicalMeasurement = True
        else:
            template_file = xml_string_temp

        if template_file not in _xml_templates:
            with open(template_file, 'r') as template_handle:
                _xml_templates[template_file] = template_handle.read()

        # Fill in the variable name ...
        mod_contents = _xml_templates[template_file].replace('_VARNAME_', acol)

        # ... and try to figure out the units for some
        if isPhysicalMeasurement:
            for placeholder, value in _float_var_replacements(acol).items():
                mod_contents = mod_contents.replace(placeholder, value)

        output_file.write(mod_contents)

    ### Add the closing tag to the output xml file:
    output_file.write("</dataset>")