In [1]:
import numpy as np
import pandas as pd
import pyodbc
import csv
import pyworms

# DEBUG
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from warnings import filterwarnings
filterwarnings("ignore", category=UserWarning, message='.*')

# Output file path
outdir = "D:\\00-GCOOS\\00-MBON\\CAGES\\AL\\data\\merged\\"

# THE ORIGINAL CAGES DATABASE FILE is served via WAF but cannot be queried from there
# -> download to local HD
#db_file = 'https://gcoos4.geos.tamu.edu/WAF/MBON/CAGES/CAGES.accdb'
db_file = "D:\\00-GCOOS\\00-MBON\\CAGES_ORIG\CAGES.accdb"
user = 'user'
password = 'pw'
# open the database connection:
cnxn = pyodbc.connect('DRIVER={{Microsoft Access Driver (*.mdb, *.accdb)}};DBQ={};Uid={};Pwd={};'.format(db_file, user, password))

### GET DATA TABLES

# -- CPUE and Stations
query = 'SELECT * FROM "Alabama CPUE" AS cpue LEFT JOIN "Alabama Stations" AS stations ON \
cpue.[Station Code] = stations.[Station Code]'
cpuedata = pd.read_sql(query, cnxn)
# Remove duplicate columns
cpuedata = cpuedata.loc[:, ~cpuedata.columns.duplicated(keep='first')]

# -- TRAWLS and SPECIES
query = 'SELECT * FROM "Alabama Trawls" AS trawls LEFT JOIN "Alabama Species" AS species ON \
trawls.[Species Code] = species.[Species Code]'
trawlsdata = pd.read_sql(query, cnxn)

# Remove duplicate columns
trawlsdata = trawlsdata.loc[:, ~trawlsdata.columns.duplicated(keep='first')]

# Convert Species Code back to integer
intcols = ['Species Code']
for acol in intcols:
    #trawlsdata[acol] = trawlsdata[acol].astype('Int64')
    trawlsdata.loc[:, acol] = trawlsdata[acol].astype('Int64')
    cpuedata.loc[:, acol] = cpuedata[acol].astype('Int64')
# Drop outdated original data columns (up to date values will be extracted at DarwinCore alignment stage via the WoRMS API)
dropcols = ['Phylum','Class','Family','Common Name','Prior Name']
for acol in dropcols:
    trawlsdata.drop(acol, axis=1, inplace=True)

# Merge CPUE and Trawls data
df = pd.merge(cpuedata, trawlsdata, on=['Sample Code','Species Code'],how='left')

# -- Gear
query = 'SELECT * FROM "Alabama Gear"'
df = pd.merge(df, pd.read_sql(query, cnxn), on=['Gear Code'],how='left')

# -- Hydrological
query = 'SELECT * FROM "Alabama Hydrological"'
df = pd.merge(df, pd.read_sql(query, cnxn), on=['Sample Code'],how='left')

# -- Lengths
query = 'SELECT * FROM "Alabama Lengths"'
lengths = pd.read_sql(query, cnxn)
#df = pd.merge(df, pd.read_sql(query, cnxn), on=['Sample Code','Species Code'],how='left')
intcols = ['Species Code']
for acol in intcols:
    lengths.loc[:, acol] = lengths[acol].astype('Int64')

df = pd.merge(df, lengths, on=['Sample Code','Species Code'],how='left')


  trawlsdata.loc[:, acol] = trawlsdata[acol].astype('Int64')
  cpuedata.loc[:, acol] = cpuedata[acol].astype('Int64')
  lengths.loc[:, acol] = lengths[acol].astype('Int64')


In [2]:
# JATKA!!!

# Close the database connection
cnxn.close()

# Generate Datetime field
df['datestr'] = df['YYYY'].astype(str) + '-' + df['MM'].astype(str) + '-' + df['DD'].astype(str) + ' 12:00'
df['Datetime'] = pd.to_datetime(df['datestr'], utc=True)

# drop columns that are not needed for output
df.drop(['datestr', 'YYYY', 'MM', 'DD'], axis=1, inplace=True)

# sort by date
df.sort_values(['Datetime'], axis=0, ascending=True, inplace=True, ignore_index=True)
# Format datetime string
df['Datetime'] = df['Datetime'].dt.strftime('%Y-%m-%dT%H:%MZ')

# Get rid of spaces in column names
for acol in df.columns.to_list():
    df.rename({acol: acol.replace(" ","")}, axis="columns", inplace=True)

ParserError: day is out of range for month: 2004-2-30 12:00 present at position 1222

In [18]:
dftemp = df[df['MM'] == 2].copy()
baddates = dftemp[dftemp['DD'] > 28].copy()


In [24]:
baddates['Sample Code'].unique().tolist()

[9271, 9272]

In [17]:
dftemp[dftemp['YYYY'] == 1981]

Unnamed: 0,Sample Code,Station Code,YYYY,MM,DD,Species Code,cpue,Station,Description,Latitude,Longitude,Gear Code,Water Body,Measured,Total Number,Total Weight,Scientific Name,Commercial,Gear,Salinity,Temperature,DO,Length,datestr
0,5,21,1981,2,4,883544010200,0.0,21,Tall Range 'D';10/80 - 09/83,30.270,-88.15,1,Mississippi Sound,0,0,0,Cynoscion nebulosus,Y,Trawl (16 ft),22.0,9.0,9.4,,1981-2-4 12:00
1,5,21,1981,2,4,617701010100,0.0,21,Tall Range 'D';10/80 - 09/83,30.270,-88.15,1,Mississippi Sound,0,0,0,Farfantepenaeus aztecus,Y,Trawl (16 ft),22.0,9.0,9.4,,1981-2-4 12:00
2,5,21,1981,2,4,617701010200,0.0,21,Tall Range 'D';10/80 - 09/83,30.270,-88.15,1,Mississippi Sound,0,0,0,Farfantepenaeus duorarum,Y,Trawl (16 ft),22.0,9.0,9.4,,1981-2-4 12:00
3,5,21,1981,2,4,617701010300,0.0,21,Tall Range 'D';10/80 - 09/83,30.270,-88.15,1,Mississippi Sound,0,0,0,Litopenaeus setiferus,Y,Trawl (16 ft),22.0,9.0,9.4,,1981-2-4 12:00
4,5,21,1981,2,4,883601010100,0.0,21,Tall Range 'D';10/80 - 09/83,30.270,-88.15,1,Mississippi Sound,0,0,0,Mugil cephalus,Y,Trawl (16 ft),22.0,9.0,9.4,,1981-2-4 12:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1791,29,31,1981,2,20,617701010300,0.0,31,Arnica Bay;10/80 - present,30.305,-87.54,1,Arnica Bay,0,0,0,Litopenaeus setiferus,Y,Trawl (16 ft),15.0,13.0,7.2,,1981-2-20 12:00
1792,29,31,1981,2,20,883601010100,0.0,31,Arnica Bay;10/80 - present,30.305,-87.54,1,Arnica Bay,0,0,0,Mugil cephalus,Y,Trawl (16 ft),15.0,13.0,7.2,,1981-2-20 12:00
1793,29,31,1981,2,20,885703030400,0.0,31,Arnica Bay;10/80 - present,30.305,-87.54,1,Arnica Bay,0,0,0,Paralichthys lethostigma,N,Trawl (16 ft),15.0,13.0,7.2,,1981-2-20 12:00
1794,29,31,1981,2,20,883544080100,0.0,31,Arnica Bay;10/80 - present,30.305,-87.54,1,Arnica Bay,0,0,0,Pogonias cromis,N,Trawl (16 ft),15.0,13.0,7.2,,1981-2-20 12:00


In [None]:
cols_out = [ 'Datetime','Latitude','Longitude','SampleCode','StationCode','Description','Bay','SpeciesCode','ScientificName',
            'Phylum','Class','Order','Family','cpue','Measured','TotalNumber','TotalWeight','Length','Weight','Depth',
            'SurfaceSalinity','BottomSalinity','AverageSalinity','SurfaceTemperature','BottomTemperature','AverageTemperature',
            'SurfaceDO','BottomDO','AverageDO']

### OUTPUT TO FILE
# Write merged data out to a .csv file
df[cols_out].to_csv(outdir + "CAGES_CPUE_MS.csv", encoding='utf-8', index=False)

In [None]:
### Generate ERDDAP datasets .XML -snippet using templates according to variable type

## Paths 
# ouput
xml_output = "D:\\00-GCOOS\\00-MBON\\CAGES\\MS\\data\\debug\\" + "CAGES_MS_XML.txt"
# template files
xml_header_temp = "D:\\00-GCOOS\\00-MBON\\CAGES\\erdxml\\cages_erddap_xml\\" + "cages_header_template_MS.txt"
xml_int_temp = "D:\\00-GCOOS\\00-MBON\\CAGES\\erdxml\\cages_erddap_xml\\" + "int_xml_template.txt"
xml_float_temp = "D:\\00-GCOOS\\00-MBON\\CAGES\\erdxml\\cages_erddap_xml\\" + "float_xml_template.txt"
xml_string_temp = "D:\\00-GCOOS\\00-MBON\\CAGES\\erdxml\\cages_erddap_xml\\" + "string_xml_template.txt"

# get types
dt = df.dtypes.to_dict()
int_cols = []
float_cols = []
string_cols = []

# list types
for akey in dt.keys():
     if 'nt64' in str(dt[akey]):
        int_cols.append(akey)
     elif 'float' in str(dt[akey]):
         float_cols.append(akey)
     elif 'obj' in str(dt[akey]) or 'str' in str(dt[akey]):
         string_cols.append(akey)

# These belong to the header snippet (no need to generate separately):
header_vars = ['Datetime', 'Latitude', 'Longitude']

# START concatenating the tamplate snippets together
# --

# 1st, write the header to the output file
with open(xml_output, "w") as output_file:
    with open(xml_header_temp, "r") as file:
        output_file.write(file.read())

# Loop throgh the rest
for acol in cols_out:
    isPhysicalMeasurement = False
    if acol in header_vars:
        continue
    elif acol in int_cols:
        template_file = xml_int_temp
    elif acol in float_cols:
        template_file = xml_float_temp
        isPhysicalMeasurement = True
    else:
        template_file = xml_string_temp
    
    # Open the xml template file for reading
    with open(template_file, 'r') as tempfile:
        # Read the contents of the file
        contents = tempfile.read()
        # Modify the contents as needed
        mod_contents = contents.replace('_VARNAME_', acol)
        
        # Try to figure out the units for some
        if isPhysicalMeasurement:
            if 'temperature' in acol.lower():
                mod_contents = mod_contents.replace('_UNITS_', 'degree_Celcius')
                mod_contents = mod_contents.replace('_CATEGORY_', 'Temperature')
                if 'air' in acol.lower():
                    mod_contents = mod_contents.replace('_DESTNAME_', 'air_temperature')
                elif 'surface' in acol.lower():
                    mod_contents = mod_contents.replace('_DESTNAME_', 'sea_surface_temperature')
                elif 'bottom' in acol.lower():
                    mod_contents = mod_contents.replace('_DESTNAME_', 'sea_water_temperature_at_sea_floor')
                elif 'average' in acol.lower():
                    mod_contents = mod_contents.replace('_DESTNAME_', 'sea_water_temperature')
            elif 'turbidity' in acol.lower():
                mod_contents = mod_contents.replace('_UNITS_', 'NTU')
                mod_contents = mod_contents.replace('_CATEGORY_', 'Turbidity')
                mod_contents = mod_contents.replace('_DESTNAME_', 'sea_water_turbidity')
            elif 'salinity' in acol.lower():
                mod_contents = mod_contents.replace('_UNITS_', 'PSU')
                mod_contents = mod_contents.replace('_CATEGORY_', 'Salinity')
                if 'surface' in acol.lower():
                    mod_contents = mod_contents.replace('_DESTNAME_', 'sea_surface_salinity')
                elif 'bottom' in acol.lower():
                    mod_contents = mod_contents.replace('_DESTNAME_', 'sea_water_salinity_at_sea_floor')
                elif 'average' in acol.lower():
                    mod_contents = mod_contents.replace('_DESTNAME_', 'sea_water_salinity')
            elif 'DO' in acol or 'dissolved' in acol.lower() or 'oxygen' in acol.lower():
                mod_contents = mod_contents.replace('_UNITS_', 'mg l-1')
                mod_contents = mod_contents.replace('_CATEGORY_', 'Dissolved O2')
                mod_contents = mod_contents.replace('_DESTNAME_', acol)
            else:
                mod_contents = mod_contents.replace('_DESTNAME_',acol)

    # Open the ouput file for appending
    with open(xml_output, 'a') as file2:
        # Write the modified contents back to the file
        file2.write(mod_contents)

### Add the closing tag to the output xml file:
with open(xml_output, 'a') as output_file:
    output_file.write("</dataset>")

In [None]:
float_cols

In [None]:
int_cols