# In [1]:
import numpy as np
import pandas as pd
import pyodbc
import csv
import pyworms

# DEBUG: raise pandas' display limits so large frames print in full while
# inspecting intermediate results.
for _option, _value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(_option, _value)

# Suppress every UserWarning raised from here on (the pattern '.*' matches
# any warning message).
from warnings import filterwarnings
filterwarnings(action="ignore", message='.*', category=UserWarning)

# Output file path (directory the merged .csv is written to; keeps its
# trailing separator because the file name is appended by concatenation)
outdir = "D:\\00-GCOOS\\00-MBON\\CAGES\\LA\\data\\merged\\"

# THE ORIGINAL CAGES DATABASE FILE is served via WAF but cannot be queried from there
# -> download to local HD
# Remote source: https://gcoos4.geos.tamu.edu/WAF/MBON/CAGES/CAGES.accdb
# Every backslash is escaped: the previous literal contained a bare "\C",
# an invalid escape sequence that only worked through Python's deprecated
# fallback. The resulting path string is unchanged.
db_file = "D:\\00-GCOOS\\00-MBON\\CAGES_ORIG\\CAGES.accdb"
user = 'user'
password = 'pw'
# Open the database connection through the Microsoft Access ODBC driver.
# The doubled braces in the template are literal braces for str.format.
cnxn = pyodbc.connect(
    'DRIVER={{Microsoft Access Driver (*.mdb, *.accdb)}};DBQ={};Uid={};Pwd={};'.format(
        db_file, user, password
    )
)


### GET DATA TABLES
### ---

def _read_table(table_name):
    """Return every row of the named database table as a DataFrame.

    Runs ``SELECT *`` against the table through the module-level ``cnxn``.
    """
    return pd.read_sql('SELECT * FROM "{}"'.format(table_name), cnxn)


def _to_int64(frame, columns):
    """Return *frame* with the listed columns cast to pandas' ``Int64`` dtype."""
    return frame.astype({name: 'Int64' for name in columns})


# Read all tables up front and close the connection in ``finally`` so it is
# closed even when one of the queries raises.
try:
    # Main data tables
    sampledata = _read_table("Louisiana Samples")
    stationdata = _read_table("Louisiana Stations")
    hydrodata = _read_table("Louisiana Hydrological")
    geardata = _read_table("Louisiana Gear")
    biodata = _read_table("Louisiana Biological")
    cpuedata = _read_table("Louisiana CPUE")
    speciesdata = _read_table("Louisiana Species")
    lengthdata = _read_table("Louisiana Lengths")

    # Additional (lookup) tables
    lenunitdata = _read_table("Louisiana Length Units")
    stagedata = _read_table("Louisiana Stage")
    physmethodsdata = _read_table("Louisiana Physical Methods")
    numbermethodsdata = _read_table("Louisiana Numbering Methods")
finally:
    # Close the database connection
    cnxn.close()

# Hydro data had duplicate rows with the same sample code but with values
# missing in some of the duplicates -> collapse to one row per 'Sample Code',
# taking the first non-null value of each column.
hydrodata = hydrodata.groupby('Sample Code').first().reset_index()
# Convert the 'Method' columns to Int64
hydrodata = _to_int64(
    hydrodata,
    ['Salinity Method', 'Air Temperature Method', 'Water Temperature Method', 'Turbidity Method'],
)

# Convert 'Gear Code' to Int64 for merging
geardata = _to_int64(geardata, ['Gear Code'])

# Length data: fill NaN with 0 (for merging), then correct some column types
lengthdata = lengthdata.fillna(value=0)
lengthdata = _to_int64(
    lengthdata,
    ['Individual Weight Units', 'Stage', 'Species Observation', 'Gear Observation Code'],
)

# Lookup-table key columns are cast to Int64 as well
lenunitdata = _to_int64(lenunitdata, ['Length Units Code'])
stagedata = _to_int64(stagedata, ['Stage Code'])
physmethodsdata = _to_int64(physmethodsdata, ['Method Code'])

### MERGE DATA TABLES
### ---

# Join samples to stations. Inner join: a row is kept only when both tables
# contain the same ('Station Code', 'CSA') pair.
# --
sample_station_data = sampledata.merge(stationdata, how='inner', on=['Station Code', 'CSA'])
# Cast 'Time' and 'Gear Code' to Int64
sample_station_data = sample_station_data.astype({'Time': 'Int64', 'Gear Code': 'Int64'})
    
# Attach hydro measurements; left join keeps samples without hydro data
# --
merged_hydro = sample_station_data.merge(hydrodata, how='left', on='Sample Code')

# Attach the gear table by 'Gear Code'; left join keeps unmatched samples
# --
merged_gear = merged_hydro.merge(geardata, how='left', on='Gear Code')

# Attach biological records; inner join drops samples without any
# --
merged_bio = merged_gear.merge(biodata, how='inner', on='Sample Code')
# Cast these columns to the more appropriate Int64 type
merged_bio = merged_bio.astype({'Duration (Units)': 'Int64', 'Number Measured': 'Int64'})

# Attach CPUE values; inner join on the full sample/date/species key
# --
cpue_keys = ['Sample Code', 'CSA', 'Station', 'YYYY', 'MM', 'DD', 'Species Code']
merged_cpue = merged_bio.merge(cpuedata, how='inner', on=cpue_keys)

# Attach the species table by 'Species Code'
# --
merged_species = merged_cpue.merge(speciesdata, how='left', on='Species Code')

# Give the join key the same Int64 type on both sides before merging lengths
merged_species = merged_species.astype({'Gear Observation Code': 'Int64'})
lengthdata = lengthdata.astype({'Gear Observation Code': 'Int64'})

# Attach length records
# --
length_keys = ['Sample Code', 'Species Code', 'Gear Observation Code']
merged_length = merged_species.merge(lengthdata, how='left', on=length_keys)

# Some column types get converted to float in the merge -> cast back to integer
merged_length = merged_length.astype({
    'Duration': 'Int64',
    'Length Interval': 'Int64',
    'Length (Units)': 'Int64',
    'Length Measurement': 'Int64',
    'Length Group': 'Int64',
})

# Merge Length Units
# --
# Look up the unit for each 'Length (Units)' code. The left-hand key is
# dropped afterwards because it duplicates the joined 'Length Units Code',
# and the looked-up 'Units' column is renamed to 'Length Units'.
merged_lengthunits = (
    pd.merge(merged_length, lenunitdata,
             left_on='Length (Units)', right_on='Length Units Code', how='left')
    .drop(columns=['Length (Units)'])
    .rename(columns={"Units": "Length Units"})
)

# Merge Stage explanations
# --
# Same pattern: join on the stage code, then drop the duplicated left-hand
# key 'Stage' together with the two species-observation columns that are not
# needed downstream.
merged_stage = (
    pd.merge(merged_lengthunits, stagedata,
             left_on='Stage', right_on='Stage Code', how='left')
    .drop(columns=['Stage', 'Species Observations', 'Species Observation'])
    # The previous bare ``merged_stage.reset_index()`` discarded its result;
    # assign it, and use drop=True so no extra 'index' column is added.
    .reset_index(drop=True)
)

# Add physical measurement method descriptions
# --
# For each '<X> Method' column: rename it to '<X> Method Code', look the code
# up in the physical-methods table and append the description as a new
# '<X> Method' column.
#
# The looked-up column is attached positionally with concat, which is only
# correct when the merge returns exactly one row per input row.
# validate='many_to_one' makes pandas raise a MergeError if a lookup table
# contains duplicate codes, instead of silently misaligning the rows.
merged = merged_stage.copy()
methods_arr = ['Salinity Method', 'Air Temperature Method', 'Water Temperature Method', 'Turbidity Method']
method_lookup = physmethodsdata[['Method Code', 'Method']]
for amethod in methods_arr:
    codename = amethod + ' Code'
    merged.rename(columns={amethod: codename}, inplace=True)
    # Only the key column is needed on the left side of the lookup.
    meth = pd.merge(
        merged[[codename]], method_lookup,
        left_on=codename, right_on='Method Code',
        how='left', validate='many_to_one',
    )['Method']
    merged = pd.concat([merged, meth], axis=1).rename(columns={"Method": amethod})

# Merge Numbering Method data (same lookup pattern as above)
numbermeth_data = pd.merge(
    merged[['Total Number Method']], numbermethodsdata[['Number Method Code', 'Method']],
    left_on='Total Number Method', right_on='Number Method Code',
    how='left', validate='many_to_one',
)['Method']

# The original code column becomes 'Total Number Method Code'; the looked-up
# description takes over the name 'Total Number Method'.
merged = pd.concat([merged, numbermeth_data], axis=1).rename(
    columns={"Total Number Method": "Total Number Method Code", "Method": "Total Number Method"}
)

# Add a datetime field
# Samples with no recorded time are assigned 12:00. The result is assigned
# back to the column: calling fillna(..., inplace=True) on a column selection
# is chained assignment and does not update the frame under pandas
# Copy-on-Write.
merged['Time'] = merged['Time'].fillna(1200)

# 'Time' is treated as an HHMM number (last two digits = minutes, the rest =
# hours). Zero-pad to four characters so positional slicing is also correct
# for values below 1000 (e.g. 930 -> "0930", 5 -> "0005").
hhmm = merged['Time'].astype(str).str.zfill(4)
datestr = (
    merged['YYYY'].astype(str) + '-' + merged['MM'].astype(str) + '-' + merged['DD'].astype(str)
    + ' ' + hhmm.str[:-2] + ':' + hhmm.str[-2:]
)
merged['Datetime'] = pd.to_datetime(datestr)

# Drop columns that are not needed for output (the intermediate time strings
# are kept in local variables and never added to the frame).
merged.drop(['Duration (Units)', 'YYYY', 'MM', 'DD'], axis=1, inplace=True)


### OUTPUT TO FILE
## -------

# Order rows chronologically and renumber the index from 0
merged = merged.sort_values(by='Datetime', ascending=True, ignore_index=True)

# Render the timestamps as text, with a literal trailing 'Z'
merged['Datetime'] = merged['Datetime'].dt.strftime('%Y-%m-%dT%H:%MZ')

# 'Average Temperature' is written out as 'Average Water Temperature'
merged = merged.rename(columns={'Average Temperature': 'Average Water Temperature'})

# Strip all spaces from the column names
merged = merged.rename(columns=lambda label: label.replace(" ", ""))

# Columns written to the file, in output order
cols_to_print = [
    'Datetime', 'Latitude', 'Longitude',
    'SampleCode', 'CSA', 'StationCode', 'GearCode', 'Station', 'SiteName', 'SubBay',
    'SalinityMethodCode', 'SurfaceSalinity', 'BottomSalinity', 'AverageSalinity',
    'AirTemperatureMethodCode', 'AirTemperature',
    'WaterTemperatureMethodCode', 'SurfaceWaterTemperature', 'BottomWaterTemperature',
    'AverageWaterTemperature',
    'TurbidityMethodCode', 'Turbidity',
    'SalinityMethod', 'AirTemperatureMethod', 'WaterTemperatureMethod', 'TurbidityMethod',
    'Gear', 'GearObservationCode', 'Duration', 'SpeciesCode',
    'TotalNumber', 'TotalNumberMethod', 'TotalNumberMethodCode', 'NumberMeasured', 'cpue',
    'ScientificName', 'CommonName',
    'LengthInterval', 'IndividualWeightUnits', 'LengthMeasurement',
    'LengthUnits', 'LengthUnitsCode', 'LengthGroup',
    'StageCode', 'MaturityStage',
]

# Write the selected columns to a .csv file in the output directory
outfile = outdir + "CAGES_CPUE_LA_merged.csv"
merged.loc[:, cols_to_print].to_csv(outfile, index=False, encoding='utf-8')