In [None]:
import re
import warnings
import pandas as pd
from IPython.display import display
# Notebook display / warning settings
pd.options.display.max_columns = 200
warnings.simplefilter('ignore', UserWarning)

# Original Excel source file from the data provider stored on a GCOOS WAF folder
sourcefile = 'https://gcoos4.geos.tamu.edu/WAF/MBON/JuvenileSportFish/2016JuvenileSportfishNOAA.xlsx'
# Output dir
outdir = "D:\\00-GCOOS\\00-MBON\\JuvenileFish\\data\\"

# -- Read the "Sportfish Data" and "Species Code" sheets from the Excel file
# -- (passing a list of sheet names returns a dict keyed by sheet name)
_lookup_sheets = pd.read_excel(sourcefile, sheet_name=['Sportfish Data', 'Species Code'])
df_spt = _lookup_sheets['Sportfish Data']
df_spc = _lookup_sheets['Species Code']

# Species names: remove trailing (and leading) whitespace characters
df_spc = df_spc.assign(Species=df_spc['Species'].str.strip())

# -- Read the "All Data" sheet from the Excel file
# --
df_all = pd.read_excel(sourcefile, sheet_name='All Data')

# Spell out the '%' special character in column names
df_all = df_all.rename(columns=lambda name: name.replace('%', 'percent'))

# List of count columns to rename and melt (as named after the renames below):
count_cols = [ 'Ari','Arp','Bab','Chf','cyn','Epi','Has','Hie','Hpa','Hyp','Lam','Lar','lug','Lum','Lun','Lus','Mym','Occ','Paa','Pab','Pal','Poc','Scb','Sco','Sev','Spa']
# Lower-case versions (to match with species code list later)
counts_lower = list(map(str.lower, count_cols))

df_all = (
    df_all
    # Drop columns that are not needed
    .drop(columns=['Cyn (lengths 30-200 mm)','Cyn (lengths 30-200 mm) pres/abs only','Lug (lengths 30-260 mm)  pres/abs only'])
    # Give the two remaining length-qualified count columns their short code
    .rename(columns={'Cyn ALL Lengths': 'cyn', 'Lug (lengths 30-260 mm)': 'lug'})
    # Lower-case every count column in one pass
    .rename(columns=dict(zip(count_cols, counts_lower)))
)

# -- Reshape to long form: one row per (original row, species code)
# --
# Every column that is not a species count column is carried along unchanged
_species_cols = set(counts_lower)
_id_cols = [name for name in df_all.columns if name not in _species_cols]
df_long = df_all.melt(
    id_vars=_id_cols,
    value_vars=counts_lower,
    var_name='SpeciesCode',
    value_name='SpeciesCount',
)

# -- Fix Time
# --
# Missing times default to 1200; the column is then cast from float to int
df_long['Time'] = df_long['Time'].fillna(1200).astype('int')

# -- Start building the Datetime field (date and time)
# --
# Text form of Time; its last two characters are taken as the minutes
df_long['timestr'] = df_long['Time'].astype(str)
df_long['minute'] = df_long['timestr'].str.slice(start=-2)
# Clean dirty data: minute has values like "78", should probably be "18"
def replace_second_last_char_if_target(s, new_char, target_char):
    """Swap the second-to-last character of ``s`` when it equals ``target_char``.

    ``s`` is converted with ``str()`` first. If the text has at least two
    characters and its second-to-last one equals ``target_char``, that
    character is replaced by ``new_char``; otherwise the text is returned
    unchanged.
    """
    text = str(s)
    if len(text) >= 2 and text[-2] == target_char:
        return text[:-2] + new_char + text[-1]
    return text
# DIRTY DATA: replace second last character in the minute column, '7' with '1'
df_long['minute'] = df_long['minute'].apply(lambda x: replace_second_last_char_if_target(x, '1', '7'))
# Everything before the last two characters of the time string is the hour.
# Times with one or two digits only (e.g. 45) leave it empty -> use hour "00".
df_long['hour'] = df_long['timestr'].str[:-2]
# Assign the result instead of calling replace(inplace=True) on the column
# selection: the chained in-place form is deprecated and does not update the
# frame under pandas Copy-on-Write.
df_long['hour'] = df_long['hour'].replace(to_replace="", value="00")
# Build "YYYY-MM-DD H:MM" strings and parse them.
# 'Date' must already be a datetime column (the .dt accessor is used on it).
df_long['Datetime'] = pd.to_datetime(
    df_long['Date'].dt.strftime('%Y-%m-%d') + ' ' + df_long['hour'] + ':' + df_long['minute']
)
# Interpret the naive timestamps in the 'EST' zone, then convert to UTC
df_long['Datetime'] = df_long['Datetime'].dt.tz_localize('EST')
df_long['Datetime'] = df_long['Datetime'].dt.tz_convert('UTC')
# Format as an ISO 8601 string with a trailing "Z"
df_long['Datetime'] = df_long['Datetime'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')
# Drop the helper fields and the now redundant Date column
df_long.drop(['timestr', 'minute', 'hour', 'Date'], axis=1, inplace=True)

# -- Merge Sportfish data
# --
# Left join keeps every df_long row; "Zone" exists on both sides, so the
# merge suffixes it and the left-hand copy is renamed back to "Zone".
merged_df = (
    df_long.merge(
        df_spt,
        how='left',
        left_on=['Station','Year','Month','SpeciesCode'],
        right_on=['Station','Year','Month','Species Code'],
    )
    .copy()
    .rename(columns={'Zone_x': 'Zone'})
)

# -- Merge species code data
# --
merged_df2 = merged_df.merge(df_spc, how='left', left_on=['SpeciesCode'], right_on=['Code']).copy()

# Drop the columns duplicated by the two merges
drop_these = ['Zone_y','Species Code','Common Name', 'Code']
merged_df2 = merged_df2.drop(columns=drop_these)


# -- New dataframe: Group by Keyfield, add up total SpeciesCount
# --
# Columns aggregated with 'first', listed in output order.
_first_cols = ['Keyfield', 'Station', 'Location', 'Latitude', 'Longitude',
               'Year', 'Month', 'Day', 'Time', 'Zone', 'Area Towed',
               'Salinity', 'Temperature']

# Replicate families: each one has columns "<family> 1" .. "<family> 9".
# Spacing and capitalisation are significant here (double spaces after
# Grass/T/S/H before "percent", capital "Rep" for the H canopy height).
_rep_families = [
    'Grass  percent Cover rep',
    'T  percent Cover rep',
    'T Canopy Height rep',
    'S  percent Cover rep',
    'S Canopy Height rep',
    'H  percent Cover rep',
    'H Canopy Height Rep',
    'SAV percent Cover rep',
    'Total percent Cover rep',
    'Algae percent Cover rep',
]
for _family in _rep_families:
    _first_cols.extend(_family + ' ' + str(_rep) for _rep in range(1, 10))

_first_cols.extend([
    'Cyn Density',
    'Average Grass percent Cover',
    'Average T percent Cover',
    'Average S percent Cover',
    'Average H percent Cover',
    'Average T Canopy Height',
    'Average S Canopy Height',
    'Average H Canopy Height',
])

# Everything above takes the first value per Keyfield; counts and lengths are
# summed. Insertion order of the dict fixes the column order of the result.
_agg_spec = dict.fromkeys(_first_cols, 'first')
_agg_spec['SpeciesCode'] = 'first'
_agg_spec['SpeciesCount'] = 'sum'
_agg_spec['Datetime'] = 'first'
_agg_spec['Length (mm)'] = 'sum'
_agg_spec['Species'] = 'first'
_agg_spec['Common name'] = 'first'

grouped = merged_df2.groupby('Keyfield').agg(_agg_spec)

# Keyfields whose summed SpeciesCount is zero: keep just one summary line for each of them
zerocounts_df = grouped.loc[grouped['SpeciesCount'] == 0].copy()

# -- 1) Remove from the merged dataframe all lines whose Keyfield is in zerocounts_df
# -- 2) Remove from the merged dataframe all remaining lines with SpeciesCount == 0
# -- 3) Append one line per Keyfield that had zero total SpeciesCount
# --    (keeps the benthic coverage data, but just one line per Keyfield / sample)
values_to_drop = zerocounts_df['Keyfield'].unique()
_in_zero_total_keyfield = merged_df2['Keyfield'].isin(values_to_drop)
_is_zero_count = merged_df2['SpeciesCount'] == 0
merged_df2 = merged_df2.loc[~(_in_zero_total_keyfield | _is_zero_count)]

concat_df = pd.concat([merged_df2, zerocounts_df], ignore_index=True)

# Sort by Keyfield and renumber the rows
df_sorted = concat_df.sort_values(by='Keyfield').reset_index(drop=True)

# Give the merged-in columns their published names
df_sorted = df_sorted.rename(columns={'Common name': 'CommonName', 'Length (mm)': 'IndividualLength', 'Species': 'ScientificName'})
# Collapse every run of whitespace in a column name into a single underscore
df_sorted = df_sorted.rename(columns=lambda name: re.sub(r'\s+', '_', name))
# print sorted cols, Collection Stn columns -> join 'by' which columns???

#df_sorted.columns.to_list()
# Columns selected for the CSV, in output order:
# leading columns, then nine replicates per measurement family, then summary columns.
_lead_cols = ['Datetime', 'Latitude', 'Longitude', 'Keyfield', 'Station', 'Location', 'Zone', 'Area_Towed',
              'SpeciesCode', 'SpeciesCount', 'IndividualLength', 'ScientificName', 'CommonName',
              'Salinity', 'Temperature']

# Each family expands to "<prefix>_1" .. "<prefix>_9".
# Note the capital "Rep" in the H canopy height family.
_rep_prefixes = [
    'Grass_percent_Cover_rep',
    'T_percent_Cover_rep',
    'T_Canopy_Height_rep',
    'S_percent_Cover_rep',
    'S_Canopy_Height_rep',
    'H_percent_Cover_rep',
    'H_Canopy_Height_Rep',
    'SAV_percent_Cover_rep',
    'Total_percent_Cover_rep',
    'Algae_percent_Cover_rep',
]
_rep_cols = [_prefix + '_' + str(_rep) for _prefix in _rep_prefixes for _rep in range(1, 10)]

_summary_cols = ['Cyn_Density', 'Average_Grass_percent_Cover',
                 'Average_T_percent_Cover', 'Average_S_percent_Cover', 'Average_H_percent_Cover',
                 'Average_T_Canopy_Height', 'Average_S_Canopy_Height', 'Average_H_Canopy_Height']

cols_out = _lead_cols + _rep_cols + _summary_cols

In [None]:
# SAVE DATA TO .CSV FILE:
# (output dir is set again here so this cell can be run on its own settings)
outdir = "D:\\00-GCOOS\\00-MBON\\JuvenileFish\\data\\"
_sportfish_csv = outdir + "juvenilefish_sportfishlengths.csv"
# Only the cols_out columns are written, in that order, without the row index
df_sorted.loc[:, cols_out].to_csv(_sportfish_csv, index=False, encoding='utf-8')

# ERDDAP config preparation
# --
# Make dictionary for variable descriptions:
# Read the header key sheet into a data frame
df_dscrn = pd.read_excel(sourcefile, sheet_name='Header Key')

# Convert the two columns to a dictionary
# Assumption: first column is key, second column is value
descr_dict = dict(zip(df_dscrn.iloc[:,0], df_dscrn.iloc[:,1]))

# Remove keys that no longer exist in the data frame
keys_to_remove = ['Ari', 'Arp', 'Bab', 'Chf', 'Cyn ALL Lengths', 'Cyn (lengths 30-200 mm)', 'Cyn (lengths 30-200 mm) pres/abs only', 'Epi', 'Has', 'Hie', 'Hpa', 'Hyp', 'Lam', 'Lar', 'Lug (lengths 30-260 mm)', 'Lug (lengths 30-260 mm)  pres/abs only', 'Lum', 'Lun', 'Lus', 'Mym', 'Occ', 'Paa', 'Pab', 'Pal', 'Poc', 'Scb', 'Sco', 'Sev', 'Spa']
for key in keys_to_remove:
    descr_dict.pop(key, None)  # Safe: does nothing if key missing

# Normalise keys the same way the data frame column names are normalised:
# '%' -> 'percent', then every run of whitespace -> a single '_'.
# (A plain ' ' -> '_' replace would turn a double space into '__' and the key
# would no longer match the corresponding column name.)
# Blank cells are read as NaN: rows without a key are skipped and a missing
# description becomes an empty string, so the values are always text.
descr_dict_new = {}
for k, v in descr_dict.items():
    if pd.isna(k):
        continue
    new_key = re.sub(r'\s+', '_', str(k).replace('%', 'percent'))
    descr_dict_new[new_key] = '' if pd.isna(v) else str(v).replace('%', 'percent')

# Add descriptors for the columns created in this notebook
descr_dict_new.update({'SpeciesCode': 'Identifier for the scientific name', 'SpeciesCount': 'Number of individuals of the species, collected in the sample', 'Datetime': 'Date and time converted to UTC from the original local time (-05)', 'IndividualLength': 'Measured length of a collected individual of the sportfish species', 'ScientificName': 'The scientific name of the collected specimen', 'CommonName': 'The common name of the collected specimen'})


### Generate ERDDAP datasets .XML -snippet using templates according to variable type
## Paths
# output file
xml_output = outdir + "juvenile_sportfish_xml.txt"

# template files (header + one per variable type)
xml_header_temp = "juvenilefish_header_template.txt"
xml_int_temp = "xml_template_int.txt"
xml_float_temp = "xml_template_float.txt"
xml_string_temp = "xml_template_string.txt"

# Sort the data frame columns into int / float / string lists by dtype name.
# The int test is case-insensitive, the float/object/str tests are not;
# columns of any other dtype end up in none of the lists.
dt = df_sorted.dtypes.to_dict()
int_cols, float_cols, string_cols = [], [], []
for col_name, col_dtype in dt.items():
    dtype_name = str(col_dtype)
    if 'int' in dtype_name.lower():
        int_cols.append(col_name)
    elif 'float' in dtype_name:
        float_cols.append(col_name)
    elif 'obj' in dtype_name or 'str' in dtype_name:
        string_cols.append(col_name)

# These belong to the header snippet (no need to generate separately):
header_vars = ['Datetime', 'Latitude', 'Longitude']

# START concatenating the template snippets together
# --

def _fill_measurement_placeholders(snippet, varname):
    """Fill the _UNITS_, _CATEGORY_ and _DESTNAME_ placeholders of a float-variable snippet.

    The values are chosen from keywords found in ``varname`` (compared in
    lower case, except for the literal 'DO'). A variable matching no keyword
    only gets _DESTNAME_ filled (with its own name); _UNITS_ and _CATEGORY_
    are then left in the snippet as literal placeholders.

    Returns the snippet text with the placeholders replaced.
    """
    lowered = varname.lower()

    def fill(units, category, destname):
        return (snippet.replace('_UNITS_', units)
                       .replace('_CATEGORY_', category)
                       .replace('_DESTNAME_', destname))

    if 'temperature' in lowered:
        if 'air' in lowered:
            destname = 'air_temperature'
        elif 'surface' in lowered:
            destname = 'sea_surface_temperature'
        elif 'bottom' in lowered:
            destname = 'sea_water_temperature_at_sea_floor'
        else:
            destname = 'sea_water_temperature'
        # 'degree_Celsius' is the correctly spelled unit name
        return fill('degree_Celsius', 'Temperature', destname)
    if 'turbidity' in lowered:
        return fill('NTU', 'Turbidity', 'sea_water_turbidity')
    if 'salinity' in lowered:
        if 'surface' in lowered:
            destname = 'sea_surface_salinity'
        elif 'bottom' in lowered:
            destname = 'sea_water_salinity_at_sea_floor'
        else:
            destname = 'sea_water_salinity'
        return fill('PSU', 'Salinity', destname)
    if 'conductivity' in lowered:
        return fill('S m-1', 'Conductivity', 'sea_water_electrical_conductivity')
    if 'DO' in varname or 'dissolved' in lowered or 'oxygen' in lowered:
        return fill('mg l-1', 'Dissolved O2', 'mass_concentration_of_oxygen_in_sea_water')
    # Heights, lengths, weights and percentages: fill all three placeholders
    # (filling only _UNITS_ would leave _CATEGORY_/_DESTNAME_ in the output).
    if 'height' in lowered or 'length' in lowered:
        return fill('mm', 'Other', varname)
    if 'weight' in lowered:
        return fill('g', 'Other', varname)
    if 'percent' in lowered:
        return fill('percent', 'Other', varname)
    return snippet.replace('_DESTNAME_', varname)


# Fail before anything is written if a variable has no description, instead
# of stopping half-way through and leaving a truncated output file.
_undescribed = [c for c in cols_out if c not in header_vars and c not in descr_dict_new]
if _undescribed:
    raise KeyError("No description in descr_dict_new for: " + ", ".join(_undescribed))

# NOTE(review): descriptions are inserted into the snippet as-is; if they can
# contain '&' or '<' they may need XML escaping - confirm against the templates.

# Each template file is read once and reused for every column of that type
_template_text = {}

# The output file is opened once: header first, then one snippet per
# variable, then the closing tag.
with open(xml_output, "w") as output_file:
    with open(xml_header_temp, "r") as header_file:
        output_file.write(header_file.read())

    for acol in cols_out:
        isPhysicalMeasurement = False
        if acol in header_vars:
            # already covered by the header snippet
            continue
        elif acol in int_cols:
            template_file = xml_int_temp
        elif acol in float_cols:
            template_file = xml_float_temp
            isPhysicalMeasurement = True
        else:
            template_file = xml_string_temp

        if template_file not in _template_text:
            with open(template_file, 'r') as tempfile:
                _template_text[template_file] = tempfile.read()

        # Fill in variable name and description
        mod_contents = _template_text[template_file].replace('_VARNAME_', acol)
        mod_contents = mod_contents.replace('_DESCR_', descr_dict_new[acol])

        # Try to figure out the units for float variables
        if isPhysicalMeasurement:
            mod_contents = _fill_measurement_placeholders(mod_contents, acol)

        output_file.write(mod_contents)

    ### Add the closing tag to the output xml file:
    output_file.write("</dataset>")

In [None]:
## Process Collection Stn Data and save it as a separate dataset
## --

# Read the "Collection Stn Data" sheet from the Excel file
# --
df_cln = pd.read_excel(sourcefile, sheet_name='Collection Stn Data')

# For collection Stn data:
# Generate Keyfield column to map the data to the other published dataset: AllData + Sportfish Data
# Keyfield = Year (padded to 4) + Month (padded to 2) + Station (padded to 3), as text
_year_part = df_cln['Year'].astype(str).str.zfill(4)
_month_part = df_cln['Month'].astype(str).str.zfill(2)
_station_part = df_cln['Station'].astype(str).str.zfill(3)
df_cln['Keyfield'] = _year_part + _month_part + _station_part

# Drop unnecessary / erroneous columns
df_cln = df_cln.drop(columns=['Year','Month','Day', 'Date'])

# Tidy the column names in three steps:
#   1) remove units (in parentheses), 2) space -> underscore, 3) remove literal dots
df_cln.columns = (
    df_cln.columns
    .str.replace(r' \(.+\)', '', regex=True)
    .str.replace(r' ', '_')
    .str.replace(r'.', '', regex=False)
)

# measurement (float) columns
meas_cols = ['Standard_Length', 'Total_Length', 'weight', 'Length']

# Convert the measurement columns and Keyfield to numeric;
# values that cannot be parsed become NaN
_numeric_cols = {name: pd.to_numeric(df_cln[name], errors='coerce') for name in meas_cols}
_numeric_cols['Keyfield'] = pd.to_numeric(df_cln['Keyfield'], errors='coerce')
df_cln = df_cln.assign(**_numeric_cols)

# Clean the length data: keep only rows that have a Length value
df_cln = df_cln[df_cln['Length'].notna()]

# Add lat, lon, datetime field from the benthic, sportfish length dataframe
# (one row per Keyfield on the right-hand side, so the left join adds no rows)
df_right_unique = df_sorted.drop_duplicates(subset=['Keyfield'])
df_cln = df_cln.merge(
    df_right_unique[['Keyfield','Datetime', 'Latitude', 'Longitude']],
    how='left',
    on='Keyfield',
)

In [None]:
# Write out Collection Stn Data to separate CSV
# (cols_out is re-used as the name of the output column list for this dataset)
cols_out = ['Datetime','Latitude','Longitude','Keyfield','Station','Genus_sp','Common_Name','Standard_Length','Total_Length','weight','Length','Species_Code']
_collection_csv = outdir + "juvenilefish_CollectionStnData.csv"
df_cln.loc[:, cols_out].to_csv(_collection_csv, index=False, encoding='utf-8')


### Generate ERDDAP XML for Cln Stn dataset using templates according to variable type
## Paths
# output file
xml_output = outdir + "juvenile_sportfish_collectionStn_xml.txt"

# template files (header + one per variable type)
xml_header_temp = "juvenilefish_header_template.txt"
xml_int_temp = "xml_template_int.txt"
xml_float_temp = "xml_template_float.txt"
xml_string_temp = "xml_template_string.txt"

# Sort the data frame columns into int / float / string lists by dtype name.
# The int test is case-insensitive, the float/object/str tests are not;
# columns of any other dtype end up in none of the lists.
dt = df_cln.dtypes.to_dict()
int_cols, float_cols, string_cols = [], [], []
for col_name, col_dtype in dt.items():
    dtype_name = str(col_dtype)
    if 'int' in dtype_name.lower():
        int_cols.append(col_name)
    elif 'float' in dtype_name:
        float_cols.append(col_name)
    elif 'obj' in dtype_name or 'str' in dtype_name:
        string_cols.append(col_name)

# These belong to the header snippet (no need to generate separately):
header_vars = ['Datetime', 'Latitude', 'Longitude']

# START concatenating the template snippets together
# --

def _fill_measurement_placeholders(snippet, varname):
    """Fill the _UNITS_, _CATEGORY_ and _DESTNAME_ placeholders of a float-variable snippet.

    The values are chosen from keywords found in ``varname`` (compared in
    lower case, except for the literal 'DO'). A variable matching no keyword
    only gets _DESTNAME_ filled (with its own name); _UNITS_ and _CATEGORY_
    are then left in the snippet as literal placeholders.

    Returns the snippet text with the placeholders replaced.
    """
    lowered = varname.lower()

    def fill(units, category, destname):
        return (snippet.replace('_UNITS_', units)
                       .replace('_CATEGORY_', category)
                       .replace('_DESTNAME_', destname))

    if 'temperature' in lowered:
        if 'air' in lowered:
            destname = 'air_temperature'
        elif 'surface' in lowered:
            destname = 'sea_surface_temperature'
        elif 'bottom' in lowered:
            destname = 'sea_water_temperature_at_sea_floor'
        else:
            destname = 'sea_water_temperature'
        # 'degree_Celsius' is the correctly spelled unit name
        return fill('degree_Celsius', 'Temperature', destname)
    if 'turbidity' in lowered:
        return fill('NTU', 'Turbidity', 'sea_water_turbidity')
    if 'salinity' in lowered:
        if 'surface' in lowered:
            destname = 'sea_surface_salinity'
        elif 'bottom' in lowered:
            destname = 'sea_water_salinity_at_sea_floor'
        else:
            destname = 'sea_water_salinity'
        return fill('PSU', 'Salinity', destname)
    if 'conductivity' in lowered:
        return fill('S m-1', 'Conductivity', 'sea_water_electrical_conductivity')
    if 'DO' in varname or 'dissolved' in lowered or 'oxygen' in lowered:
        return fill('mg l-1', 'Dissolved O2', 'mass_concentration_of_oxygen_in_sea_water')
    if 'height' in lowered or 'length' in lowered:
        return fill('mm', 'Other', varname)
    if 'weight' in lowered:
        return fill('g', 'Other', varname)
    if 'percent' in lowered:
        return fill('percent', 'Other', varname)
    return snippet.replace('_DESTNAME_', varname)


# NOTE(review): no description is substituted for this dataset, so any
# _DESCR_ placeholder in the templates stays in the output as-is - confirm
# whether descriptions for these columns are available.

# Each template file is read once and reused for every column of that type
_template_text = {}

# The output file is opened once: header first, then one snippet per
# variable, then the closing tag.
with open(xml_output, "w") as output_file:
    with open(xml_header_temp, "r") as header_file:
        output_file.write(header_file.read())

    for acol in cols_out:
        isPhysicalMeasurement = False
        if acol in header_vars:
            # already covered by the header snippet
            continue
        elif acol in int_cols:
            template_file = xml_int_temp
        elif acol in float_cols:
            template_file = xml_float_temp
            isPhysicalMeasurement = True
        else:
            template_file = xml_string_temp

        if template_file not in _template_text:
            with open(template_file, 'r') as tempfile:
                _template_text[template_file] = tempfile.read()

        # Fill in the variable name
        mod_contents = _template_text[template_file].replace('_VARNAME_', acol)

        # Try to figure out the units for float variables
        if isPhysicalMeasurement:
            mod_contents = _fill_measurement_placeholders(mod_contents, acol)

        output_file.write(mod_contents)

    ### Add the closing tag to the output xml file:
    output_file.write("</dataset>")

In [None]:
# # DEBUG: find which variables have no description. 
# dictkeys = descr_dict_new.keys()
# for acol in df_sorted.columns:
#     if acol not in dictkeys:
#         print(acol)