# In [None]:  (Jupyter cell marker left over from the notebook export)
import time
# Wall-clock start; the last line of the script prints the elapsed seconds.
start_time = time.time()

import numpy as np
import pandas as pd
import csv
import pyworms

# Root of the raw input data: the web folder by default, or a local copy.
indir = 'https://gcoos4.geos.tamu.edu/WAF/MBON/CAGES/TX/raw_data/'
#indir = 'D:\\00-GCOOS\\00-MBON\\CAGES\\TX\\data\\raw\\'

# Separator placed between the year folder and the annual file name.
# Must match the flavour of `indir` (URL/Linux: '/', Windows path: '\\').
sep = '/'  #sep = '\\'

# Destination folders for the merged annual files and the debug dumps.
outdir = 'D:\\00-GCOOS\\00-MBON\\CAGES\\TX\\data\\merged\\'
dbgdir = 'D:\\00-GCOOS\\00-MBON\\CAGES\\TX\\data\\debug\\'

# Inclusive range of years to process.
firstyear, lastyear = 1982, 2005

# Static (non-annual) input files, all located directly under `indir`.
station_file = f'{indir}CAGES_Texas_Stations_f408_c675_c3df.csv'
subbay_file = f'{indir}CAGES_Texas_SubBays_b8ce_c104_0861.csv'
hydro_file = f'{indir}CAGES_Texas_Hydrological.csv'
length_file = f'{indir}CAGES_Texas_Lengths_f6a9_dc7f_aff8.csv'
trawls_file = f'{indir}CAGES_Texas_Trawls_b0be_9212_e952.csv'
species_file = f'{indir}CAGES_TX_species.csv'
#old_species_file = indir + 'CAGES_Texas_species_matched' + '.csv' # this was manually collected, hard to maintain

# Load the static (non-annual) tables once, in the same order as before:
# stations, sub-bays, hydrology, lengths, trawls. Every year reuses them.
station_data, bay_data, hydro_data, length_data, trawl_data = (
    pd.read_csv(static_path)
    for static_path in (
        station_file,
        subbay_file,
        hydro_file,
        length_file,
        trawls_file,
    )
)

# Define the output file paths
# --
# intermediate files for debugging, commented-out ...
# fileout1 = outdir + 'Texas_sam_sta' +'.csv'
# fileout2 = outdir + 'Texas_sam_sta_bay' +'.csv'
# fileout3 = outdir + 'Texas_sam_sta_bay_hyd' +'.csv'
# fileout4 = outdir + 'Texas_sam_sta_bay_hyd_cpue' +'.csv'
# fileout5 = outdir + 'Texas_sam_sta_bay_hyd_cpue_len' +'.csv'

# Accumulator for species whose taxonomy stays unresolved; rows are appended
# per year and the frame is written to the debug folder at the end.
missing_specs = pd.DataFrame(
    columns=[
        # identifiers taken from the source data
        'Species_Code', 'Scientific_Name',
        # name as matched
        'AphiaID', 'scientificName', 'scientificNameAuthorship',
        'taxonomicStatus', 'taxonRank', 'scientificNameID',
        # accepted name
        'valid_AphiaID', 'acceptedNameUsage',
        'acceptedScientificNameAuthorship',
        # classification
        'kingdom', 'phylum', 'class', 'order', 'family', 'genus',
        # habitat flags
        'isMarine', 'isBrackish', 'isFreshwater', 'isTerrestrial',
    ]
)

# Manually composed taxonomy for species that fail via the WoRMS API.
# The file is the same for every year, so it is read and indexed once here
# instead of being fetched again on each iteration.
species_lookup = pd.read_csv(species_file).set_index('Species_Code')

# WoRMS record fields copied into the species table, in the order they are
# applied: (output column, key in the WoRMS record, converter or None).
# As before, the first field that cannot be read or converted stops the copy
# for that species; the fields written up to that point are kept.
worms_fields = (
    ('AphiaID', 'AphiaID', int),
    ('scientificName', 'scientificname', None),
    ('scientificNameAuthorship', 'authority', None),
    ('taxonomicStatus', 'status', None),
    ('taxonRank', 'rank', None),
    ('scientificNameID', 'lsid', None),
    ('valid_AphiaID', 'valid_AphiaID', int),
    ('acceptedNameUsage', 'valid_name', None),
    ('acceptedScientificNameAuthorship', 'valid_authority', None),
    # Kingdom,Phylum,Class,Order,Family,Genus
    ('kingdom', 'kingdom', None),
    ('phylum', 'phylum', None),
    ('class', 'class', None),
    ('order', 'order', None),
    ('family', 'family', None),
    ('genus', 'genus', None),
    ('isMarine', 'isMarine', int),
    ('isBrackish', 'isBrackish', int),
    ('isFreshwater', 'isFreshwater', int),
    ('isTerrestrial', 'isTerrestrial', int),
)
taxon_cols = [column for column, _, _ in worms_fields]

# Columns that hold integers but are forced to float by NaN values.
intcols = ['AphiaID', 'valid_AphiaID', 'isMarine', 'isBrackish', 'isFreshwater', 'isTerrestrial']
miss_intcols = ['Species_Code'] + intcols

# Successful WoRMS responses keyed by scientific name, so a name that shows up
# in several years is only requested from the web service once per run.
worms_cache = {}

# Loop through all years
for ayear in range(firstyear, lastyear + 1):
    year_str = str(ayear)

    # Define the merged and WoRMS taxonomy -added output:
    fileout_merged = outdir + 'CAGES_Texas_merged_' + year_str + '.csv'

    print(f'--- Starting {year_str} data ...')

    # Get the annual data
    sample_file = indir + year_str + sep + 'CAGES_Texas_Samples_' + year_str + '.csv'
    cpue_file = indir + year_str + sep + 'CAGES_Texas_CPUE_' + year_str + '.csv'

    # Build the merged table step by step. `merged` is rebound at every step,
    # so each intermediate frame is released as soon as the next one exists.
    # The commented-out to_csv calls are optional debug dumps.
    # --
    # ADDING SAMPLE AND STATION DATA
    merged = pd.merge(pd.read_csv(sample_file), station_data, on=['Station_Code'], how='inner')
    #merged.to_csv(fileout1)
    # ADDING BAY CODES
    merged = pd.merge(merged, bay_data, on=['Bay_Code', 'SubBay_Code'], how='left')
    #merged.to_csv(fileout2)
    # ADDING HYDRO DATA
    merged = pd.merge(merged, hydro_data, on=['Sample_Code'], how='inner')
    #merged.to_csv(fileout3)
    # ADDING CPUE DATA
    merged = pd.merge(merged, pd.read_csv(cpue_file), on=['Sample_Code', 'YYYY', 'MM', 'DD', 'Date', 'Bay_Code'], how='inner')
    #merged.to_csv(fileout4)
    # ADDING LENGTH DATA
    merged = pd.merge(merged, length_data, on=['Sample_Code', 'Species_Code'], how='left')
    #merged.to_csv(fileout5)
    # ADDING TRAWL DATA
    merged = pd.merge(merged, trawl_data, on=['Sample_Code', 'Species_Code'], how='left')
    # DEBUG: print out intermediate result before adding taxonomy
    fileout6 = dbgdir + 'Texas_sam_sta_bay_hyd_cpue_len_tr_' + year_str + '.csv'
    merged.to_csv(fileout6)

    # ADDING TAXONOMY
    # --
    # list of all species
    specs = merged[['Species_Code', 'Scientific_Name']].drop_duplicates().copy()

    # Query WoRMS for the taxonomic data for each spec, and add those as new columns
    for index, row in specs.iterrows():
        aspec = row['Scientific_Name']
        print(aspec,'...')

        if aspec in worms_cache:
            response = worms_cache[aspec]
        else:
            try:
                response = pyworms.aphiaRecordsByMatchNames(aspec, marine_only=False)
            except Exception:
                print(f'--- Warning! {year_str} WoRMS API call for {aspec} failed!')
                # Skip this species: without a fresh response the parsing
                # below would reuse the previous species' record (or hit an
                # undefined name on the very first call).
                continue
            worms_cache[aspec] = response

        try:
            # First record of the first (only) queried name.
            resp = response[0][0]
            for column, key, convert in worms_fields:
                value = resp[key]
                specs.loc[index, column] = convert(value) if convert else value
        except (IndexError, KeyError, TypeError, ValueError):
            # No match, a missing key, or a null value in an integer field.
            print(f'--- Warning! {year_str} WoRMS API response for {aspec} has issues, check the output!')

    # Update missing values in specs with values in species_lookup
    # --
    # Both frames are indexed by 'Species_Code' so update() aligns on it.
    specs = specs.set_index('Species_Code')
    # update() only touches columns that already exist in specs. If WoRMS
    # produced nothing for some field this year, create it empty so the manual
    # lookup can still fill it and the integer cast below does not fail.
    for column in taxon_cols:
        if column not in specs.columns:
            specs[column] = np.full(len(specs), np.nan, dtype=object)
    specs.update(species_lookup, overwrite=False)
    specs = specs.reset_index()

    # NaN values force integer columns to float, convert back to nullable integer (Int64)
    # --
    for acol in intcols:
        specs[acol] = specs[acol].astype('Int64')

    # Check that there are no null values! (unaccounted for species)
    # Collect troublemaker list to a dataframe
    # --
    missing_specs_annual = specs[specs['AphiaID'].isnull()]
    if not missing_specs_annual.empty:
        print(f"--- Error! {year_str} files have issues with taxonomy data.")
        missing_specs = pd.concat([missing_specs, missing_specs_annual], ignore_index=True)
        for acol in miss_intcols:
            missing_specs[acol] = missing_specs[acol].astype('Int64')
        missing_specs = missing_specs.drop_duplicates()

    # Merge the newly collected species data (from Worms via API)
    # (also get rid of duplicate 'Scientific_Name' columns)
    # --
    merged_tax = pd.merge(
        merged.drop(columns=['Scientific_Name']),
        specs.drop(columns=['Scientific_Name']),
        on='Species_Code',
        how='left',
    )

    # Write out the merged file, make sure all characters print OK
    # --
    merged_tax.to_csv(fileout_merged, encoding='utf-8', index=False)

    # delete loop variables
    del specs, merged, merged_tax, missing_specs_annual
    print(f'--- End year {year_str}.')
    # --
    ### END FOR

# delete the helpers that were set up for the loop
del species_lookup, worms_fields, taxon_cols, intcols, miss_intcols, worms_cache
                                 
# Dump the accumulated list of unresolved species into the debug folder.
missing_specs.to_csv(f'{dbgdir}missing_species.csv')

# Release the shared input tables and the debug accumulator.
del station_data, bay_data, hydro_data, length_data, trawl_data, missing_specs
print('--- Done! ---')

# Report the total wall-clock run time.
elapsed = time.time() - start_time
print(f"--- {elapsed} seconds ---")