In [1]:
import xarray as xr
import pandas as pd
import numpy as np

# Convert and clean up datasets

In [2]:
def convert_str_data(data, var, encoding='utf-8'):
    """Decode the byte-string values of ``data[var]`` into text.

    Parameters
    ----------
    data : indexable container
        Anything supporting ``data[var]`` whose selected member exposes a
        ``.str`` accessor with a ``decode`` method (for example an
        ``xarray.Dataset`` or a ``pandas.DataFrame``).
    var : hashable
        Name of the variable / column to decode.
    encoding : str, default 'utf-8'
        Codec handed to ``.str.decode``. The default keeps the behaviour of
        the original two-argument call.

    Returns
    -------
    The object returned by ``data[var].str.decode(encoding)``.
    """
    return data[var].str.decode(encoding)

In [3]:
def convert_numeric_data(data, var):
    """Parse ``data[var]`` as numbers.

    The selected member is passed to ``pandas.to_numeric`` with
    ``errors='coerce'``, so entries that cannot be parsed come back as NaN
    instead of raising.

    Parameters
    ----------
    data : indexable container supporting ``data[var]``.
    var : hashable
        Name of the variable / column to convert.

    Returns
    -------
    Whatever ``pandas.to_numeric`` returns for the selected member.
    """
    selected = data[var]
    parsed = pd.to_numeric(selected, errors='coerce')
    return parsed

# 1. Phytoplankton

In [6]:
# Open the phytoplankton NetCDF file as an xarray dataset
# (an explicit decode_cf=True argument is left commented out).
phyto = xr.open_dataset("phytoplankton.netcdf")#, decode_cf=True)

In [8]:
# Cast the cruise identifier and the local start/end time variables to str.
for name in ('Cruise_ID_2', 'Time_start_local', 'Time_end_local'):
    phyto[name] = phyto[name].astype('str')

In [9]:
# Decode the three species-name variables from bytes to text.
for name in ('SpeciesNameOriginal', 'SpeciesNameCleaned', 'ScientificName_accepted'):
    phyto[name] = convert_str_data(phyto, name)

In [10]:
# Cast the cruise/leg/cast counters and the local date parts to int.
for name in ('Cruise', 'Leg', 'Cast', 'Day_local', 'Month_local', 'Year_local'):
    phyto[name] = phyto[name].astype('int')

In [11]:
# Parse the local and UTC timestamp variables into pandas datetimes.
# Each variable is converted exactly once; the cell previously repeated both
# assignments, which only re-stringified and re-parsed already-converted values.
for name in ('Datetime_local', 'Datetime_UTC'):
    phyto[name].values = pd.to_datetime(phyto[name].astype('str').values)

In [12]:
# AphiaID is parsed as a number and then cast to int.
# NOTE(review): convert_numeric_data coerces unparseable entries to NaN before
# this int cast — confirm the column contains no such entries.
phyto['AphiaID'].values = convert_numeric_data(phyto, 'AphiaID').astype(int)

# The position, per-depth and total variables are coerced to numeric as-is.
for name in (
    'Latitude', 'Longitude',
    'd_1m', 'd_7m', 'd_15m', 'd_25m', 'd_35m', 'd_55m', 'd_75m', 'd_100m',
    'Total_sum',
):
    phyto[name].values = convert_numeric_data(phyto, name)

In [13]:
# Display the dataset after the conversions above.
phyto

In [14]:
# Convert the xarray dataset into a pandas DataFrame.
phyto_df = phyto.to_dataframe()

In [18]:
# Stack the columns into one long Series and display it.
phyto_df.stack()

unlimited             
0          Cruise               1
           Cruise_ID_2    CAR-001
           Leg                  2
           Cast                 2
           Day_local            8
                           ...   
96840      d_35m                0
           d_55m                0
           d_75m                0
           d_100m               0
           Total_sum            0
Length: 2516627, dtype: object

In [16]:
# Write the phytoplankton DataFrame to a CSV file.
phyto_df.to_csv("phytoplankton.csv")

# 2. Biogeochemistry 

In [2]:
# Open the biogeochembact NetCDF file as an xarray dataset.
biogeochembact = xr.open_dataset("biogeochembact.netcdf")

In [3]:
# Names of the variables that are coerced to numeric values further down.
biogeochembact_numeric = [
    'year', 'month', 'day', 'hour', 'decimal_year',
    'Longitude', 'Latitude',
    'Cruise_number', 'Cruise_Leg', 'Cast_number',
    'Depth', 'Corrected_Depth',
    'Winkler_O2_avg', 'Winkler_O2_flags',
    'NO3', 'NO2', 'NH4', 'PO4', 'H2S', 'CH4',
    'Acetate', 'Propionate',
    'Incorporation_Acetate_Uptake_rate_constant',
    'Respiration_Acetate_Uptake_rate_constant',
    'Total_Acetate_uptake_rate_constant',
    'Sulfite', 'SD_Sulfite',
    'Thiosulfate', 'SD_Thiosulfate',
    'Particulate_Elemental_sulfur', 'SD_Particulate_Elemental_sulfur',
    'Total_zero_valent_sulfur', 'SD_Total_zero_valent_sulfur',
    'Total_Prokaryote_Cell_Density', 'SD_Total_Prokaryotes',
    'Total_Prokaryote_Biomass_Estimates', 'SD_Prokaryote_Biomass_Estimates',
    'Cyanobacteria', 'SD_Cyanobacteria',
    'Methanogens', 'SD_Methanogens',
    'Flagellated_Protists', 'SD_Flagellated_Protists',
    'Ciliated_Protists', 'SD_Ciliated_Protists',
    'Viral_Like_Particles_VLP', 'SD_Viral_Like_Particles_VLP',
    'Heterotrophic_Bacterial_Production', 'SD_Heterotrophic_Bacterial_Production',
    'Dark_carbon_fixation_rate', 'SD_Dark_carbon_fixation_rate',
    'dissolved_Mn', 'SD_dissolved_Mn',
    'dissolved_Fe', 'SD_dissolved_Fe',
]

# Names of the two timestamp variables.
biogeochembact_datetime = [
    'ISO_DateTime_UTC',
    'ISO_DateTime_local',
]

# Names of the variables that are decoded from bytes to text further down.
biogeochembact_string = [
    'Station',
    'Biogeochemical_Cruise_ID',
    'Core_CARIACO_ID',
    'Notes',
    'Comments',
    'H2S_comment',
    'CH4_comment',
    'Particulate_element_sulfur_flags',
    'Total_zero_valent_Sulfur_flags',
]

In [4]:
# Parse the UTC timestamps, then call tz_convert(None) on the parsed index so
# the stored values carry no timezone.
biogeochembact['ISO_DateTime_UTC'].values = (
    pd.to_datetime(biogeochembact['ISO_DateTime_UTC'].astype('str').values)
    .tz_convert(None)
)

In [5]:
# Parse the local timestamps; no timezone conversion is applied here.
biogeochembact['ISO_DateTime_local'].values = pd.to_datetime(
    biogeochembact['ISO_DateTime_local'].astype('str').values
)

In [9]:
# Coerce every variable named in biogeochembact_numeric to numeric values.
for name in biogeochembact_numeric:
    biogeochembact[name].values = convert_numeric_data(biogeochembact, name)

In [15]:
# Decode every variable named in biogeochembact_string from bytes to text.
for name in biogeochembact_string:
    biogeochembact[name].values = convert_str_data(biogeochembact, name)

In [16]:
# Display the dataset after the conversions above (a .chunk() call is left commented out).
biogeochembact #.chunk()

In [17]:
# Convert the dataset to a DataFrame and write it to a CSV file.
biogeochembact.to_dataframe().to_csv('biogeochembact_dtypes.csv')

# 3. Niskin

In [12]:
# Open the Niskin NetCDF file as an xarray dataset and show its dataset-level attributes.
niskin = xr.open_dataset("niskin.netcdf")
niskin.attrs

{}

In [5]:
# Names of the variables that are coerced to numeric values further down.
niskin_numeric = [
    'Cruise_number', 'Leg', 'Day', 'Month', 'Year', 'Latitude', 'Longitude',
    'Hydro_cast_no', 'Depth_target', 'Depth_real',
    'O2_ml_L', 'O2_umol_kg',
    'NO3_UDO', 'PO4_UDO', 'SiO4_UDO',
    'NH4_USF', 'NO2_USF', 'NO3_NO2_USF', 'PO4_USF', 'SiO4_USF',
    'pH', 'Alkalinity_mol_kg', 'Alkalinity_umol_kg', 'TCO2', 'fCO2',
    'pH_corrected', 'TCO2_corrected', 'fCO2_corrected',
    'Salinity_bottles', 'Salinity_CTD', 'Temperature', 'Sigma_t',
    'TPP', 'PIP',
    'POC_ug_kg', 'PON_ug_kg', 'POC_ug_L', 'PN_ug_L', 'C_N_particulate',
    'DON', 'DOP', 'DOC', 'TOC',
    'PrimaryProductivity', 'Chlorophyll', 'Phaeopigments',
    'Total_Prokaryotes', 'Bact_Biomass_mgC_m3', 'Bact_Biomass_uMC',
    'Bio_cast_no',
]

# Names of the timestamp variables: local ones first, then UTC.
niskin_datetime = [
    'ISO_DateTime_start_hc_local',
    'ISO_DateTime_end_hc_local',
    'ISO_DateTime_start_bc_local',
    'ISO_DateTime_end_bc_local',
    'ISO_DateTime_start_hc_UTC',
    'ISO_DateTime_end_hc_UTC',
    'ISO_DateTime_start_bc_UTC',
    'ISO_DateTime_end_bc_UTC',
]

# Names of the variables that are decoded from bytes to text further down.
niskin_string = [
    'Cruise_ID_1',
    'Cruise_ID_2',
]


In [6]:
# Parse the four UTC start/end timestamp variables into pandas datetimes.
# No tz_convert is applied to the parsed values in this cell.
for name in (
    'ISO_DateTime_start_hc_UTC',
    'ISO_DateTime_end_hc_UTC',
    'ISO_DateTime_start_bc_UTC',
    'ISO_DateTime_end_bc_UTC',
):
    niskin[name].values = pd.to_datetime(niskin[name].astype('str').values)

In [7]:
# Parse the four local start/end timestamp variables into pandas datetimes.
for name in (
    'ISO_DateTime_start_hc_local',
    'ISO_DateTime_end_hc_local',
    'ISO_DateTime_start_bc_local',
    'ISO_DateTime_end_bc_local',
):
    niskin[name].values = pd.to_datetime(niskin[name].astype('str').values)


In [8]:
# Coerce every variable named in niskin_numeric to numeric values.
for name in niskin_numeric:
    niskin[name].values = convert_numeric_data(niskin, name)

In [9]:
# Decode every variable named in niskin_string from bytes to text.
for name in niskin_string:
    niskin[name].values = convert_str_data(niskin, name)

In [11]:
# Show the attributes attached to the Chlorophyll variable.
niskin.Chlorophyll.attrs

{}

In [27]:
#niskin.to_dataframe().to_csv('niskin_dtypes.csv')

# 4. CTD

In [14]:
# Open the CTD NetCDF file as an xarray dataset.
ctd = xr.open_dataset("ctd.netcdf")

In [101]:
# List the dataset's data variables with their dimensions and dtypes.
ctd.data_vars

Data variables:
    cruise_no   (unlimited) float64 ...
    Cruise_ID1  (unlimited) |S513 ...
    Cruise_ID2  (unlimited) |S513 ...
    Year        (unlimited) float64 ...
    Month       (unlimited) float64 ...
    Day         (unlimited) float64 ...
    Date        (unlimited) |S513 ...
    Latitude    (unlimited) float64 ...
    Longitude   (unlimited) float64 ...
    press       (unlimited) float64 ...
    depth       (unlimited) float64 ...
    temp        (unlimited) |S513 ...
    sal         (unlimited) float64 ...
    potemp      (unlimited) |S513 ...
    sigma_t     (unlimited) |S513 ...
    sigma_0     (unlimited) |S513 ...
    O2_ml_L     (unlimited) |S513 ...
    beam_cp     (unlimited) |S513 ...
    beam_att    (unlimited) |S513 ...
    fluor_CTD   (unlimited) float64 ...
    fluor_chla  (unlimited) |S513 ...

# 5. Zooplankton

In [102]:
# Open the zooplankton NetCDF file as an xarray dataset.
zooplankton = xr.open_dataset("zooplankton.netcdf")

In [103]:
# List the dataset's data variables with their dimensions and dtypes.
zooplankton.data_vars

Data variables:
    Cruise             (unlimited) float64 ...
    Cruise_ID          (unlimited) |S513 ...
    Day                (unlimited) float64 ...
    Month              (unlimited) float64 ...
    Year               (unlimited) float64 ...
    Date               (unlimited) |S513 ...
    Latitude           (unlimited) float64 ...
    Longitude          (unlimited) float64 ...
    Analyst            (unlimited) |S513 ...
    Mesh_Size          (unlimited) float64 ...
    TOTAL_DENSITY      (unlimited) float64 ...
    BIOMASS            (unlimited) float64 ...
    ASH                (unlimited) float64 ...
    COPEPODS           (unlimited) float64 ...
    CALANOIDS          (unlimited) |S513 ...
    CYCLOPOIDA         (unlimited) |S513 ...
    HAPARCTICOIDA      (unlimited) |S513 ...
    POECILOSTOMATOIDA  (unlimited) |S513 ...
    L_FISH             (unlimited) float64 ...
    H_FISH             (unlimited) float64 ...
    CHAETOGNATHA       (unlimited) float64 ...
    CLADOCE

# 6. Sediment Traps

In [104]:
# Open the sediment-trap NetCDF file as an xarray dataset.
sediment_trap = xr.open_dataset("Sediment_Trap.netcdf")

In [105]:
# List the dataset's data variables with their dimensions and dtypes.
sediment_trap.data_vars

Data variables:
    trap_ID     (unlimited) |S513 ...
    depth_trap  (unlimited) float64 ...
    lon         (unlimited) float64 ...
    lat         (unlimited) float64 ...
    sample_num  (unlimited) float64 ...
    date_open   (unlimited) float64 ...
    Year        (unlimited) float64 ...
    Month       (unlimited) float64 ...
    Day         (unlimited) float64 ...
    duration_d  (unlimited) float64 ...
    MF_Total    (unlimited) |S513 ...
    MF_Corg     (unlimited) |S513 ...
    MF_CaCO3    (unlimited) |S513 ...
    MF_Sibio    (unlimited) |S513 ...
    MF_Terr     (unlimited) |S513 ...
    MF_N        (unlimited) |S513 ...