In [2]:
#helpful link about working with netcdf4: https://iescoders.com/writing-netcdf4-data-in-python/
#helpful link for strptime formatting: https://www.journaldev.com/23365/python-string-to-datetime-strptime
#helpful link with examples of netcdf4.date2num(): https://www.programcreek.com/python/example/89490/netCDF4.date2num
#a few imports and installations:
import netCDF4 as nc
from netCDF4 import Dataset as NetCDFFile
from netCDF4 import date2num
import xarray as xr
from datetime import datetime as dt
import numpy as np
import os

# Folder that holds the source NetCDF file and receives the rebuilt output file.
# NOTE: the previous `!cd <folder>` line had no lasting effect -- every `!` command runs
# in its own subshell -- so explicit paths are used instead of relying on the kernel's
# working directory.
mydirectory = '/Users/bennysmith/Documents/Climate_Science/Research_with_Baylor'
output_path = mydirectory+'/test.nc'

# Start from a clean slate: delete the output of a previous run, if there is one.
# (Unlike `!rm`, this stays silent when the file does not exist yet.)
if os.path.exists(output_path):
    os.remove(output_path)

#opening the .nc file with the data I'd like to use:
ncstart = NetCDFFile(mydirectory+'/DOH_2018_SAFE.nc')
#defining the variables stored in the original netcdf:
BeachID_start = ncstart.variables['BeachID']
CollectionTime_start = ncstart.variables['CollectionTime']
Duplicate_start = ncstart.variables['Duplicate']
LabID_start = ncstart.variables['LabID']
SampleID_start = ncstart.variables['SampleID']
Sampler_start = ncstart.variables['Sampler']
StationID_start = ncstart.variables['StationID']
SWresults_start = ncstart.variables['SWresults']
Test_Type_start = ncstart.variables['Test_Type']
Validation_start = ncstart.variables['Vaildation'] #there's a typo in the original spreadsheet
WaterType_start = ncstart.variables['WaterType']

#converting the strings representing time in the original netcdf file to numbers. Time will be measured in minutes since 2018-05-24 13:20:00 in this dataset.
# Each collection time is a string such as '05/24/18 01:20 PM' (format '%m/%d/%y %I:%M %p').
timelist = [dt.strptime(stamp, '%m/%d/%y %I:%M %p') for stamp in CollectionTime_start]
timelist2 = date2num(timelist, 'minutes since 2018-05-24 13:20:00')


#Making the netcdf file (written next to the source file, where the read-back cell looks for it):
mync = nc.Dataset(output_path, 'w', format='NETCDF4')
time_dim = mync.createDimension('row', None) # None = unlimited length

#Below I create a bunch of variables. All are set up to depend on the row dimension alone.
#Data types: float64 for time, builtin str (variable-length strings) for the ID columns, int64 for the rest.
#The builtin `str` replaces `np.str`, which was only an alias for it and has been removed from recent NumPy releases.
#Helpful link about data types: https://pbpython.com/pandas_dtypes.html
time_var = mync.createVariable('time', np.float64, ('row',))
BeachID_var = mync.createVariable('BeachID', str, ('row',))
Duplicate_var = mync.createVariable('Duplicate', np.int64, ('row',))
LabID_var = mync.createVariable('LabID', np.int64, ('row',))
SampleID_var = mync.createVariable('SampleID', str, ('row',))
Sampler_var = mync.createVariable('Sampler', np.int64, ('row',))
StationID_var = mync.createVariable('StationID', str, ('row',))
SWresults_var = mync.createVariable('SWresults', np.int64, ('row',))
Test_Type_var = mync.createVariable('Test_Type', np.int64, ('row',))
Validation_var = mync.createVariable('Validation', np.int64, ('row',))
WaterType_var = mync.createVariable('WaterType', np.int64, ('row',))

#The [:] on the left-hand side is very important: it writes the values INTO the netCDF variable.
#Without it, the assignment would just rebind the Python name to the array and nothing would be saved to the new netcdf.
time_var[:] = timelist2[:]
BeachID_var[:] = BeachID_start[:]
Duplicate_var[:] = Duplicate_start[:]
LabID_var[:] = LabID_start[:]
SampleID_var[:] = SampleID_start[:]
Sampler_var[:] = Sampler_start[:]
StationID_var[:] = StationID_start[:]
SWresults_var[:] = SWresults_start[:]
Test_Type_var[:] = Test_Type_start[:]
Validation_var[:] = Validation_start[:]
WaterType_var[:] = WaterType_start[:]

#Global (dataset-level) attributes, set in one call. Not sure if these are correct, need to check them.
mync.setncatts({
    'title': 'DOH Data for EPA 2018',
    'history': 'Converted from Excel spreadsheet provided by the Rhode Island Department of Health',
    'institution': 'Rhode Island Department of Health',
    'source': 'Rhode Island Department of Health lab measurements using samples from beach locations',
    'comment': 'See elsewhere on the RIDDC ERDDAP server for information about a project by Brown University undergraduate Ella Wood using this data.',
    'references': 'Data from Sherry Poucher at Rhode Island Department of Health, Sherry.Poucher@health.ri.gov',
})


#Defining attributes of each dataset variable.
#NOTE(review): apart from 'time', the standard_name values below are made-up labels, not entries
#from the CF standard name table -- confirm whether they should stay, or whether long_name alone is enough.

#actual_range is stored as a numeric [min, max] pair with the same type as the variable,
#computed from the data instead of being typed in by hand as a string.
time_var._CoordinateAxisType = 'Time'
time_var.actual_range = np.array([np.min(timelist2), np.max(timelist2)], dtype=np.float64)
time_var.axis = 'T'
time_var.calendar = 'standard'
time_var.ioos_category = 'Time' #not sure what ioos_category means
time_var.long_name = 'EDT Sample Time' #I need to double check this...
time_var.short_name = 'time'
time_var.standard_name = 'time'
time_var.time_origin = '2018-05-24 13:20:00'
time_var.units = 'minutes since 2018-05-24 13:20:00' #must match the units string passed to date2num

BeachID_var.long_name = 'Beach ID'
BeachID_var.standard_name = 'Beach_ID'

#Still need to figure out what the Duplicate variable means. Add an explanation attribute?
Duplicate_var.long_name = 'Duplicate'
Duplicate_var.standard_name = 'Duplicate'

LabID_var.long_name = 'Lab ID'
LabID_var.standard_name = 'Lab_ID'

SampleID_var.long_name = 'Sample ID'
SampleID_var.standard_name = 'Sample_ID'

#Still need to figure out what Sampler means. Add an explanation attribute?
Sampler_var.long_name = 'Sampler'
Sampler_var.standard_name = 'Sampler'

#Add latitude & longitude based on the Station IDs?
StationID_var.long_name = 'Station ID'
StationID_var.standard_name = 'Station_ID'

#Change SWresults variable name to EnterococciCounts?
SWresults_values = SWresults_start[:] #same values that were copied into SWresults_var
SWresults_var.actual_range = np.array([SWresults_values.min(), SWresults_values.max()], dtype=np.int64)
SWresults_var.long_name = 'Enterococci Counts in colony-forming units'
SWresults_var.short_name = 'ECounts'
SWresults_var.standard_name = 'Enterococci_Counts_in_colony-forming_units'
SWresults_var.units = 'colony-forming unit'

#cite someone named Sonia for the following info??
Test_Type_var.long_name = 'Test Type'
Test_Type_var.standard_name = 'Test_Type'
Test_Type_var.explanation = 'Test_Type = 1 means the Enterolert method. Test_Type = 2 means the membrane filtration method, which is no longer used.'

#Need to figure out what Validation means. Add an explanation attribute?
Validation_var.long_name = 'Validation'
Validation_var.standard_name = 'Validation'

#Need to figure out what WaterType means. Add an explanation attribute?
WaterType_var.long_name = 'Water Type'
WaterType_var.standard_name = 'Water_Type'

#Summary of the new dataset: global attributes, dimensions and variables.
print(mync)

mync.close() #this is a VERY important line!! If you don't close the file, not all the data gets written
#Also release the source file, which this cell opened read-only and no longer needs.
#(After this, the *_start variable handles can no longer be read.)
ncstart.close()

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    title: DOH Data for EPA 2018
    history: Converted from Excel spreadsheet provided by the Rhode Island Department of Health
    institution: Rhode Island Department of Health
    source: Rhode Island Department of Health lab measurements using samples from beach locations
    comment: See elsewhere on the RIDDC ERDDAP server for information about a project by Brown University undergraduate Ella Wood using this data.
    references: Data from Sherry Poucher at Rhode Island Department of Health, Sherry.Poucher@health.ri.gov
    dimensions(sizes): row(1461)
    variables(dimensions): float64 [4mtime[0m(row), <class 'str'> [4mBeachID[0m(row), int64 [4mDuplicate[0m(row), int64 [4mLabID[0m(row), <class 'str'> [4mSampleID[0m(row), int64 [4mSampler[0m(row), <class 'str'> [4mStationID[0m(row), int64 [4mSWresults[0m(row), int64 [4mTest_Type[0m(row), int64 [4mValidation[0m(row), int64 [

In [1]:
from netCDF4 import Dataset as NetCDFFile
import matplotlib.pyplot as plt
import xarray as xr

mydirectory = '/Users/bennysmith/Documents/Climate_Science/Research_with_Baylor'

#Sanity check of the file written by the conversion cell: read the columns back into memory.
#The `with` block closes the file as soon as the arrays are loaded, so no handle is left open.
#Because of that, StationID is loaded as an in-memory array here rather than kept as a live variable handle.
with NetCDFFile(mydirectory+'/test.nc') as ncnew:
    SWresults = ncnew.variables['SWresults'][:]
    StationID = ncnew.variables['StationID'][:]
    time = ncnew.variables['time'][:]

SWresults_list = list(SWresults) #plain Python list of the counts

#Largest and smallest Enterococci count, taken directly from the array.
print(SWresults.max())
print(SWresults.min())
#plt.plot(time, SWresults)

24200
10
