# Quality Controlling a generic data object

### Objective:

This notebook shows how to run CoTeDe quality control on a generic data object: a NetCDF file opened with xarray, or a pandas DataFrame.

In [76]:
# Import CoTeDe and print the installed version, so the outputs below can be tied to a release
import cotede
print("CoTeDe version: {}".format(cotede.__version__))

CoTeDe version: 0.20.2


In [77]:
# Install xarray and pandas, both of which are imported below.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel,
# so the packages are importable from this notebook.
%pip install xarray pandas



In [79]:
# Import requirements

from datetime import datetime

import numpy as np
from numpy import ma
from cotede.qc import ProfileQC
import pandas as pd
import xarray as xr

# xarray/netcdf

In [80]:
# Open the NetCDF file 'gak.nc' with xarray; the bare name on the last line
# makes the notebook display the resulting object.
mydata = xr.open_dataset('gak.nc')
mydata

In [81]:
# QC configuration passed to ProfileQC.
# Every key other than 'main' names a variable to check and maps each test name
# to that test's parameters. For 'global_range', values below `minval` or above
# `maxval` are the ones that get flagged.
cfg = {
    # 'valid_datetime' is listed with no parameters (None)
    'main': {'valid_datetime': None},
    'sea_water_temperature': {
        'global_range': {'minval': 4.0, 'maxval': 5.0},
    },
    'sea_water_practical_salinity': {
        'global_range': {'minval': 29.0, 'maxval': 32.0},
    },
}
cfg

{'main': {'valid_datetime': None},
 'sea_water_temperature': {'global_range': {'minval': 4.0, 'maxval': 5.0}},
 'sea_water_practical_salinity': {'global_range': {'minval': 29.0,
   'maxval': 32.0}}}

In [82]:
# Run the configured QC checks on the xarray data.
# An empty dict is passed as `attributes`.
# NOTE(review): presumably this stands in for profile metadata that CoTeDe would
# otherwise look up on the input object — confirm against the CoTeDe documentation.
pqced = ProfileQC(mydata, cfg=cfg, attributes={})

  flag[np.nonzero(feature < minval)] = self.flag_bad
  flag[np.nonzero(feature > maxval)] = self.flag_bad
  flag[np.nonzero(idx)] = self.flag_good
  flag[np.nonzero(feature < minval)] = self.flag_bad
  flag[np.nonzero(feature > maxval)] = self.flag_bad
  flag[np.nonzero(idx)] = self.flag_good


In [83]:
# List the top-level entries of the flags dictionary produced by the QC run:
# one per configured variable, plus a 'common' entry.
pqced.flags.keys()

dict_keys(['common', 'sea_water_practical_salinity', 'sea_water_temperature'])

In [84]:
# List the flag sets stored for temperature: one per test, plus an 'overall' entry
pqced.flags['sea_water_temperature'].keys()

dict_keys(['valid_datetime', 'global_range', 'overall'])

In [85]:
# Temperature flags from the global_range test, transposed so they print as one row.
# NOTE(review): presumably 1 marks values inside the range and 4 values outside — confirm.
pqced.flags['sea_water_temperature']['global_range'].T

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 4, 4, 4, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1]], dtype=int8)

In [73]:
# Temperature flags stored under 'overall', transposed so they print as one row.
# NOTE(review): presumably this combines the individual tests' flags — confirm.
pqced.flags['sea_water_temperature']['overall'].T

array([[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3]], dtype=int8)

# pandas dataframe

In [89]:
# Load the CSV file 'gak.csv' into a pandas DataFrame and preview the first rows.
# skiprows=[1] drops the line immediately after the header row.
# NOTE(review): presumably that line holds units rather than data — confirm against gak.csv.
df = pd.read_csv('gak.csv', skiprows=[1])
df.head()

Unnamed: 0,time,latitude,longitude,z,sea_water_practical_salinity,sea_water_temperature
0,2020-02-28T06:17:00Z,59.85,-149.5,0.0,31.31,4.09
1,2020-02-28T03:17:00Z,59.85,-149.5,0.0,31.41,4.29
2,2020-02-28T00:17:00Z,59.85,-149.5,0.0,31.52,4.49
3,2020-02-27T21:17:00Z,59.85,-149.5,0.0,31.55,4.53
4,2020-02-27T18:17:00Z,59.85,-149.5,0.0,31.55,4.52


In [90]:
# Run the same QC configuration on the pandas DataFrame.
# NOTE(review): as recorded in this cell's output, the call raises
# "ValueError: Length of passed values is 1, index implies 45." in this environment.
# Possibly CoTeDe's range check applies np.nonzero to a pandas Series (the same
# np.nonzero lines appear in the output of the xarray run) — TODO confirm; if so,
# pass NumPy arrays instead of Series, or use a CoTeDe release that handles Series.
pqced_df = ProfileQC(df, cfg=cfg, attributes={})

ValueError: Length of passed values is 1, index implies 45.