In [38]:
from typing import Union, Optional, Dict
from pathlib import Path
import json
import pandas as pd

In [32]:
def read_file(
    data_filepath: Union[str, Path],
    site: str,
    network: str,
    inlet: Optional[str] = None,
    instrument: Optional[str] = "shinyei",
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
    site_data_filepath: Optional[Union[str, Path]] = None,
) -> Dict:
    """Read a BEACO2N data file and split it into per-species records.

    Args:
        data_filepath: Path to the BEACO2N CSV file.
        site: Site name; it is upper-cased before being looked up in the
            site metadata JSON.
        network: Accepted but not used by this function; the metadata
            network is always "beaco2n".
        inlet: Accepted but not used by this function; the metadata inlet
            is always "NA".
        instrument: Accepted but not used by this function.
        sampling_period: Sampling period stored in the metadata. Recorded as
            "NOT_SET" when None.
        measurement_type: Accepted but not used by this function.
        site_data_filepath: Path to the BEACO2N site metadata JSON. When
            None, the previously hard-coded location is used.
    Returns:
        dict: Keyed by species ("pm" and "co2"). Each value is a dict with
            "data" (DataFrame indexed by "time" holding the measurement and
            its "_qc" column), "metadata" (units, site, species, inlet,
            network, sampling_period) and "attributes" (a per-species copy of
            the site's JSON entry plus a "comment" key).
    Raises:
        ValueError: If pandas raises a ValueError while reading the CSV
            (for example when the expected columns are missing), or if
            ``site`` has no entry in the site metadata JSON.
    """
    import json
    from collections import defaultdict
    from typing import DefaultDict

    import pandas as pd
    from numpy import nan as np_nan

    if sampling_period is None:
        sampling_period = "NOT_SET"

    if site_data_filepath is None:
        # NOTE: machine-specific default kept only for backward compatibility;
        # pass site_data_filepath explicitly to run this anywhere else.
        site_data_filepath = "/Users/gar/Documents/Devel/openghg/openghg/data/beaco2n_site_data.json"

    rename_cols = {
        "datetime": "time",
        "PM_ug/m3": "pm",
        "PM_ug/m3_QC_level": "pm_qc",
        "co2_ppm": "co2",
        "co2_ppm_QC_level": "co2_qc",
        "co_ppm": "co",
        "co_ppm_QC_level": "co_qc",
    }

    # Positional columns: 1 is the "datetime" column, 5-10 are the
    # measurement / QC-level pairs.
    use_cols = [1, 5, 6, 7, 8, 9, 10]
    data_filepath = Path(data_filepath)

    try:
        # The dict form of parse_dates (combining columns into a new one) is
        # deprecated in pandas, so the column is parsed under its own name
        # here and renamed to "time" below.
        data = pd.read_csv(
            data_filepath,
            parse_dates=["datetime"],
            na_values=[-999.0, "1a"],
            usecols=use_cols,
        )
    except ValueError as e:
        raise ValueError(
            f"Unable to read data file, please make sure it is in the standard BEACO2N format.\nError: {e}"
        ) from e

    data = data.rename(columns=rename_cols).set_index("time")

    beaco2n_site_data = json.loads(Path(site_data_filepath).read_text())

    try:
        site_metadata = beaco2n_site_data[site.upper()]
    except KeyError:
        raise ValueError(f"Site {site} not recognized.") from None

    site_metadata["comment"] = "Retrieved from http://beacon.berkeley.edu/"

    # Set all values below zero to NaN
    data[data < 0] = np_nan

    # "co" columns are read and renamed but are not part of the output.
    measurement_types = ["pm", "co2"]
    units = {"pm": "ug/m3", "co2": "ppm"}

    gas_data: DefaultDict[str, Dict[str, Union[pd.DataFrame, Dict]]] = defaultdict(dict)
    for mt in measurement_types:
        # Rows with NaN values are kept; no dropna is applied.
        m_data = data[[mt, f"{mt}_qc"]]

        species_metadata = {
            "units": units[mt],
            "site": str(site),
            "species": str(mt),
            "inlet": "NA",
            "network": "beaco2n",
            "sampling_period": str(sampling_period),
        }

        gas_data[mt]["data"] = m_data
        gas_data[mt]["metadata"] = species_metadata
        # Shallow copy so that editing one species' attributes does not
        # silently change the other's.
        gas_data[mt]["attributes"] = dict(site_metadata)

    # TODO - add CF Compliant attributes?

    return gas_data

In [33]:
# Sample BEACO2N CSV to parse. This is an absolute, user-specific path, so it
# must be changed before the notebook can run on another machine.
data_path = "/Users/gar/Sync/web-scrape/beaco2n/data/174_HILLPARKSECONDARYSCHOOL.csv"

In [34]:
# Parse the sample file into per-species records. read_file does not use the
# `network` or `inlet` arguments: the returned metadata always carries
# network "beaco2n" and inlet "NA", regardless of the values passed here.
data = read_file(data_filepath=data_path, site="HILLPARKSECONDARYSCHOOL", network="BEACO2N", inlet="50m")

In [35]:
# Display the parsed result: a dict keyed by species with "data", "metadata"
# and "attributes" entries.
data

defaultdict(dict,
            {'pm': {'data':                        pm  pm_qc
              time                            
              2021-07-28 09:00:00   NaN    NaN
              2021-07-28 10:00:00   2.8    NaN
              2021-07-28 11:00:00   3.9    NaN
              2021-07-28 12:00:00   2.5    NaN
              2021-07-28 13:00:00   2.9    NaN
              ...                   ...    ...
              2021-09-29 11:00:00   5.9    NaN
              2021-09-29 12:00:00   6.4    NaN
              2021-09-29 13:00:00   4.8    NaN
              2021-09-29 14:00:00   8.9    NaN
              2021-09-29 15:00:00  10.3    NaN
              
              [1516 rows x 2 columns],
              'metadata': {'units': 'ug/m3',
               'site': 'HILLPARKSECONDARYSCHOOL',
               'species': 'pm',
               'inlet': 'NA',
               'network': 'beaco2n',
               'sampling_period': 'NOT_SET'},
              'attributes': {'deployed': '2021-07-28',
        

In [41]:
# Name of the parsed time index -> raw CSV column(s) it is built from.
datetime_columns = {"time": ["datetime"]}
# Raw BEACO2N column names -> short names. Defined here but not applied in
# this cell, so the frame below keeps the raw column names.
rename_cols = {
    "PM_ug/m3": "pm",
    "PM_ug/m3_QC_level": "pm_qc",
    "co2_ppm": "co2",
    "co2_ppm_QC_level": "co2_qc",
    "co_ppm": "co",
    "co_ppm_QC_level": "co_qc",
}

# use_cols = [1, 5, 6, 7, 8, 9, 10]
data_path = Path(data_path)

# Load every column of the raw file (no usecols, no na_values) to inspect it.
# The dict form of parse_dates is deprecated in pandas, so the source column
# is parsed under its own name and then renamed and promoted to the index.
time_index_name = "time"
raw_time_column = datetime_columns[time_index_name][0]

data = (
    pd.read_csv(data_path, parse_dates=[raw_time_column])
    .rename(columns={raw_time_column: time_index_name})
    .set_index(time_index_name)
)

In [42]:
# Display the raw DataFrame: all CSV columns, indexed by the parsed "time".
data

Unnamed: 0_level_0,local_timestamp,node_id,epoch,julian_day,PM_ug/m3,PM_ug/m3_QC_level,co_ppm,co_ppm_QC_level,co2_ppm,co2_ppm_QC_level
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-07-28 09:00:00,2021-07-28 02:00:00,174,1.627463e+09,209.375000,-999.0,,-999.00,1a,-999.0,
2021-07-28 10:00:00,2021-07-28 03:00:00,174,1.627466e+09,209.416667,2.8,,-999.00,1a,416.6,
2021-07-28 11:00:00,2021-07-28 04:00:00,174,1.627470e+09,209.458333,3.9,,0.02,1a,417.3,
2021-07-28 12:00:00,2021-07-28 05:00:00,174,1.627474e+09,209.500000,2.5,,0.04,1a,417.2,
2021-07-28 13:00:00,2021-07-28 06:00:00,174,1.627477e+09,209.541667,2.9,,0.03,1a,416.0,
...,...,...,...,...,...,...,...,...,...,...
2021-09-29 11:00:00,2021-09-29 04:00:00,174,1.632913e+09,272.458333,5.9,,0.25,1a,421.2,
2021-09-29 12:00:00,2021-09-29 05:00:00,174,1.632917e+09,272.500000,6.4,,0.27,1a,418.1,
2021-09-29 13:00:00,2021-09-29 06:00:00,174,1.632920e+09,272.541667,4.8,,0.31,1a,421.5,
2021-09-29 14:00:00,2021-09-29 07:00:00,174,1.632924e+09,272.583333,8.9,,0.34,1a,418.4,
