## **LakeBeD-US Benchmark: Low Frequency Data Wrangling**

## **Setup**

In [1]:
# IMPORT PACKAGES
import os
import pandas as pd

In [2]:
# SET WORKING DIRECTORY
# Every relative parquet path used later in this notebook is resolved against this folder.
WRANGLING_DIR = "/projects/ml4science/LakeBeD-US/LakeBeD-US-CSE-Benchmark/Data/1-Data-Wrangling"
os.chdir(WRANGLING_DIR)

In [3]:
# Values of the `flag` column that are kept when the observations are filtered.
# NOTE(review): the meaning of each code is not documented in this notebook —
# confirm against the dataset's flag definitions.
acceptable_flags = [
    0,
    5,
    10,
    19,
    23,
    25,
    32,
    43,
    47,
    51,
    52,
]

## **Wrangling `ME_NTL_2D.parquet`**

In [4]:
# Load the raw low-frequency table; the path is relative to the working directory.
raw_parquet_path = "ME_NTL_2D.parquet"
me_ntl = pd.read_parquet(raw_parquet_path)
me_ntl

variable,datetime,depth,flag,chla_ugl,dic,do,doc,drp,nh4,no3no2,par,poc,temp,tn,tp
0,1995-05-09 12:00:00+00:00,0.0,32,,,13.2,,,,632.0,,,7.8,,0.084
1,1995-05-09 12:00:00+00:00,0.0,46,,,,,,0.079,,,,,,
2,1995-05-09 12:00:00+00:00,1.0,32,,,13.1,,,,,,,7.8,,
3,1995-05-09 12:00:00+00:00,2.0,32,,,13.2,,,,,,,7.8,,
4,1995-05-09 12:00:00+00:00,3.0,32,,,13.2,,,,,,,7.8,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14794,2023-09-18 12:00:00+00:00,-99.0,46,6.0,,,,,,,,,,,
14795,2023-10-02 12:00:00+00:00,-99.0,46,6.9,,,,,,,,,,,
14796,2023-10-23 12:00:00+00:00,-99.0,46,5.0,,,,,,,,,,,
14797,2023-11-01 12:00:00+00:00,-99.0,46,2.8,,,,,,,,,,,


## **Filtering**

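Since we selected observations from a depth of 1.0 meters from the 
high-frequency data, we need to do the same for the low-frequency data.
We also keep only the observations whose `flag` value is listed in 
`acceptable_flags`, and then drop the `depth` and `flag` columns.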

In [7]:
# Keep rows recorded at exactly 1.0 m whose flag is in the accepted list,
# then discard the two columns that were only needed for the filtering.
at_one_meter = me_ntl["depth"] == 1.0
has_acceptable_flag = me_ntl["flag"].isin(acceptable_flags)
me_ntl = me_ntl.loc[at_one_meter & has_acceptable_flag].drop(columns=["depth", "flag"])
me_ntl

variable,datetime,chla_ugl,dic,do,doc,drp,nh4,no3no2,par,poc,temp,tn,tp
2,1995-05-09 12:00:00+00:00,,,13.1,,,,,,,7.8,,
28,1995-05-23 12:00:00+00:00,,,9.2,,,,,,,14.0,,
54,1995-06-06 12:00:00+00:00,,,10.1,,,,,,,18.7,,
80,1995-06-21 12:00:00+00:00,,,15.6,,,,,,,27.4,,
98,1995-07-06 12:00:00+00:00,,,8.0,,,,,,,21.5,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14637,2022-09-06 12:00:00+00:00,,,9.4,,,,,,,22.8,,
14670,2022-09-19 12:00:00+00:00,,,11.0,,,,,,,21.5,,
14698,2022-10-04 12:00:00+00:00,,,12.7,,,,,,,17.7,,
14725,2022-10-20 12:00:00+00:00,,,8.3,,,,,,,12.0,,


Let's check the percentage of missing values for each variable.

In [8]:
# Percentage of rows with a missing value, per column.
n_rows = len(me_ntl)
me_ntl.isna().sum().div(n_rows).mul(100)

variable
datetime      0.000000
chla_ugl    100.000000
dic         100.000000
do            0.904977
doc          98.190045
drp          98.642534
nh4          98.416290
no3no2       98.642534
par          89.819005
poc         100.000000
temp          0.452489
tn           99.773756
tp           98.416290
dtype: float64

Some of the variables (`chla_ugl`, `dic`, and `poc`) are missing all of their values. We will omit them.

In [9]:
# Remove the variables that were 100% missing in the check above.
fully_missing_columns = ["chla_ugl", "dic", "poc"]
me_ntl = me_ntl.drop(columns=fully_missing_columns)
me_ntl

variable,datetime,do,doc,drp,nh4,no3no2,par,temp,tn,tp
2,1995-05-09 12:00:00+00:00,13.1,,,,,,7.8,,
28,1995-05-23 12:00:00+00:00,9.2,,,,,,14.0,,
54,1995-06-06 12:00:00+00:00,10.1,,,,,,18.7,,
80,1995-06-21 12:00:00+00:00,15.6,,,,,,27.4,,
98,1995-07-06 12:00:00+00:00,8.0,,,,,,21.5,,
...,...,...,...,...,...,...,...,...,...,...
14637,2022-09-06 12:00:00+00:00,9.4,,,,,,22.8,,
14670,2022-09-19 12:00:00+00:00,11.0,,,,,,21.5,,
14698,2022-10-04 12:00:00+00:00,12.7,,,,,,17.7,,
14725,2022-10-20 12:00:00+00:00,8.3,,,,,,12.0,,


## **Timescale Correction**

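The time series is irregular: most days have no observation. We will correct 
the timescale by placing the data on a regular daily grid (one timestep per 
day at 12:00), which introduces the missing timesteps as rows with no values.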

In [10]:
# Build a daily grid covering the observed period, with every timestamp pinned to 12:00.
first_observation = me_ntl["datetime"].min()
last_observation = me_ntl["datetime"].max()
daily_grid = pd.date_range(first_observation, last_observation, freq='D').normalize()
daily_grid_at_noon = daily_grid + pd.Timedelta(hours = 12)
me_ntl_date_range = pd.DataFrame({"datetime": daily_grid_at_noon})

# The outer join keeps every observation and adds a row of missing values
# for each grid timestamp that has no matching observation.
me_ntl = me_ntl.merge(me_ntl_date_range, on = "datetime", how = "outer")
me_ntl

Unnamed: 0,datetime,do,doc,drp,nh4,no3no2,par,temp,tn,tp
0,1995-05-09 12:00:00+00:00,13.1,,,,,,7.8,,
1,1995-05-10 12:00:00+00:00,,,,,,,,,
2,1995-05-11 12:00:00+00:00,,,,,,,,,
3,1995-05-12 12:00:00+00:00,,,,,,,,,
4,1995-05-13 12:00:00+00:00,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
10034,2022-10-28 12:00:00+00:00,,,,,,,,,
10035,2022-10-29 12:00:00+00:00,,,,,,,,,
10036,2022-10-30 12:00:00+00:00,,,,,,,,,
10037,2022-10-31 12:00:00+00:00,,,,,,,,,


In [11]:
# Percentage of rows with a missing value, per column, after the daily grid was introduced.
n_rows = len(me_ntl)
me_ntl.isna().sum().div(n_rows).mul(100)

datetime     0.000000
do          95.637016
doc         99.920311
drp         99.940233
nh4         99.930272
no3no2      99.940233
par         99.551748
temp        95.617093
tn          99.990039
tp          99.930272
dtype: float64

In [12]:
# Write the cleaned table to the working directory as a parquet file.
cleaned_parquet_path = "./ME_LF_Cleaned.parquet"
me_ntl.to_parquet(cleaned_parquet_path)