In [4]:
import os

import numpy as np
import pandas as pd

# Fixed seed for NumPy's global (legacy) random state, so that any code relying
# on np.random draws the same sequence on every fresh run of the notebook.
random_seed = 1
np.random.seed(seed=random_seed)

In [5]:
# Station metadata keyed by common_id. GroupBy.first() keeps, for every column,
# the first non-null value found within each common_id group; to_dict('index')
# then yields {common_id: {column: value}}.
stations_df = pd.read_csv('./data/stations.csv')
stations_dict = (
    stations_df
    .groupby(['common_id'])
    .first()
    .to_dict('index')
)

# Hard-coded (lower_limit, upper_limit) pairs per station. The filtering cell
# compares each station's water_level readings against these two bounds.
# A station id missing from stations.csv raises KeyError here.
limit_overrides = {
    '2386-ch': (38_000.0, 40_000.0),
    '2720050000-de': (20.0, 500.0),
    '36022-ie': (20.0, 175.0),
    '39003-ie': (10.0, 225.0),
    '42960105-de': (-10.0, 275.0),
    'auto-1003803': (-10.0, 275.0),
}
for station_id, (lower_limit, upper_limit) in limit_overrides.items():
    stations_dict[station_id]['lower_limit'] = lower_limit
    stations_dict[station_id]['upper_limit'] = upper_limit

In [6]:
# For every station that has a classified raw file, drop the readings whose
# water_level lies outside the station's [lower_limit, upper_limit] range and
# write the remaining rows to the classified directory under the same file name.
raw_dir = './data/classified_raw'
classified_dir = './data/classified'

# Make sure the target directory exists before the first write; writing a
# parquet file into a missing directory fails instead of creating it.
os.makedirs(classified_dir, exist_ok=True)

for common_id, station_dict in stations_dict.items():
    file_name = f'{common_id}_outliers_classified.parquet'
    fp = f'{raw_dir}/{file_name}'
    if not os.path.exists(fp):
        # No classified raw data for this station, nothing to filter.
        continue
    raw_classified_df = pd.read_parquet(fp)

    lower_limit = station_dict['lower_limit']
    upper_limit = station_dict['upper_limit']
    print(f'Removing for {common_id}, lower limit {lower_limit}, upper limit {upper_limit}')
    print('Removing following rows:')

    # True for rows strictly below the lower or strictly above the upper limit.
    # Comparisons against NaN evaluate to False, so rows with a missing
    # water_level (or a NaN limit) are never flagged and are therefore kept.
    water_level = raw_classified_df['water_level']
    mask = (water_level < lower_limit) | (water_level > upper_limit)
    print(raw_classified_df[mask])

    raw_classified_df[~mask].to_parquet(f'{classified_dir}/{file_name}')
    print()


Removing for 2386-ch, lower limit 38000.0, upper limit 40000.0
Removing following rows:
       water_level                 timestamp  is_outlier
28266     63902.70 2019-08-22 06:00:00+00:00        True
28412       600.00 2019-08-28 09:00:00+00:00        True
30778    222200.00 2019-12-06 10:00:00+00:00        True
32183       800.00 2020-02-03 13:00:00+00:00        True
32774        39.04 2020-02-28 04:00:00+00:00        True
33093     15024.00 2020-03-16 13:00:00+00:00        True
36882       587.00 2020-08-26 15:00:00+00:00        True

Removing for 2720050000-de, lower limit 20.0, upper limit 500.0
Removing following rows:
       water_level                 timestamp  is_outlier
26845         19.0 2019-08-21 09:00:00+00:00        True
26857          3.1 2019-08-21 21:00:00+00:00        True
27025     332881.0 2019-08-28 21:00:00+00:00        True
27045      33546.0 2019-08-29 17:00:00+00:00        True
28539     470470.0 2019-10-31 01:00:00+00:00        True
29810       7272.0 2019-