In [1]:
import pandas as pd
import numpy as np

import datetime as dt

# Display every row of a DataFrame instead of truncating long frames.
pd.set_option('display.max_rows', None)

# Precipitation-type codes; gen_ptype() builds one boolean flag per code.
PTYPES = ['UP','RA','FZRA','SOLID']

In [2]:
## TODO: generate target event summary from events

def gen_ptype(ptype, pcodes=None):
    """Return a dictionary of boolean precipitation-type flags.

    Parameters
    ----------
    ptype : str or collection of str
        The precipitation code(s) that are present.  A single code may be
        given as a bare string, several codes as a list/tuple/set.
    pcodes : sequence of str, optional
        The codes to generate flags for.  Defaults to the module-level
        ``PTYPES`` list.

    Returns
    -------
    dict
        ``{code: bool}`` with one key per entry of ``pcodes``; the value is
        True only when that exact code is present in ``ptype``.
    """
    # A bare string must be wrapped before the membership test: ``in`` on a
    # string is a substring search, so "RA" would otherwise match "FZRA".
    present = (ptype,) if isinstance(ptype, str) else ptype
    codes = PTYPES if pcodes is None else pcodes
    return {pcode: pcode in present for pcode in codes}

def create_event(start, duration, ptype):
    """Create a test event as an hourly DataFrame of precipitation-type flags.

    Parameters
    ----------
    start : str or datetime-like
        Timestamp of the first hour of the event.
    duration : int
        Number of hourly time steps in the event.
    ptype : iterable
        One entry per hour.  Each entry is passed to ``gen_ptype`` and may be
        a single code or a collection of codes.

    Returns
    -------
    pandas.DataFrame
        One row per hour, indexed by hourly timestamps starting at ``start``,
        with the columns produced by ``gen_ptype``.

    Raises
    ------
    ValueError
        If the number of entries in ``ptype`` differs from ``duration``.
    """
    # Lower-case "h" is the hourly alias used elsewhere in this notebook;
    # upper-case "H" is deprecated in recent pandas releases.
    index = pd.date_range(start, freq="h", periods=duration)
    rows = [gen_ptype(pt) for pt in ptype]
    if len(rows) != len(index):
        raise ValueError(
            f"ptype has {len(rows)} entries but duration is {duration}"
        )
    return pd.DataFrame(rows, index=index)

def create_test_series(start_time, periods, events):
    """Build an hourly frame of precipitation flags from a list of test events.

    ``events`` holds the argument tuples for ``create_event``.  The event
    frames are concatenated and then reindexed onto ``periods`` hourly time
    steps starting at ``start_time``; hours not covered by any event are
    left as missing values by the reindex.
    """
    hourly_index = pd.date_range(start_time, periods=periods, freq='h')
    event_frames = [create_event(*spec) for spec in events]
    combined = pd.concat(event_frames)
    return combined.reindex(hourly_index)

# Test period: 7*24 hourly time steps starting at 2024-03-25.
start_time = '2024-03-25'
periods = 7*24
# Each event is a (start timestamp, duration in hours, per-hour types) tuple.
# A per-hour entry may itself be a list when more than one precipitation
# type is flagged in the same hour.
events = [
    ('2024-03-26 06:00:00', 1, ["SOLID"]),
    ('2024-03-28 12:00:00', 2, ["RA", "SOLID"]),
    ('2024-03-28 15:00:00', 1, ["SOLID"]),
    ('2024-03-29 12:00:00', 1, ["FZRA"]),
    ('2024-03-29 14:00:00', 1, ["UP"]),
    ('2024-03-29 16:00:00', 1, ["SOLID"]),
    ('2024-03-29 22:00:00', 6, ["SOLID", "SOLID", "RA", ["SOLID", "RA"], "RA", "SOLID"]),
    ('2024-03-30 20:00:00', 4, ["RA", "FZRA", "SOLID", "SOLID"]),
]

# Hourly frame of precipitation-type flags covering the whole test period.
test_df = create_test_series(start_time, periods, events)
test_df

Unnamed: 0,UP,RA,FZRA,SOLID
2024-03-25 00:00:00,,,,
2024-03-25 01:00:00,,,,
2024-03-25 02:00:00,,,,
2024-03-25 03:00:00,,,,
2024-03-25 04:00:00,,,,
2024-03-25 05:00:00,,,,
2024-03-25 06:00:00,,,,
2024-03-25 07:00:00,,,,
2024-03-25 08:00:00,,,,
2024-03-25 09:00:00,,,,


### Create expected events dataset

In [3]:
# Expected event summary: one record per test event, derived directly from
# the (start, duration, ptype) tuples.  A comprehension is used so that the
# loop variables stay local: a plain ``for`` loop would leave ``id`` (shadowing
# the builtin) and ``duration`` (the name of a helper function defined later
# in this notebook) bound in the kernel namespace.
expected_events_dict = [
    {
        'start': pd.to_datetime(start),
        # Last hourly time step that belongs to the event (inclusive).
        'end': pd.date_range(start, freq='h', periods=duration)[-1],
        'duration': duration,
    }
    for start, duration, _ in events
]
expected_events = pd.DataFrame(expected_events_dict)
expected_events

Unnamed: 0,start,end,duration
0,2024-03-26 06:00:00,2024-03-26 06:00:00,1
1,2024-03-28 12:00:00,2024-03-28 13:00:00,2
2,2024-03-28 15:00:00,2024-03-28 15:00:00,1
3,2024-03-29 12:00:00,2024-03-29 12:00:00,1
4,2024-03-29 14:00:00,2024-03-29 14:00:00,1
5,2024-03-29 16:00:00,2024-03-29 16:00:00,1
6,2024-03-29 22:00:00,2024-03-30 03:00:00,6
7,2024-03-30 20:00:00,2024-03-30 23:00:00,4


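To find "runs" of consecutive identical values, use the following steps:
1. Convert the data to booleans.
2. Flag every time step whose value differs from the previous one (a non-zero _first difference_), then take the cumulative sum of those flags so that each run gets its own integer label.  Booleans are treated as 1 and 0 in integer operations.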

In [4]:
# A time step has precipitation when any precipitation-type flag is set.
# The columns come from the shared PTYPES constant so this stays in step
# with the flags generated by gen_ptype().
test_df["PRECIP"] = test_df[PTYPES].any(axis=1)
# The first difference is non-zero wherever PRECIP changes value (and is
# missing, hence also "!= 0", on the first row), so the cumulative sum gives
# every run of consecutive equal values its own integer id.
test_df['event'] = (test_df["PRECIP"].diff(1) != 0).cumsum()
test_df

Unnamed: 0,UP,RA,FZRA,SOLID,PRECIP,event
2024-03-25 00:00:00,,,,,False,1
2024-03-25 01:00:00,,,,,False,1
2024-03-25 02:00:00,,,,,False,1
2024-03-25 03:00:00,,,,,False,1
2024-03-25 04:00:00,,,,,False,1
2024-03-25 05:00:00,,,,,False,1
2024-03-25 06:00:00,,,,,False,1
2024-03-25 07:00:00,,,,,False,1
2024-03-25 08:00:00,,,,,False,1
2024-03-25 09:00:00,,,,,False,1


In [5]:
def get_metrics(x):
    """Summarise one run of time steps.

    Parameters
    ----------
    x : pandas.Series or pandas.DataFrame
        The rows belonging to a single event, in time order.

    Returns
    -------
    dict
        ``'start'`` and ``'end'`` are the first and last index labels of
        ``x``; ``'duration'`` is the number of rows.
    """
    return {
        'start': x.index[0],
        # Take the last index label directly: ``x.last()`` needs an offset
        # argument and would not yield a single label.
        'end': x.index[-1],
        'duration': len(x),
    }

def event_start(x):
    """Return the first index label of ``x``, i.e. where the event begins."""
    first_label = x.index[0]
    return first_label

def event_end(x):
    """Return the last index label of ``x``, i.e. where the event ends."""
    last_label = x.index[-1]
    return last_label

def duration(x):
    """Return the number of rows in ``x`` (the event length in time steps)."""
    n_steps = len(x)
    return n_steps

In [6]:
# Keep only the time steps flagged as precipitation and group them by run id,
# so that each group holds the rows of one precipitation event.
prec_group = test_df.loc[test_df["PRECIP"]].groupby("event")

In [13]:
# Summarise each event (first label, last label, number of rows) and replace
# the run-id index with a plain 0..n-1 index.
events_result = pd.DataFrame(
    {
        column: prec_group.apply(summarise)
        for column, summarise in (
            ("start", event_start),
            ("end", event_end),
            ("duration", duration),
        )
    }
).reset_index(drop=True)
events_result

Unnamed: 0,start,end,duration
0,2024-03-26 06:00:00,2024-03-26 06:00:00,1
1,2024-03-28 12:00:00,2024-03-28 13:00:00,2
2,2024-03-28 15:00:00,2024-03-28 15:00:00,1
3,2024-03-29 12:00:00,2024-03-29 12:00:00,1
4,2024-03-29 14:00:00,2024-03-29 14:00:00,1
5,2024-03-29 16:00:00,2024-03-29 16:00:00,1
6,2024-03-29 22:00:00,2024-03-30 03:00:00,6
7,2024-03-30 20:00:00,2024-03-30 23:00:00,4


In [17]:
# assert_frame_equal reports which column or value differs on failure,
# whereas a bare ``assert a.equals(b)`` only says the frames are not equal.
pd.testing.assert_frame_equal(expected_events, events_result)

## Test on real data

In [None]:
from ros_database.filepath import SURFOBS_HOURLY_PATH
from ros_database.processing.surface import load_station_combined_data

In [None]:
# glob() yields files in an unspecified order; sorting makes positional
# access such as hourly[0] pick the same file on every run.
hourly = sorted(SURFOBS_HOURLY_PATH.glob("*.csv"))

In [None]:
# Load the data for the first file in the hourly file list.
df = load_station_combined_data(hourly[0])

In [None]:
df.head()

### Identify precip events

`PRECIP` is `True` for a time step when any of `UP`, `RA`, `FZRA`, or `SOLID` is `True`.

In [None]:
# PRECIP is True for a row when any precipitation-type column is True.  The
# column list comes from the shared PTYPES constant rather than a repeated
# literal.
df['PRECIP'] = df[PTYPES].any(axis=1)
df.head()

In [None]:
df['PRECIP'].sum()

In [None]:
### Label runs of consecutive PRECIP values (cumulative sum of changes)

In [None]:
# Give each run of consecutive equal PRECIP values its own integer id: the
# comparison is True on the first row and wherever PRECIP changes, and the
# cumulative sum turns those flags into a run counter.
# NOTE(review): the column is called 'cumsum', which is also the name of a
# DataFrame method, so it has to be read with df['cumsum'] (bracket access).
df['cumsum'] = (df['PRECIP'].diff(1) != 0).cumsum()
df.head()

In [None]:
df['cumsum'].plot()

In [None]:
# Keep only the rows flagged as precipitation and group them by run id, so
# that each group holds the rows of one precipitation event.
prec_group = df.loc[df["PRECIP"]].groupby("cumsum")

In [None]:
def count_ptype(x, ptype):
    """Sum the ``ptype`` column of ``x``.

    For a boolean flag column this is the number of rows where the flag
    is set.
    """
    flags = x[ptype]
    return flags.sum()

def t2m_mean(x):
    """Return the mean of the ``t2m`` column of ``x``."""
    temps = x['t2m']
    return temps.mean()

def t2m_min(x):
    """Return the minimum of the ``t2m`` column of ``x``."""
    temps = x['t2m']
    return temps.min()

def t2m_max(x):
    """Return the maximum of the ``t2m`` column of ``x``."""
    temps = x['t2m']
    return temps.max()

In [None]:
# One row per precipitation event: first/last index label, number of rows,
# the sum of each precipitation-type flag column, and t2m statistics.
events = pd.DataFrame({
    "start": prec_group.apply(event_start),
    "end": prec_group.apply(event_end),
    "duration": prec_group.apply(duration),
    **{
        code: prec_group.apply(count_ptype, code)
        for code in ("RA", "UP", "FZRA", "SOLID")
    },
    **{
        label: prec_group.apply(stat)
        for label, stat in (
            ("t2m_mean", t2m_mean),
            ("t2m_min", t2m_min),
            ("t2m_max", t2m_max),
        )
    },
})

In [None]:
events

#### Test pandas sum on booleans

In [None]:
# No extra import is needed for this check: pandas is already imported as
# ``pd`` in the first cell.  (A bare ``import`` statement is a SyntaxError.)

In [None]:
# Small frame for checking how sum() behaves on boolean columns within
# groups: two boolean flag columns plus an 'event' column used as group key.
test = pd.DataFrame(
    {
        "UP": [True, True, False, True], 
        "RA": [False, False, True, True], 
        "event": [1, 1, 2, 4]
    },
    index=[0, 1, 2, 3],
)
test

In [None]:
def test_count_ptype(x, ptype):
    return x[ptype].sum()

In [None]:
# Per-event sums of each boolean flag column, one column per flag.
pd.DataFrame(
    {
        code: test.groupby(test.event).apply(test_count_ptype, code)
        for code in ("UP", "RA")
    }
)

In [None]:
# Histogram of event durations using integer bin edges 1, 2, ..., 71.
bins = np.arange(1,72)
events['duration'].hist(bins=bins)

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(2, 1, sharey=True)

# Top panel: every t2m value against day of year, with a zero reference line.
ax[0].scatter(df.index.day_of_year, df.t2m, s=2, c='0.7')
ax[0].axhline(0., c='0.2')
ax[0].set_ylabel('t2m')

# Bottom panel: daily means of t2m against day of year, overlaid with the
# 0.95 quantile of the daily means for each day of year.
dfday = df.t2m.resample('D').mean()
ax[1].scatter(dfday.index.day_of_year, dfday, s=2, c='0.7', alpha=0.5)
dfday.groupby(dfday.index.day_of_year).quantile(0.95).plot(ax=ax[1])
ax[1].axhline(0., c='0.4')
# Label the axes so the figure can be read without the code.
ax[1].set_xlabel('day of year')
ax[1].set_ylabel('daily mean t2m');

In [None]:
dfday.groupby(dfday.index.day_of_year).quantile(0.95)

In [None]:
df['2023-11-01':'2023-11-30']

In [None]:
# Keep the station files whose ``p01i`` column contains at least one truthy
# (non-zero) value.  The comprehension keeps each per-station frame local, so
# the ``df`` loaded in an earlier cell is not overwritten as a side effect.
stations_with_precip = [
    path
    for path in hourly
    if load_station_combined_data(path).p01i.any()
]

In [None]:
print(len(stations_with_precip))
print(len(hourly))

In [None]:
stations_with_precip[:5]

In [None]:
# Rebind ``df`` to the data of the second entry (index 1) of
# stations_with_precip.
df = load_station_combined_data(stations_with_precip[1])

In [None]:
df.head()

In [None]:
df['p01i'].describe()

In [None]:
# Histogram of the positive p01i values, using 0.5-wide bins with edges
# running from 0 to 9.5.
# NOTE(review): values above 9.5 fall outside these bins — confirm that the
# range is intended.
df.p01i[df.p01i > 0.].hist(bins=np.arange(0,10,0.5))

In [None]:
np.arange(0,20)

In [None]:
df.p01i.plot()

In [None]:
df[df.p01i > 0.].count()

In [None]:
# np.arange excludes its stop value, so the edges are extended to the next
# whole unit at or above the maximum; otherwise the largest values fall
# outside the last bin and are silently left out of the histogram.  The
# max(..., 1.) guard keeps at least one full bin when the maximum is 0.
df.p01i.hist(bins=np.arange(0., max(np.ceil(df.p01i.max()), 1.) + 1.))