In [3]:
import pandas as pd
import numpy as np

# Read North Foehn datasets and stack

In [35]:
# Station code that selects the input files and names the output file: "PIO" or "LUG"
location = "PIO"

In [36]:
# Load the 1983-2016 file (comma-separated, header in the first row), parse the
# "Date" column (format YYYYmmddHHMM) into datetimes and harmonise the column names.
df_1983_2016 = (
    pd.read_csv(f"data/FoehnData/{location}_1983_2016.txt", sep=",", header=0)
    .assign(Date=lambda frame: pd.to_datetime(frame["Date"], format='%Y%m%d%H%M'))
    .rename(columns={"Date": "date", "Foehn Index": "Foehn"})
)

In [37]:
# Load the 2017-2019 file (semicolon-separated, two leading lines skipped before the
# header), parse the "time" column (format YYYYmmddHHMM) into datetimes, harmonise the
# column names and keep only the two columns needed downstream.
df_2017_2019 = (
    pd.read_csv(f"data/FoehnData/{location}_2017_2019.txt", sep=";", skiprows=2, header=0)
    .assign(time=lambda frame: pd.to_datetime(frame["time"], format='%Y%m%d%H%M'))
    .rename(columns={"time": "date", "wcc006s0": "Foehn"})
    .loc[:, ["date", "Foehn"]]
)

In [38]:
# Stack both periods row-wise (older period first) and renumber the index from 0.
df_1983_2019 = pd.concat((df_1983_2016, df_2017_2019), ignore_index=True)

In [39]:
# Ensure continuous and consistent values in the "date" column: build the complete
# 10-minute time axis and left-join the stacked data onto it, so timestamps missing
# from the data become rows with NaN and timestamps outside the axis are dropped.
# validate="one_to_one" makes the merge raise if a timestamp occurs more than once.
df_timeframe = pd.Series(
    pd.date_range(start="1983-01-01 00:00:00", end="2019-12-31 23:50:00", freq="10min"),
    name="date",
)
df_1983_2019 = df_timeframe.to_frame().merge(
    df_1983_2019, on="date", how="left", validate="one_to_one"
)

# Preprocess data

In [40]:
# Replace the "-" placeholder in the Foehn column with NaN and transform type to float.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0 (AttributeError there).
df_1983_2019["Foehn"] = df_1983_2019["Foehn"].mask(df_1983_2019["Foehn"] == "-", np.nan)
df_1983_2019["Foehn"] = df_1983_2019["Foehn"].astype(float)

# Set all Mischluft values (Foehn == 2) to 1 (i.e. treat Mischluft as normal foehn)
df_1983_2019["Foehn"] = df_1983_2019["Foehn"].mask(df_1983_2019["Foehn"] == 2.0, 1.0)

In [41]:
# Define rolling window of length 6. If at least 4 rows in the window show foehn, say
# foehn is prevalent, otherwise not (refer to Gutermann et al. (2013)).
# rolling() builds trailing windows (rows i-5..i); shift(-3) re-aligns each sum so that
# row i receives the window covering rows i-2..i+3.
# Allow max 2 missing values (-> min_periods=4). Otherwise the entry is NaN.
foehn_rolling_window = df_1983_2019["Foehn"].rolling(window=6, min_periods=4).sum().shift(-3)

# Build the 0/1 indicator directly as float and blank out undefined windows via where().
# This avoids writing NaN into an int-typed Series (silent upcasting on assignment is
# deprecated in pandas) and the np.NaN alias, which no longer exists in NumPy 2.0.
foehn_new_representation = (foehn_rolling_window >= 4).astype(float).where(foehn_rolling_window.notna())

# Work on a copy so the unsmoothed frame stays available.
df_rolling = df_1983_2019.copy()
df_rolling["Foehn"] = foehn_new_representation

In [42]:
# Keep only timestamps at the full hour whose hour is 0, 6, 12 or 18
date_mask = (
    df_rolling["date"].dt.minute.eq(0)
    & df_rolling["date"].dt.hour.isin([0, 6, 12, 18])
)
df_foehn = df_rolling.loc[date_mask]

In [43]:
# Sanity check: relative frequency of each Foehn value among the kept timestamps
# (NaN entries are excluded from the counts).
df_foehn["Foehn"].value_counts(normalize=True, dropna=True)

0.0    0.862258
1.0    0.137742
Name: Foehn, dtype: float64

# Write output

In [44]:
# Write the filtered foehn series for the selected station to CSV, without the row index.
df_foehn.to_csv(
    f"data/FoehnData/{location}_foehn.csv",
    index=False,
)