In [1]:
import pandas as pd
import numpy as np

# Read South Foehn datasets and stack

In [14]:
# Location code; it is interpolated into the input and output file names used by the cells below.
location = "ALT"

In [15]:
# Load the 1981-1990 file (tab-separated, column names taken from the second line),
# parse the timestamp column into "date" and keep only the timestamp and the Foehn flag.
df_1981_1990 = (
    pd.read_csv(f"data/FoehnData/{location}_1981_1990.dat", sep="\t", header=1)
    .assign(date=lambda raw: pd.to_datetime(raw["Datumsangabe_Zeit"], format="%Y%m%d %H:%M"))
)[["date", "Foehn"]]

In [16]:
# Load the 1983-2019 file (pipe-separated). The first line is skipped, the next line is
# consumed as the header and replaced by the positional column labels 1..9.
# Column 2 holds the timestamp and column 4 the Foehn flag; only those two are kept.
df_1983_2019 = (
    pd.read_csv(
        f"data/FoehnData/{location}_1983_2019.dat",
        sep="|",
        skiprows=1,
        header=0,
        names=range(1, 10),
        low_memory=False,
    )
    .assign(
        date=lambda raw: pd.to_datetime(raw[2], format="%Y%m%d%H%M%S"),
        Foehn=lambda raw: raw[4],
    )
)[["date", "Foehn"]]

In [17]:
# Stack both sources: rows dated before 1983-01-01 come from df_1981_1990,
# everything else comes from df_1983_2019. The index is renumbered after stacking.
# Rows without a timestamp (NaT, caused by some blank rows in the 1983-2019 file) are dropped.
df_1981_2019 = pd.concat(
    [
        df_1981_1990[df_1981_1990["date"] < pd.Timestamp("1983-01-01 00:00:00")],
        df_1983_2019,
    ],
    ignore_index=True,
).dropna(subset=["date"])

In [18]:
# Ensure continuous and consistent values in the "date" column: build the complete
# 10-minute grid from 1981-01-01 00:00 to 2019-12-31 23:50 and left-join the observations
# onto it. Timestamps without an observation end up with NaN in "Foehn";
# validate="one_to_one" makes the merge fail if a timestamp occurs more than once.
df_timeframe = pd.Series(
    pd.date_range(start="1981-01-01 00:00:00", end="2019-12-31 23:50:00", freq="10min"),
    name="date",
)
df_1981_2019 = df_timeframe.to_frame().merge(
    df_1981_2019,
    how="left",
    on="date",
    validate="one_to_one",
)

# Preprocess data

In [19]:
# Clean the Foehn flag in two steps:
#   1. every value larger than 2 is replaced by NaN;
#   2. every remaining value equal to 2 is set to 1 (i.e. Mischluft is treated as normal foehn).
# The second condition is evaluated on the result of the first step.
df_1981_2019["Foehn"] = (
    df_1981_2019["Foehn"]
    .mask(lambda foehn: foehn > 2)
    .mask(lambda foehn: foehn == 2.0, 1.0)
)

In [20]:
# Smooth the Foehn flag with a rolling window of length 6 (refer to Gutermann et al. (2013)):
# a row is labelled 1 if at least 4 rows of its window show Foehn, otherwise 0.
# rolling() looks backwards, so shift(-3) re-aligns each sum to cover rows i-2 .. i+3.
# At most 2 missing values are allowed per window (min_periods=4); otherwise the sum is NaN.
foehn_rolling_window = df_1981_2019["Foehn"].rolling(window=6, min_periods=4).sum().shift(-3)

# Build the label as float right away and blank out undefined windows with where().
# Assigning NaN into an integer Series relies on silent upcasting (deprecated in
# pandas >= 2.1), and the np.NaN alias no longer exists in NumPy 2.0 (use np.nan).
foehn_new_representation = (
    (foehn_rolling_window >= 4)
    .astype(float)
    .where(foehn_rolling_window.notna())
)

# Copy of the full frame in which "Foehn" is replaced by the smoothed label.
df_rolling = df_1981_2019.assign(Foehn=foehn_new_representation)

In [21]:
# Keep only the rows whose timestamp falls exactly on 00:00, 06:00, 12:00 or 18:00
# (minute component equal to 0 and hour in {0, 6, 12, 18}).
date_mask = df_rolling["date"].dt.minute.eq(0) & df_rolling["date"].dt.hour.isin([0, 6, 12, 18])
df_foehn = df_rolling.loc[date_mask]

In [22]:
# Sanity check: print the relative frequency of each Foehn value, then show the filtered frame.
# NOTE: value_counts() leaves out NaN by default, so the shares refer to non-missing rows only.
print(df_foehn["Foehn"].value_counts(normalize=True))
display(df_foehn)

0.0    0.946289
1.0    0.053711
Name: Foehn, dtype: float64


Unnamed: 0,date,Foehn
0,1981-01-01 00:00:00,
36,1981-01-01 06:00:00,
72,1981-01-01 12:00:00,
108,1981-01-01 18:00:00,
144,1981-01-02 00:00:00,
...,...,...
2050956,2019-12-30 18:00:00,0.0
2050992,2019-12-31 00:00:00,0.0
2051028,2019-12-31 06:00:00,0.0
2051064,2019-12-31 12:00:00,0.0


# Write output

In [23]:
# Persist the filtered Foehn labels (00/06/12/18 h rows) as CSV; the row index is not written.
df_foehn.to_csv(path_or_buf=f"data/FoehnData/{location}_foehn.csv", index=False)