In [2]:
import xarray
import os
import pandas as pd
from Preprocessing import *

# FileExtractor comes from the star-import of the project-local Preprocessing module.
extractor = FileExtractor()

# DWD ICON-EU forecasts, one frame per dataset name.
# NOTE(review): combine_files presumably merges all files under "data" that
# belong to the named dataset — confirm in Preprocessing.FileExtractor.
df_dwd_hornsea = extractor.combine_files("data", "dwd_icon_eu_hornsea")
df_dwd_pes = extractor.combine_files("data", "dwd_icon_eu_pes10")
df_dwd_demand = extractor.combine_files("data", "dwd_icon_eu_demand")

# NCEP GFS forecasts for the same three dataset suffixes.
ncep_gfs_hornsea = extractor.combine_files("data", "ncep_gfs_hornsea")
ncep_gfs_pes = extractor.combine_files("data", "ncep_gfs_pes10")
ncep_gfs_demand = extractor.combine_files("data", "ncep_gfs_demand")

In [3]:
# Work on an independent copy: a plain assignment would only alias the raw
# frame, so any in-place change made later would also alter ncep_gfs_hornsea.
ncep_gfs_hornsea_safe = ncep_gfs_hornsea.copy()

In [None]:
# Show floats with two decimals in every DataFrame rendered from here on
# (this is a global pandas display option, not local to this cell).
pd.options.display.float_format = '{:.2f}'.format
# Summary statistics of the DWD Hornsea frame.
df_dwd_hornsea.describe()

In [None]:
# Summary statistics of the DWD pes10 frame.
df_dwd_pes.describe()

In [None]:
# Summary statistics of the GFS Hornsea frame.
ncep_gfs_hornsea.describe()

In [None]:
# Summary statistics of the GFS pes10 frame.
ncep_gfs_pes.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Box plot of every column of the GFS Hornsea frame.
plt.figure(figsize=(14, 6))
# Pass the frame by keyword so it is always interpreted as wide-form `data`.
sns.boxplot(data=ncep_gfs_hornsea)
plt.xlabel("Feature")
plt.grid(True)

# plt.show must be *called*; the bare attribute reference did nothing.
plt.show()


# Box plot restricted to the temperature and cloud-cover columns of the pes10 frame.
plt.figure(figsize=(14, 6))
sns.boxplot(data=ncep_gfs_pes[["Temperature", "CloudCover"]])
plt.xlabel("Feature")
plt.grid(True)

plt.show()

In [None]:
# Rows whose SolarDownwardRadiation is at least 1000 (inspecting the upper tail).
ncep_gfs_pes[ncep_gfs_pes.SolarDownwardRadiation >= 1000 ]

In [69]:

def build_forecasting_horizon(df):
    """Flatten the forecast index and derive absolute valid times.

    The two unnamed index levels of ``df`` become the columns
    ``reference_time`` and ``valid_time``. The original second level (a lead
    time in hours) is kept as ``forecast_horizon``, and ``valid_time`` is
    replaced by ``reference_time + forecast_horizon``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index levels show up as ``level_0`` / ``level_1`` after
        ``reset_index()``. ``level_0`` must be parseable as timezone-naive
        datetimes; they are localized to UTC.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified, so the call can be
        repeated on the same input. No rows are filtered here.
    """
    # reset_index() (without inplace) returns a new frame and leaves the caller's data untouched
    out = df.reset_index().rename(columns={"level_0": "reference_time", "level_1": "valid_time"})
    # an "index" column only appears when the input already had a flat index
    out = out.drop(columns=["index"], errors="ignore")
    # convert the datetime information to the right format
    out["reference_time"] = pd.to_datetime(out["reference_time"]).dt.tz_localize("UTC")
    # keep the raw lead time before valid_time is overwritten with a timestamp
    out["forecast_horizon"] = out["valid_time"]
    out["valid_time"] = out["reference_time"] + pd.to_timedelta(out["forecast_horizon"], unit="h")
    return out

# Flattened working frame with reference_time / valid_time / forecast_horizon columns.
test = build_forecasting_horizon(ncep_gfs_hornsea_safe)

In [70]:
# Keep only forecasts whose lead time (valid_time - reference_time) is below 50 hours.
test = test[(test["valid_time"] - test["reference_time"]).div(pd.Timedelta("1h")) < 50]

In [None]:
# NaN count per column for lead times below 50 hours.
# NOTE(review): `test` was already narrowed with this same condition, so the
# filter here looks redundant — confirm and simplify to test.isna().sum().
test[(test["valid_time"] - test["reference_time"]).div(pd.Timedelta("1h")) < 50].isna().sum()

In [None]:
# NaN count per column of the working frame.
test.isna().sum()

In [None]:
# NaN count per column of the GFS Hornsea frame, for comparison with `test`.
ncep_gfs_hornsea.isna().sum()

In [None]:
import missingno as msno

# Bar chart of per-column completeness of the working frame.
msno.bar(test)

In [None]:
# Boolean Series: True for every row that contains at least one NaN.
test.isna().any(axis=1)

In [None]:
# Rows with at least one missing value. Take an explicit copy so that adding a
# column below writes to an independent frame instead of a slice of `test`
# (avoids SettingWithCopyWarning).
nan_df = test[test.isna().any(axis=1)].copy()
# Single string key per grid point, e.g. "<latitude>_<longitude>".
nan_df['lat_lon_combination'] = nan_df['latitude'].astype(str) + '_' + nan_df['longitude'].astype(str)

In [None]:
# Rows valid on 2021-03-26 or 2021-03-27. Copy the selection so that the new
# column is written to an independent frame rather than a slice of `test`
# (avoids SettingWithCopyWarning).
days_of_interest = [pd.to_datetime('2021-03-26').date(), pd.to_datetime('2021-03-27').date()]
dl = test[test['valid_time'].dt.date.isin(days_of_interest)].copy()
# Single string key per grid point, e.g. "<latitude>_<longitude>".
dl['lat_lon_combination'] = dl['latitude'].astype(str) + '_' + dl['longitude'].astype(str)

In [None]:
# Rows 30-89 of the selection valid from 2021-03-26 18:00 UTC with a forecast horizon of at most 10.
dl[(dl.valid_time >= "2021-03-26 18:00:00+00:00") & (dl.forecast_horizon <= 10)].iloc[30:90]

In [None]:
# NOTE(review): a bare `.iloc` only displays the indexer object, not any rows;
# a positional slice (e.g. nan_df.iloc[a:b]) was probably intended — confirm.
nan_df.iloc

In [None]:
# Number of distinct valid_time values among the rows that contain NaNs.
len(nan_df.valid_time.unique())

In [None]:
# First 60 rows that contain at least one NaN.
nan_df.head(60)

In [None]:
def handle_missing_data(df):
    """Interpolate missing values separately within each forecast/location group.

    Rows are grouped by ``reference_time``, ``valid_time``, ``latitude`` and
    ``longitude``; inside every group ``DataFrame.interpolate(method="index")``
    is applied. The result of ``groupby(...).apply(...)`` is returned as is.

    NOTE(review): a function with the same name is defined again further down
    in this notebook and replaces this one once that cell has run.
    """
    def _interpolate_group(rows):
        # "index" interpolation uses the numerical index values as x-coordinates
        return rows.interpolate(method="index")

    keys = ["reference_time", "valid_time", "latitude", "longitude"]
    return df.groupby(keys).apply(_interpolate_group)

# Linear interpolation over the whole frame in row order, without any grouping.
# NOTE(review): handle_missing_data defined in this cell is not used here.
test_cleaned = test.interpolate(method='linear')
test_cleaned

In [24]:
# Work on a copy so `test` stays unchanged.
group_test = test.copy()
# Single string key per grid point, e.g. "<latitude>_<longitude>".
group_test['lat_lon_combination'] = group_test['latitude'].astype(str) + '_' + group_test['longitude'].astype(str)
# Mean of the remaining columns per (reference_time, grid point, valid_time); keys stay as columns.
group_test = group_test.groupby(["reference_time", "lat_lon_combination","valid_time"], as_index=False).mean()

# TODO: interpolation of the grouped frame was left unfinished (no method chosen).
# group_test_ip = group_test.interpolate(method=)

In [25]:
# Work on a copy so `test` stays unchanged.
sort_test = test.copy()
# Single string key per grid point, e.g. "<latitude>_<longitude>".
sort_test['lat_lon_combination'] = sort_test['latitude'].astype(str) + '_' + sort_test['longitude'].astype(str)
# Order rows by horizon, then grid point, then valid time, so each
# (horizon, grid point) series is contiguous and chronological.
sort_test = sort_test.sort_values(by=["forecast_horizon", "lat_lon_combination", "valid_time"])

In [None]:
# NaN count per column of the sorted frame.
sort_test.isna().sum()

In [None]:
# First 60 sorted rows valid between 2021-03-25 and 2021-03-28 (UTC, both bounds inclusive).
sort_test[(sort_test.valid_time >= "2021-03-25 00:00:00+00:00") & (sort_test.valid_time <= "2021-03-28 00:00:00+00:00")].head(60)

In [None]:
import plotly.express as px
# Rows valid between 2021-03-24 and 2021-03-28 (UTC, inclusive). Copy the
# selection so the column added below is written to an independent frame
# rather than a slice of `test` (avoids SettingWithCopyWarning).
df_with_nans1 = test[(test.valid_time >= "2021-03-24 00:00:00+00:00") & (test.valid_time <= "2021-03-28 00:00:00+00:00")].copy()

# Single string key per grid point, used to colour one line per location.
df_with_nans1['lat_lon_combination'] = df_with_nans1['latitude'].astype(str) + '_' + df_with_nans1['longitude'].astype(str)
# Uncleaned temperature at forecast horizon 1, one line per grid point.
px.line(df_with_nans1[df_with_nans1.forecast_horizon == 1], x="valid_time", y="Temperature", color="lat_lon_combination")

In [None]:
def test_interpolation(sort_test):
    sort_test['lat_lon_combination'] = sort_test['latitude'].astype(str) + '_' + sort_test['longitude'].astype(str)
    # sort_test = sort_test.sort_values(by=["forecast_horizon", "lat_lon_combination", "valid_time"])

    cols_with_nan = sort_test.columns[sort_test.isna().any()].tolist()
    # sort_test_interpolated = sort_test.groupby(['forecast_horizon', 'lat_lon_combination']).apply(lambda group: group.interpolate())
    #cols_to_interpolate = ['RelativeHumidity', 'Temperature', 'WindDirection', 'WindDirection:100', 'WindSpeed', 'WindSpeed:100']  # Passen Sie diese Liste an

    # Anwenden der Interpolation über .transform() für jede Gruppe
    sort_test[cols_with_nan] = sort_test.groupby(['forecast_horizon', 'lat_lon_combination'])[cols_with_nan].transform(lambda group: group.interpolate(method='linear'))
    return sort_test

test3 = test.copy()
# Apply the group-wise interpolation defined in this cell. The previous call
# to handle_missing_data resolved, on a fresh top-to-bottom run, to a version
# that does not create `lat_lon_combination`, which the plot below requires.
test_cleaned_3 = test_interpolation(test3)


# Rows valid between 2021-03-24 and 2021-03-28 (UTC, both bounds inclusive).
df_with_nans1 = test_cleaned_3[(test_cleaned_3.valid_time >= "2021-03-24 00:00:00+00:00") & (test_cleaned_3.valid_time <= "2021-03-28 00:00:00+00:00")]

# Interpolated temperature at forecast horizon 0, one line per grid point.
px.line(df_with_nans1[df_with_nans1.forecast_horizon == 0], x="valid_time", y="Temperature", color="lat_lon_combination")

In [None]:
import plotly.express as px
# Rows of the globally interpolated frame valid between 2021-03-24 and
# 2021-03-28 (UTC, inclusive). Copy the selection so the column added below is
# written to an independent frame rather than a slice of `test_cleaned`
# (avoids SettingWithCopyWarning).
df_with_nans = test_cleaned[(test_cleaned.valid_time >= "2021-03-24 00:00:00+00:00") & (test_cleaned.valid_time <= "2021-03-28 00:00:00+00:00")].copy()

# Single string key per grid point, used to colour one line per location.
df_with_nans['lat_lon_combination'] = df_with_nans['latitude'].astype(str) + '_' + df_with_nans['longitude'].astype(str)
# Temperature at forecast horizon 3, one line per grid point.
px.line(df_with_nans[df_with_nans.forecast_horizon == 3], x="valid_time", y="Temperature", color="lat_lon_combination")

In [None]:
# Work on a copy so `test` stays unchanged.
test_ll = test.copy()
# Single string key per grid point, e.g. "<latitude>_<longitude>".
test_ll['lat_lon_combination'] = test_ll['latitude'].astype(str) + '_' + test_ll['longitude'].astype(str)
# NaNs left per column after replacing each value by the mean of its
# (valid_time, grid point) group, i.e. the mean across the forecasts that share
# that target time and location.
test_ll.groupby(["valid_time", "lat_lon_combination"]).transform("mean").isna().sum()

In [None]:
def handle_missing_data(df):
    """Fill missing numeric values with the mean of comparable observations.

    "Comparable" means same year, month and hour of ``valid_time`` at the same
    grid point. A ``lat_lon_combination`` column ("<latitude>_<longitude>") is
    added to identify the grid point.

    NOTE(review): this definition replaces an earlier function of the same
    name in this notebook; consider giving the two strategies distinct names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``valid_time`` column plus ``latitude`` and
        ``longitude`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with NaNs in numeric columns replaced by the group mean.
        Values stay NaN when the whole group is missing. The input is left
        unmodified.
    """
    # copy first so the caller's frame is never changed
    filled = df.copy()
    filled['lat_lon_combination'] = filled['latitude'].astype(str) + '_' + filled['longitude'].astype(str)

    valid_time = filled['valid_time']
    group_keys = [
        valid_time.dt.year.rename('year'),
        valid_time.dt.month.rename('month'),
        valid_time.dt.hour.rename('hour'),
        filled['lat_lon_combination'],
    ]

    # only numeric columns have a meaningful mean; timestamps and strings are left alone
    numeric_cols = filled.select_dtypes(include='number').columns.tolist()
    if not numeric_cols:
        return filled

    # transform() keeps the original index, so the means align row by row
    group_means = filled.groupby(group_keys)[numeric_cols].transform('mean')
    # fillna only touches cells that are NaN; existing values are kept
    filled[numeric_cols] = filled[numeric_cols].fillna(group_means)
    return filled

# Fill a copy of the working frame with handle_missing_data.
test2 = test.copy()
test_cleaned_2 = handle_missing_data(test2)

import plotly.express as px
# Rows valid between 2021-03-24 and 2021-03-28 (UTC, both bounds inclusive).
df_with_nans = test_cleaned_2[(test_cleaned_2.valid_time >= "2021-03-24 00:00:00+00:00") & (test_cleaned_2.valid_time <= "2021-03-28 00:00:00+00:00")]

# The plot relies on the `lat_lon_combination` column already being present in test_cleaned_2.
# df_with_nans['lat_lon_combination'] = df_with_nans['latitude'].astype(str) + '_' + df_with_nans['longitude'].astype(str)
# Filled temperature at forecast horizon 0, one line per grid point.
px.line(df_with_nans[df_with_nans.forecast_horizon == 0], x="valid_time", y="Temperature", color="lat_lon_combination")
