In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from collections import defaultdict
from functools import reduce
import os

import seaborn as sns
import matplotlib.pyplot as plt

import pickle

In [2]:
#Eingabe Parameter

In [3]:
längeAnalyse= 10

startZeit= pd.to_datetime("2014-01-01 00:00")
endZeit= pd.to_datetime("2023-12-31 23:00")

#10% unproblematisch, 30% warscheinlich zuviel
maxShareMissingValues= 0.1

features= ['TT_TU', 'RF_TU', '  R1', '  P0', '   F']
stationFeatures= ['MESS_DATUM', 'STATIONS_ID', 'Stationshoehe', 'geoBreite', 'geoLaenge', 'Stationsname']

In [4]:
hourVarying= ['TT_TU', 'RF_TU', '  R1', 'RS_IND', 'WRTR', '   P', '  P0', '   F', '   D']
unVarying= ['STATIONS_ID', 'Stationshoehe', 'geoBreite', 'geoLaenge', 'Stationsname', 'day', 'month', 'month_sin', 'month_cos', 'day_of_year_sin', 'day_of_year_cos']

In [5]:
#realistische Wertebereich

#Lufttemperatur -38 -- 45 ('TT_TU')
#Relative Luftfeuchtigkeit 0 -- 100 ('RF_TU')
#Niederschlagshöhe 0 -- 245 ('  R1')
#Niederschlags Indikator 0 -- 1 ('RS_IND')
#Niederschlags Form 0 -- 9 ('WRTR')
#Luftdruck Meereshöhe 800 -- 1100 ('   P')
#Luftdruck 800 -- 1100 ('  P0')
#Windgeschwindigkeit 0 -- 350 ('   F')
#Windrichtung 0 -- 360 ('   D')

In [6]:
werteBereicheDictionary= {
    'TT_TU': [-38, 45],
    'RF_TU': [0, 100],
    '  R1': [0, 245],
    'RS_IND': [0, 1],
    'WRTR': [0, 9],
    '   P': [800, 1100],
    '  P0': [800, 1100],
    '   F': [0, 350],
    '   D': [0, 360]
}


In [7]:
#Imputation Methoden
#                           1h - 3h         3h - 1 d                    1d - 3d                     > 3d
#TT_TU(Tempreature)         Forward Fill    Zeitbasierte Interpolation  Zeitbasierte Interpolation  Lineare Regression
#RF_TU(Relative Humidity)   Forward Fill    Zeitbasierte Interpolation  Zeitbasierte Interpolation  Random Forest Imputation
#R1(Percipation)            FillNA(0)       Zeitbasierte Interpolation  Saisonale Mittelwerte       Lineare Regression
#P0(Presure)                Forward Fill    Zeitbasierte Interpolation  Zeitbasierte Interpolation  Lineare Regression
#F(Windspeed)               Forward Fill    Zeitbasierte Interpolation  Zeitbasierte Interpolation  Random Forest Imputation

In [8]:
independendMeasurments= [("TT_TU", 1, 3, "Forward Fill"), ("RF_TU", 1, 3, "Forward Fill"), ("  R1", 1, 3, "FillNA(0)"), ("  P0", 1, 3, "Forward Fill"), ("   F", 1, 3, "Forward Fill"), ("TT_TU", 4, 72, "Zeitbasierte Interpolation"), ("RF_TU", 4, 72, "Zeitbasierte Interpolation"), ("  R1", 4, 24, "Zeitbasierte Interpolation"), ("  P0", 4, 72, "Zeitbasierte Interpolation"), ("   F", 4, 72, "Zeitbasierte Interpolation"), ("  R1", 25, 72, "Saisonale Mittelwerte")]
dependendMeasurments= [("TT_TU", 73, -1, "Lineare Regression"), ("RF_TU", 73, -1, "Random Forest Imputation"), ("  R1", 73, -1, "Lineare Regression"), ("  P0", 73, -1, "Lineare Regression"), ("   F", 73, -1, "Random Forest Imputation")]

In [9]:
#Files Einlesen (Dictionary pro Typ und Geodaten einzeln)

In [10]:
folder_path = "..\data\TU"

txt_files = [f for f in os.listdir(folder_path) if f.endswith(".txt")]

dataframesTU = {}

for file_name in txt_files:
    file_path = os.path.join(folder_path, file_name)
    
    df = pd.read_csv(file_path, sep= ";")

    #Name creation
    file_base = os.path.splitext(file_name)[0]
    parts = file_base.split("_")
    key = f"{parts[1]}_{parts[-1]}"

    dataframesTU[key] = df

In [11]:
folder_path = "..\data\RR"

txt_files = [f for f in os.listdir(folder_path) if f.endswith(".txt")]

dataframesRR = {}

for file_name in txt_files:
    file_path = os.path.join(folder_path, file_name)
    
    df = pd.read_csv(file_path, sep= ";")

    #Name creation
    file_base = os.path.splitext(file_name)[0]
    parts = file_base.split("_")
    key = f"{parts[1]}_{parts[-1]}"

    dataframesRR[key] = df

In [12]:
folder_path = "..\data\P0"

txt_files = [f for f in os.listdir(folder_path) if f.endswith(".txt")]

dataframesP0 = {}

for file_name in txt_files:
    file_path = os.path.join(folder_path, file_name)
    
    df = pd.read_csv(file_path, sep= ";")

    #Name creation
    file_base = os.path.splitext(file_name)[0]
    parts = file_base.split("_")
    key = f"{parts[1]}_{parts[-1]}"

    dataframesP0[key] = df

In [13]:
folder_path = "..\data\FF"

txt_files = [f for f in os.listdir(folder_path) if f.endswith(".txt")]

dataframesFF = {}

for file_name in txt_files:
    file_path = os.path.join(folder_path, file_name)
    
    df = pd.read_csv(file_path, sep= ";")

    #Name creation
    file_base = os.path.splitext(file_name)[0]
    parts = file_base.split("_")
    key = f"{parts[1]}_{parts[-1]}"

    dataframesFF[key] = df

In [14]:
dataDictionary= {
    "TU": dataframesTU,
    "RR": dataframesRR,
    "P0": dataframesP0,
    "FF": dataframesFF
}

In [15]:
columnNames = ["Stations_id", "von_datum", "bis_datum", "Stationshoehe","geoBreite", "geoLaenge", "Stationsname", "Bundesland", "Abgabe"]

methaDataTUtxt= pd.read_fwf("..\data\Metha List\TU_Stundenwerte_Beschreibung_Stationen.txt", encoding="iso-8859-1", skiprows=2, names=columnNames)

methaDataTU= pd.DataFrame(methaDataTUtxt)

methaDataTU= methaDataTU.drop(columns= ["von_datum", "bis_datum", "Bundesland", "Abgabe"])

In [16]:
#Datatype conversion

In [17]:
for dataKey, dataFrames in dataDictionary.items():
    for key, frames in dataDictionary[dataKey].items():
        dataDictionary[dataKey][key]['MESS_DATUM']= pd.to_datetime(frames['MESS_DATUM'].astype(str), format= '%Y%m%d%H')
        dataDictionary[dataKey][key] = dataDictionary[dataKey][key].sort_values('MESS_DATUM')

        dataDictionary[dataKey][key]= dataDictionary[dataKey][key].drop('eor', axis= 1)
        dataDictionary[dataKey][key]= dataDictionary[dataKey][key].drop(columns=[col for col in dataDictionary[dataKey][key].columns if col.startswith("QN_")])

In [18]:
#Zu vollen Timeline auffüllen

In [19]:
timeFrame= pd.date_range(start= startZeit, end= endZeit, freq='H')
dfRange= pd.DataFrame({'MESS_DATUM': timeFrame})

for dataKey, dataFrames in dataDictionary.items():
    for key, frames in dataDictionary[dataKey].items():
        dataDictionary[dataKey][key]= dfRange.merge(frames, on= 'MESS_DATUM', how= 'left')

  timeFrame= pd.date_range(start= startZeit, end= endZeit, freq='H')


In [20]:
for dataKey, dataFrames in dataDictionary.items():
    for key, frames in dataDictionary[dataKey].items():
        print(key, frames.isnull().any(axis=1).sum())

tu_00867 251
tu_00656 0
tu_01691 0
tu_02261 0
tu_05371 0
tu_01270 0
tu_02171 87
tu_03946 279
tu_05490 8
tu_02925 125
tu_00198 31
tu_01612 0
tu_04501 144
tu_03231 0
tu_02044 26
tu_03821 3
tu_03513 9
tu_04464 73
tu_07368 0
rr_00198 119
rr_00656 17
rr_00867 350
rr_01270 71
rr_01612 292
rr_01691 260
rr_02044 854
rr_02171 275
rr_02925 222
rr_03231 49
rr_03513 208
rr_03821 3314
rr_03946 6447
rr_04464 3015
rr_04501 374
rr_05490 413
rr_05371 98
rr_02261 62
rr_07368 180
p0_00656 0
p0_00867 142
p0_02261 0
p0_05371 0
p0_01691 0
p0_02171 0
p0_00198 0
p0_01270 0
p0_01612 0
p0_03946 136
p0_05490 0
p0_02925 104
p0_03231 0
p0_04501 0
p0_03513 0
p0_04464 0
p0_02044 0
p0_03821 0
p0_07368 0
ff_00656 651
ff_02261 883
ff_05490 395
ff_05371 98
ff_00198 295
ff_01691 636
ff_00867 635
ff_01270 2
ff_01612 169
ff_02925 581
ff_03946 697
ff_02171 451
ff_04501 613
ff_03231 78
ff_02044 1145
ff_03821 954
ff_03513 395
ff_04464 1040
ff_07368 493


In [21]:
#Pro Station Mergen

In [22]:
stationGroups= defaultdict(list)

for dataKey, dataFrames in dataDictionary.items():
    for key, frames in dataDictionary[dataKey].items():
        stationId= key.split("_")[1]
        stationGroups[stationId].append((dataKey, frames))

In [23]:
stationDataDictionary= {}

for stationId, framesList in stationGroups.items():
    dfMerged= None

    for key, frames in framesList:
        dfRename= frames.copy()
        dfRename= dfRename.rename(columns= lambda col: f"{key}_{col}" if col == "STATIONS_ID" and not key == "FF" else col)

        if dfMerged is None:
            dfMerged= dfRename
        else:
            dfMerged=pd.merge(dfMerged, dfRename, on= 'MESS_DATUM', how= 'outer')
    
    cols= list(dfMerged.columns)
    cols.remove("STATIONS_ID")
    cols.insert(1, "STATIONS_ID")
    stationDataDictionary[stationId]= dfMerged[cols]

In [24]:
for key, frames in stationDataDictionary.items():
    stationIdColsToDrop = [
        col for col in stationDataDictionary[key].columns
        if col.endswith('_STATIONS_ID')
    ]
    stationDataDictionary[key] = stationDataDictionary[key].drop(columns=stationIdColsToDrop)

In [25]:
for key, frames in stationDataDictionary.items():
    stationDataDictionary[key]['STATIONS_ID']= stationDataDictionary[key]['STATIONS_ID'].mode().iloc[0]

    stationDataDictionary[key]= pd.merge(stationDataDictionary[key], methaDataTU, left_on= 'STATIONS_ID', right_on= 'Stations_id', how= 'left')
    stationDataDictionary[key]= stationDataDictionary[key].drop(columns= ["Stations_id"], axis= 1)

In [26]:
for key, frame in stationDataDictionary.items():
    stationDataDictionary[key]['hour'] = frame['MESS_DATUM'].dt.hour
    stationDataDictionary[key]['day'] = frame['MESS_DATUM'].dt.day
    stationDataDictionary[key]['month'] = frame['MESS_DATUM'].dt.month
    stationDataDictionary[key]['hour_sin'] = np.sin(2 * np.pi * frame['hour'] / 24)
    stationDataDictionary[key]['hour_cos'] = np.cos(2 * np.pi * frame['hour'] / 24)
    stationDataDictionary[key]['month_sin'] = np.sin(2 * np.pi * frame['month'] / 12)
    stationDataDictionary[key]['month_cos'] = np.cos(2 * np.pi * frame['month'] / 12)
    stationDataDictionary[key]['day_of_year_sin'] = np.sin(2 * np.pi * frame['MESS_DATUM'].dt.dayofyear / 365)
    stationDataDictionary[key]['day_of_year_cos'] = np.cos(2 * np.pi * frame['MESS_DATUM'].dt.dayofyear / 365)


In [27]:
for key, frame in stationDataDictionary.items():
    print(key, frame.columns)

00867 Index(['MESS_DATUM', 'STATIONS_ID', 'TT_TU', 'RF_TU', '  R1', 'RS_IND', 'WRTR',
       '   P', '  P0', '   F', '   D', 'Stationshoehe', 'geoBreite',
       'geoLaenge', 'Stationsname', 'hour', 'day', 'month', 'hour_sin',
       'hour_cos', 'month_sin', 'month_cos', 'day_of_year_sin',
       'day_of_year_cos'],
      dtype='object')
00656 Index(['MESS_DATUM', 'STATIONS_ID', 'TT_TU', 'RF_TU', '  R1', 'RS_IND', 'WRTR',
       '   P', '  P0', '   F', '   D', 'Stationshoehe', 'geoBreite',
       'geoLaenge', 'Stationsname', 'hour', 'day', 'month', 'hour_sin',
       'hour_cos', 'month_sin', 'month_cos', 'day_of_year_sin',
       'day_of_year_cos'],
      dtype='object')
01691 Index(['MESS_DATUM', 'STATIONS_ID', 'TT_TU', 'RF_TU', '  R1', 'RS_IND', 'WRTR',
       '   P', '  P0', '   F', '   D', 'Stationshoehe', 'geoBreite',
       'geoLaenge', 'Stationsname', 'hour', 'day', 'month', 'hour_sin',
       'hour_cos', 'month_sin', 'month_cos', 'day_of_year_sin',
       'day_of_year_cos'],
  

In [28]:
print(stationDataDictionary)

{'00867':                MESS_DATUM  STATIONS_ID  TT_TU  RF_TU    R1  RS_IND   WRTR  \
0     2014-01-01 00:00:00        867.0   -1.1   91.0   0.0     0.0 -999.0   
1     2014-01-01 01:00:00        867.0   -1.2   93.0   0.0     0.0 -999.0   
2     2014-01-01 02:00:00        867.0   -1.2   93.0   0.0     0.0 -999.0   
3     2014-01-01 03:00:00        867.0   -1.1   92.0   0.0     0.0 -999.0   
4     2014-01-01 04:00:00        867.0   -1.5   94.0   0.0     0.0 -999.0   
...                   ...          ...    ...    ...   ...     ...    ...   
87643 2023-12-31 19:00:00        867.0    5.9   84.0   0.0     0.0    0.0   
87644 2023-12-31 20:00:00        867.0    5.6   81.0   0.0     0.0    0.0   
87645 2023-12-31 21:00:00        867.0    5.0   83.0   0.0     0.0    0.0   
87646 2023-12-31 22:00:00        867.0    4.8   84.0   0.0     0.0    0.0   
87647 2023-12-31 23:00:00        867.0    5.2   80.0   0.0     0.0    0.0   

            P     P0     F  ...          Stationsname  hour  day 

In [29]:
#Unrealistische Werte als Missingvalues deklarieren

In [30]:
for key, frame in stationDataDictionary.items():
    for spalte, (min_wert, max_wert) in werteBereicheDictionary.items():
        if spalte in frame.columns:
            stationDataDictionary[key][spalte] = frame[spalte].where((frame[spalte] >= min_wert) & (frame[spalte] <= max_wert), np.nan)

In [31]:
#Stationen mit zuvielen Missing rausfiltern

In [32]:
for key, frame in stationDataDictionary.items():
    for col in stationFeatures:
        if col in frame.columns:
            missingIndices = frame[frame[col].isna()].index.tolist()
            if missingIndices:
                print(f"Spalte '{col}' hat {len(missingIndices)} fehlende Werte an den Zeilen:")
                print(missingIndices)
                del stationDataDictionary[key]
                break
        else:
            print(f"Spalte '{col}' ist nicht im DataFrame.")

In [33]:
keysToDelete = []
missingValueDictionary= {}

for key, frame in stationDataDictionary.items():
    print(f"\n--- Analyse für Station: {key} ---")
    for col in features:
        if col in frame.columns:
            totalLen = len(frame)
            missingMask = frame[col].isna()
            numMissing = missingMask.sum()
            
            if numMissing > 0:
                missingPct = (numMissing / totalLen) * 100

                # Zusammenhängende NaN-Blöcke zählen
                gaps = []
                count = 0
                i= 0
                while i < len(missingMask):
                    if missingMask[i]:
                        count += 1
                    elif count > 0:
                        gaps.append(count)
                        count = 0
                    i+= 1    
                if count > 0:
                    gaps.append(count)

                gaps_sorted = sorted(gaps, reverse=True)

                print(f"\nSpalte '{col}':")
                print(f"- Fehlend: {numMissing} von {totalLen} Werten ({missingPct:.2f}%)")
                print(f"- Alle Längen (absteigend): {gaps_sorted}")
                print(f"- Anzahl Lücken: {len(gaps_sorted)}")

                if missingPct > maxShareMissingValues * 100:
                    print("\n🚨🚨🚨 STATION GELÖSCHT wegen zu vielen fehlenden Werten! 🚨🚨🚨")
                    keysToDelete.append(key)
                    break

                missingValueDictionary[str(key) + str(col)]= (key, col, missingPct, gaps_sorted)
            else:
                print(f"\nSpalte '{col}' hat keine fehlenden Werte.")
        else:
            print(f"\nSpalte '{col}' ist nicht im DataFrame '{key}'.")

for key in keysToDelete:
    del stationDataDictionary[key]


--- Analyse für Station: 00867 ---

Spalte 'TT_TU':
- Fehlend: 294 von 87648 Werten (0.34%)
- Alle Längen (absteigend): [157, 60, 34, 9, 7, 7, 6, 6, 6, 2]
- Anzahl Lücken: 10

Spalte 'RF_TU':
- Fehlend: 296 von 87648 Werten (0.34%)
- Alle Längen (absteigend): [157, 60, 34, 9, 8, 7, 6, 6, 6, 3]
- Anzahl Lücken: 10

Spalte '  R1':
- Fehlend: 414 von 87648 Werten (0.47%)
- Alle Längen (absteigend): [156, 43, 40, 35, 32, 23, 14, 11, 10, 8, 7, 6, 6, 5, 4, 4, 2, 2, 2, 2, 1, 1]
- Anzahl Lücken: 22

Spalte '  P0':
- Fehlend: 268 von 87648 Werten (0.31%)
- Alle Längen (absteigend): [157, 60, 34, 9, 8]
- Anzahl Lücken: 5

Spalte '   F':
- Fehlend: 645 von 87648 Werten (0.74%)
- Alle Längen (absteigend): [163, 159, 124, 60, 39, 35, 33, 9, 8, 8, 3, 2, 2]
- Anzahl Lücken: 13

--- Analyse für Station: 00656 ---

Spalte 'TT_TU':
- Fehlend: 125 von 87648 Werten (0.14%)
- Alle Längen (absteigend): [71, 18, 13, 12, 6, 5]
- Anzahl Lücken: 6

Spalte 'RF_TU':
- Fehlend: 128 von 87648 Werten (0.15%)
- Alle

In [34]:
#Missing Value Treatment

In [35]:
missingSectionDictionay= {}

for key, frame in stationDataDictionary.items():
    missing= []
    for col in features:
        if col in frame.columns:
            totalLen = len(frame)
            missingMask = frame[col].isna()

            # Zusammenhängende NaN-Blöcke zählen
            count = 0
            i= 0
            while i < totalLen:
                if missingMask[i]:
                    if count == 0:
                        start= i
                    count += 1
                elif count > 0:
                    missing.append((col, start, count))
                    count = 0
                i+= 1    
            if count > 0:
                missing.append((col, start, count))

    missingSectionDictionay[key]= missing

In [36]:
print(missingSectionDictionay)

{'00867': [('TT_TU', 13304, 34), ('TT_TU', 13367, 60), ('TT_TU', 31291, 157), ('TT_TU', 38623, 6), ('TT_TU', 38650, 6), ('TT_TU', 38672, 6), ('TT_TU', 48520, 7), ('TT_TU', 48545, 7), ('TT_TU', 54943, 9), ('TT_TU', 83439, 2), ('RF_TU', 13304, 34), ('RF_TU', 13367, 60), ('RF_TU', 31291, 157), ('RF_TU', 38623, 6), ('RF_TU', 38650, 6), ('RF_TU', 38672, 6), ('RF_TU', 48519, 8), ('RF_TU', 48545, 7), ('RF_TU', 54943, 9), ('RF_TU', 83438, 3), ('  R1', 13304, 35), ('  R1', 13367, 11), ('  R1', 13387, 40), ('  R1', 31292, 156), ('  R1', 38605, 43), ('  R1', 38649, 7), ('  R1', 38793, 8), ('  R1', 38818, 23), ('  R1', 38844, 4), ('  R1', 38865, 10), ('  R1', 38889, 6), ('  R1', 38915, 5), ('  R1', 38921, 2), ('  R1', 38937, 1), ('  R1', 38940, 2), ('  R1', 62169, 2), ('  R1', 70909, 4), ('  R1', 70944, 2), ('  R1', 70948, 6), ('  R1', 70955, 14), ('  R1', 70973, 1), ('  R1', 70996, 32), ('  P0', 13304, 34), ('  P0', 13367, 60), ('  P0', 31291, 157), ('  P0', 48519, 8), ('  P0', 48544, 9), ('   F'

In [37]:
for frameKey, frame in stationDataDictionary.items():
    for key, l in missingSectionDictionay.items():
        if key == frameKey:
            for tripel in l:
                start= tripel[1]
                end= tripel[1] + tripel[2] - 1

                seasonalMeans= stationDataDictionary[key].groupby(['month', 'hour'])[tripel[0]].mean()

                for i in range(start, end + 1):
                    m= frame.loc[i, 'month']
                    h= frame.loc[i, 'hour']
                    meanVal= seasonalMeans.get((m, h), np.nan)

                    stationDataDictionary[key].loc[i, tripel[0]]= meanVal

In [38]:
missingSectionDictionay= {}

for key, frame in stationDataDictionary.items():
    missing= []
    for col in features:
        if col in frame.columns:
            totalLen = len(frame)
            missingMask = frame[col].isna()

            # Zusammenhängende NaN-Blöcke zählen
            count = 0
            i= 0
            while i < totalLen:
                if missingMask[i]:
                    if count == 0:
                        start= i
                    count += 1
                elif count > 0:
                    missing.append((col, start, count))
                    count = 0
                i+= 1    
            if count > 0:
                missing.append((col, start, count))

    missingSectionDictionay[key]= missing

In [39]:
print(missingSectionDictionay)

{'00867': [], '00656': [], '01691': [], '02261': [], '05371': [], '01270': [], '02171': [], '03946': [], '05490': [], '02925': [], '00198': [], '01612': [], '04501': [], '03231': [], '02044': [], '03821': [], '03513': [], '04464': [], '07368': []}


In [40]:
#Dataframe Compination

In [41]:
#Rowwise Compination
rowwiseDf = pd.concat(stationDataDictionary.values(), axis=0, ignore_index=True)

In [42]:
#Rowwise Transforming R1 and F

In [43]:
rowwiseDf.columns

Index(['MESS_DATUM', 'STATIONS_ID', 'TT_TU', 'RF_TU', '  R1', 'RS_IND', 'WRTR',
       '   P', '  P0', '   F', '   D', 'Stationshoehe', 'geoBreite',
       'geoLaenge', 'Stationsname', 'hour', 'day', 'month', 'hour_sin',
       'hour_cos', 'month_sin', 'month_cos', 'day_of_year_sin',
       'day_of_year_cos'],
      dtype='object')

In [44]:
rowwiseDfTransformed= rowwiseDf
rowwiseDfTransformed['   F']= np.log(rowwiseDfTransformed['   F']+1)
rowwiseDfTransformed['  R1']= rowwiseDfTransformed['  R1'].apply(lambda x: 0 if x == 0 else np.log(x + 1))

In [45]:
rowwiseDf

Unnamed: 0,MESS_DATUM,STATIONS_ID,TT_TU,RF_TU,R1,RS_IND,WRTR,P,P0,F,...,Stationsname,hour,day,month,hour_sin,hour_cos,month_sin,month_cos,day_of_year_sin,day_of_year_cos
0,2014-01-01 00:00:00,867.0,-1.1,91.0,0.0,0.0,,1017.4,974.5,1.629241,...,Lautertal-Oberlauter,0,1,1,0.000000,1.000000,5.000000e-01,0.866025,1.721336e-02,0.999852
1,2014-01-01 01:00:00,867.0,-1.2,93.0,0.0,0.0,,1016.9,974.0,1.526056,...,Lautertal-Oberlauter,1,1,1,0.258819,0.965926,5.000000e-01,0.866025,1.721336e-02,0.999852
2,2014-01-01 02:00:00,867.0,-1.2,93.0,0.0,0.0,,1017.0,974.1,1.568616,...,Lautertal-Oberlauter,2,1,1,0.500000,0.866025,5.000000e-01,0.866025,1.721336e-02,0.999852
3,2014-01-01 03:00:00,867.0,-1.1,92.0,0.0,0.0,,1017.0,974.1,1.629241,...,Lautertal-Oberlauter,3,1,1,0.707107,0.707107,5.000000e-01,0.866025,1.721336e-02,0.999852
4,2014-01-01 04:00:00,867.0,-1.5,94.0,0.0,0.0,,1017.1,974.1,1.308333,...,Lautertal-Oberlauter,4,1,1,0.866025,0.500000,5.000000e-01,0.866025,1.721336e-02,0.999852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1665307,2023-12-31 19:00:00,7368.0,7.0,72.0,0.0,0.0,0.0,1004.4,966.9,1.960095,...,Eisenach,19,31,12,-0.965926,0.258819,-2.449294e-16,1.000000,6.432491e-16,1.000000
1665308,2023-12-31 20:00:00,7368.0,6.7,73.0,0.0,0.0,0.0,1004.8,967.3,1.974081,...,Eisenach,20,31,12,-0.866025,0.500000,-2.449294e-16,1.000000,6.432491e-16,1.000000
1665309,2023-12-31 21:00:00,7368.0,6.4,76.0,0.0,0.0,0.0,1005.0,967.4,1.887070,...,Eisenach,21,31,12,-0.707107,0.707107,-2.449294e-16,1.000000,6.432491e-16,1.000000
1665310,2023-12-31 22:00:00,7368.0,6.8,71.0,0.0,0.0,0.0,1004.8,967.3,2.014903,...,Eisenach,22,31,12,-0.500000,0.866025,-2.449294e-16,1.000000,6.432491e-16,1.000000


In [46]:
for col in rowwiseDf.columns:
    tmp= rowwiseDf[col].isna()
    print(col, tmp.sum())

MESS_DATUM 0
STATIONS_ID 0
TT_TU 0
RF_TU 0
  R1 0
RS_IND 20433
WRTR 815913
   P 280080
  P0 0
   F 0
   D 12763
Stationshoehe 0
geoBreite 0
geoLaenge 0
Stationsname 0
hour 0
day 0
month 0
hour_sin 0
hour_cos 0
month_sin 0
month_cos 0
day_of_year_sin 0
day_of_year_cos 0


In [47]:
tmp= rowwiseDf[['Stationsname', 'geoBreite', 'geoLaenge']]
stations_df = tmp.drop_duplicates(subset=['Stationsname'])
stations_df = stations_df.rename(columns={
    'Stationsname': 'Name',
    'geoBreite': 'Latitude',
    'geoLaenge': 'Longitude'
})

stations_df.to_csv("stations_for_map.csv", index=False)

In [48]:
#Columnwise Compination
colwiseDf = pd.concat(stationDataDictionary, axis=1)
colwiseDf.columns = [f"{col}_{station}" for station, col in colwiseDf.columns]

In [49]:
#Daywise Compination

In [50]:
frames= []
for key, frame in stationDataDictionary.items():
    frame['date'] = frame['MESS_DATUM'].dt.date
    tmp= frame.pivot_table(index= 'date', columns= 'hour', values= hourVarying)
    tmp.columns = [f"{var}_{int(hour):02d}" for var, hour in tmp.columns]
    dailyInfo = frame.groupby('date')[unVarying].first()
    frames.append(pd.concat([dailyInfo, tmp], axis=1))



In [51]:
daywiseDf = pd.concat(frames, axis=0, ignore_index= True)

In [52]:
daywiseDf.columns

Index(['STATIONS_ID', 'Stationshoehe', 'geoBreite', 'geoLaenge',
       'Stationsname', 'day', 'month', 'month_sin', 'month_cos',
       'day_of_year_sin',
       ...
       'WRTR_14', 'WRTR_15', 'WRTR_16', 'WRTR_17', 'WRTR_18', 'WRTR_19',
       'WRTR_20', 'WRTR_21', 'WRTR_22', 'WRTR_23'],
      dtype='object', length=227)

In [53]:
daywiseDf

Unnamed: 0,STATIONS_ID,Stationshoehe,geoBreite,geoLaenge,Stationsname,day,month,month_sin,month_cos,day_of_year_sin,...,WRTR_14,WRTR_15,WRTR_16,WRTR_17,WRTR_18,WRTR_19,WRTR_20,WRTR_21,WRTR_22,WRTR_23
0,867.0,344,50.3066,10.9679,Lautertal-Oberlauter,1,1,5.000000e-01,0.866025,1.721336e-02,...,,,,,,,,,,
1,867.0,344,50.3066,10.9679,Lautertal-Oberlauter,2,1,5.000000e-01,0.866025,3.442161e-02,...,,,,,,,,,,
2,867.0,344,50.3066,10.9679,Lautertal-Oberlauter,3,1,5.000000e-01,0.866025,5.161967e-02,...,,,,,,,,,,
3,867.0,344,50.3066,10.9679,Lautertal-Oberlauter,4,1,5.000000e-01,0.866025,6.880243e-02,...,,,,,,,,,,
4,867.0,344,50.3066,10.9679,Lautertal-Oberlauter,5,1,5.000000e-01,0.866025,8.596480e-02,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69383,7368.0,312,51.0007,10.3621,Eisenach,27,12,-2.449294e-16,1.000000,-6.880243e-02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69384,7368.0,312,51.0007,10.3621,Eisenach,28,12,-2.449294e-16,1.000000,-5.161967e-02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
69385,7368.0,312,51.0007,10.3621,Eisenach,29,12,-2.449294e-16,1.000000,-3.442161e-02,...,0.0,0.0,0.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0
69386,7368.0,312,51.0007,10.3621,Eisenach,30,12,-2.449294e-16,1.000000,-1.721336e-02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
#Save Data

In [None]:
with open("..\data\stationDataDictionary.pkl", "wb") as f:
    pickle.dump(stationDataDictionary, f)

with open("..\data\missingValueDictionary.pkl", "wb") as f:
    pickle.dump(missingValueDictionary, f)

rowwiseDf.to_csv(r"..\data\rowwiseDf.csv", index=False)
rowwiseDfTransformed.to_csv(r"..\data\rowwiseDfTransformed.csv", index=False)
colwiseDf.to_csv(r"..\data\colwiseDf.csv", index=False)
daywiseDf.to_csv(r"..\data\daywiseDf.csv", index=False)
rowwiseDf.to_pickle(r"..\data\rowwiseDf.pkl")
rowwiseDfTransformed.to_pickle(r"..\data\rowwiseDfTransformed.pkl")
colwiseDf.to_pickle(r"..\data\colwiseDf.pkl")
daywiseDf.to_pickle(r"..\data\daywiseDf.pkl")

In [8]:
import pandas as pd

rowwiseDf = pd.read_pickle(r"..\data\rowwiseDf.pkl")
rowwiseDfTransformed= pd.read_pickle(r"..\data\rowwiseDfTransformed.pkl")
colwiseDf= pd.read_pickle(r"..\data\colwiseDf.pkl")
daywiseDf= pd.read_pickle(r"..\data\daywiseDf.pkl")
rowwiseDf.to_csv(r"..\data\rowwiseDf.csv", index=False)
rowwiseDfTransformed.to_csv(r"..\data\rowwiseDfTransformed.csv", index=False)
colwiseDf.to_csv(r"..\data\colwiseDf.csv", index=False)
daywiseDf.to_csv(r"..\data\daywiseDf.csv", index=False)

In [None]:
"""independendImpulation= {}
dependendImpulation= {}

for key, liste in missingSectionDictionay.items():
    independendGapList= []
    dependendGapList= []
    for gap in liste:
        for measurment in independendMeasurments:
            if (gap[0] == measurment[0] and gap[2] >= measurment[1] and (gap[2] <= measurment[2] or measurment[2] == -1)):
                independendGapList.append((gap[0], gap[1], gap[2], measurment[3]))
    
        for measurment in dependendMeasurments:
            if (gap[0] == measurment[0] and gap[2] >= measurment[1] and (gap[2] <= measurment[2] or measurment[2] == -1)):
                dependendGapList.append((gap[0], gap[1], gap[2], measurment[3]))

    independendImpulation[key]= independendGapList
    dependendImpulation[key]= dependendGapList"""

In [None]:
#print(independendImpulation)

In [None]:
#print(independendMeasurments)

In [None]:
"""#for frameKey, frame in stationDataDictionary.items():
for frameKey, frame in { '00867': stationDataDictionary['00867'] }.items():
    for key, l in independendImpulation.items():
        if key == frameKey:
            for tripel in l:
                start= tripel[1]
                end= tripel[1] + tripel[2]
                
                if tripel[3] == 'Forward Fill':
                    print('Forward Fill', stationDataDictionary[key][tripel[0]][tripel[1]])
                    stationDataDictionary[key].loc[start:end, tripel[0]]= stationDataDictionary[key][tripel[0]].ffill()[start:end]
                    print(stationDataDictionary[key][tripel[0]][tripel[1]])

                elif tripel[3] == 'FillNA(0)':
                    print('FillNA(0)', stationDataDictionary[key].loc[start:end, tripel[0]])
                    stationDataDictionary[key].loc[start:end, tripel[0]]= 0
                    print(stationDataDictionary[key].loc[start:end, tripel[0]])

                elif tripel[3] == 'Zeitbasierte Interpolation':
                    print('Zeitbasierte Interpolation', stationDataDictionary[key].loc[start-1:end, tripel[0]])
                    stationDataDictionary[key].loc[start:end, tripel[0]]= stationDataDictionary[key][tripel[0]].interpolate(method= 'linear')[start:end]
                    print(stationDataDictionary[key].loc[start:end+1, tripel[0]])

                elif tripel[3] == 'Saisonale Mittelwerte':
                    print("Saisonale Mittelwerte", stationDataDictionary[key].loc[start:end, tripel[0]])
                    seasonalMeans= stationDataDictionary[key].groupby(['month', 'hour'])[tripel[0]].mean()

                    for i in range(start, end + 1):
                        m= frame.loc[i, 'month']
                        h= frame.loc[i, 'hour']
                        meanVal= seasonalMeans.get((m, h), np.nan)

                        stationDataDictionary[key].loc[i, tripel[0]]= meanVal
                    print(stationDataDictionary[key].loc[start:end, tripel[0]])"""

In [None]:
"""missingSectionDictionay1= {}

for key, frame in stationDataDictionary.items():
    missing= []

    for col in features:
        row= 0
        count= 0
        tmp= -1

        while row < len(frame[col]):
            if pd.isna(frame[col][row]):
                count+= 1
                if tmp == -1:
                    tmp= row
            elif tmp != -1:
                missing.append((col, row, count))
                count= 0
                tmp= -1
            else:
                count= 0
            
            row+= 1
        
        if count != 0:
            missing.append((col, row, count))

    missingSectionDictionay1[key]= missing"""

In [None]:
"""independendImpulation1= {}

for key, liste in missingSectionDictionay1.items():
    independendGapList= []
    for gap in liste:
        for measurment in independendMeasurments:
            if (gap[0] == measurment[0] and gap[2] >= measurment[1] and (gap[2] <= measurment[2] or measurment[2] == -1)):
                independendGapList.append((gap[0], gap[1], gap[2], measurment[3]))

    independendImpulation1[key]= independendGapList"""

In [None]:
"""for i in independendImpulation1['00867']:
    if i[0] == 'TT_TU':
        print(i)"""

In [None]:
"""keysToDelete = []

for key, frame in stationDataDictionary.items():
    print(f"\n--- Analyse für Station: {key} ---")
    for col in features:
        if col in frame.columns:
            totalLen = len(frame)
            missingMask = frame[col].isna()
            numMissing = missingMask.sum()
            
            if numMissing > 0:
                missingPct = (numMissing / totalLen) * 100

                # Zusammenhängende NaN-Blöcke zählen
                gaps = []
                count = 0
                for val in missingMask:
                    if val:
                        count += 1
                    elif count > 0:
                        gaps.append(count)
                        count = 0
                if count > 0:
                    gaps.append(count)

                gaps_sorted = sorted(gaps, reverse=True)

                print(f"\nSpalte '{col}':")
                print(f"- Fehlend: {numMissing} von {totalLen} Werten ({missingPct:.2f}%)")
                print(f"- Alle Längen (absteigend): {gaps_sorted}")
                print(f"- Anzahl Lücken: {len(gaps_sorted)}")

                if missingPct > maxShareMissingValues * 100:
                    print("\n🚨🚨🚨 STATION GELÖSCHT wegen zu vielen fehlenden Werten! 🚨🚨🚨")
                    keysToDelete.append(key)
                    break

            else:
                print(f"\nSpalte '{col}' hat keine fehlenden Werte.")
        else:
            print(f"\nSpalte '{col}' ist nicht im DataFrame '{key}'.")

for key in keysToDelete:
    del stationDataDictionary[key]"""

In [None]:
#missingSectionDictionay