In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from collections import defaultdict
from functools import reduce
import os

import seaborn as sns
import matplotlib.pyplot as plt

import pickle

In [None]:
#Eingabe Parameter

In [None]:
# Analysis length. NOTE(review): the unit is not stated and the value is not
# read by any cell visible here - confirm where it is consumed.
längeAnalyse = 10

# First and last hourly timestamp of the analysis window.
startZeit = pd.to_datetime("2014-01-01 00:00")
endZeit = pd.to_datetime("2023-12-31 23:00")

# Largest tolerated share of missing values per feature column
# (10% is unproblematic, 30% is probably too much).
maxShareMissingValues = 0.1

# Measurement columns that are checked for gaps; the leading blanks are part
# of the column names.
features = [
    'TT_TU',
    'RF_TU',
    '  R1',
    '  P0',
    '   F',
]

# Identifying / metadata columns that must be complete for a station to be kept.
stationFeatures = [
    'MESS_DATUM',
    'STATIONS_ID',
    'Stationshoehe',
    'geoBreite',
    'geoLaenge',
    'Stationsname',
]

In [None]:
#Realistische Wertebereiche

#Lufttemperatur -38 -- 45 ('TT_TU')
#Relative Luftfeuchtigkeit 0 -- 100 ('RF_TU')
#Niederschlagshöhe 0 -- 245 ('  R1')
#Niederschlagsindikator 0 -- 1 ('RS_IND')
#Niederschlagsform 0 -- 9 ('WRTR')
#Luftdruck Meereshöhe 800 -- 1100 ('   P')
#Luftdruck 800 -- 1100 ('  P0')
#Windgeschwindigkeit 0 -- 350 ('   F')
#Windrichtung 0 -- 360 ('   D')

In [None]:
# Plausible [minimum, maximum] per measurement column (both bounds inclusive
# where the ranges are applied); values outside are treated as missing.
werteBereicheDictionary = dict([
    ('TT_TU', [-38, 45]),     # air temperature
    ('RF_TU', [0, 100]),      # relative humidity
    ('  R1', [0, 245]),       # precipitation amount
    ('RS_IND', [0, 1]),       # precipitation indicator
    ('WRTR', [0, 9]),         # precipitation form
    ('   P', [800, 1100]),    # air pressure at sea level
    ('  P0', [800, 1100]),    # air pressure
    ('   F', [0, 350]),       # wind speed
    ('   D', [0, 360]),       # wind direction
])


In [None]:
#Imputation Methoden
#                           1h - 3h         3h - 1 d                    1d - 3d                     > 3d
#TT_TU(Temperature)         Forward Fill    Zeitbasierte Interpolation  Zeitbasierte Interpolation  Lineare Regression
#RF_TU(Relative Humidity)   Forward Fill    Zeitbasierte Interpolation  Zeitbasierte Interpolation  Random Forest Imputation
#R1(Precipitation)          FillNA(0)       Zeitbasierte Interpolation  Saisonale Mittelwerte       Lineare Regression
#P0(Pressure)               Forward Fill    Zeitbasierte Interpolation  Zeitbasierte Interpolation  Lineare Regression
#F(Windspeed)               Forward Fill    Zeitbasierte Interpolation  Zeitbasierte Interpolation  Random Forest Imputation

In [None]:
#independendMeasurments= [("TT_TU", 1, 3, "Forward Fill"), ("RF_TU", 1, 3, "Forward Fill"), ("  R1", 1, 3, "FillNA(0)"), ("  P0", 1, 3, "Forward Fill"), ("   F", 1, 3, "Forward Fill"), ("TT_TU", 4, 72, "Zeitbasierte Interpolation"), ("RF_TU", 4, 72, "Zeitbasierte Interpolation"), ("  R1", 4, 24, "Zeitbasierte Interpolation"), ("  P0", 4, 72, "Zeitbasierte Interpolation"), ("   F", 4, 72, "Zeitbasierte Interpolation"), ("  R1", 25, 72, "Saisonale Mittelwerte")]
#dependendMeasurments= [("TT_TU", 73, -1, "Lineare Regression"), ("RF_TU", 73, -1, "Random Forest Imputation"), ("  R1", 73, -1, "Lineare Regression"), ("  P0", 73, -1, "Lineare Regression"), ("   F", 73, -1, "Random Forest Imputation")]

In [None]:
#Files Einlesen (Dictionary pro Typ und Geodaten einzeln)

In [None]:
# Read every ".txt" file of the "TU" data folder into a dictionary of DataFrames.
# The folder is assembled with os.path.join: the former literal "..\data\TU"
# relied on the invalid escape sequence "\d" (a warning on current Python
# versions) and only resolved on Windows.
folder_path = os.path.join("..", "data", "TU")

txt_files = [f for f in os.listdir(folder_path) if f.endswith(".txt")]

dataframesTU = {}

for file_name in txt_files:
    file_path = os.path.join(folder_path, file_name)

    df = pd.read_csv(file_path, sep=";")

    # Dictionary key: "<second token>_<last token>" of the underscore-separated
    # file name (extension removed).
    file_base = os.path.splitext(file_name)[0]
    parts = file_base.split("_")
    key = f"{parts[1]}_{parts[-1]}"

    dataframesTU[key] = df

In [None]:
# Read every ".txt" file of the "RR" data folder into a dictionary of DataFrames.
# The folder is assembled with os.path.join: the former literal "..\data\RR"
# relied on invalid escape sequences ("\d", "\R") and only resolved on Windows.
folder_path = os.path.join("..", "data", "RR")

txt_files = [f for f in os.listdir(folder_path) if f.endswith(".txt")]

dataframesRR = {}

for file_name in txt_files:
    file_path = os.path.join(folder_path, file_name)

    df = pd.read_csv(file_path, sep=";")

    # Dictionary key: "<second token>_<last token>" of the underscore-separated
    # file name (extension removed).
    file_base = os.path.splitext(file_name)[0]
    parts = file_base.split("_")
    key = f"{parts[1]}_{parts[-1]}"

    dataframesRR[key] = df

In [None]:
# Read every ".txt" file of the "P0" data folder into a dictionary of DataFrames.
# The folder is assembled with os.path.join: the former literal "..\data\P0"
# relied on invalid escape sequences ("\d", "\P") and only resolved on Windows.
folder_path = os.path.join("..", "data", "P0")

txt_files = [f for f in os.listdir(folder_path) if f.endswith(".txt")]

dataframesP0 = {}

for file_name in txt_files:
    file_path = os.path.join(folder_path, file_name)

    df = pd.read_csv(file_path, sep=";")

    # Dictionary key: "<second token>_<last token>" of the underscore-separated
    # file name (extension removed).
    file_base = os.path.splitext(file_name)[0]
    parts = file_base.split("_")
    key = f"{parts[1]}_{parts[-1]}"

    dataframesP0[key] = df

In [None]:
# Read every ".txt" file of the "FF" data folder into a dictionary of DataFrames.
# The folder is assembled with os.path.join: the former literal "..\data\FF"
# relied on invalid escape sequences ("\d", "\F") and only resolved on Windows.
folder_path = os.path.join("..", "data", "FF")

txt_files = [f for f in os.listdir(folder_path) if f.endswith(".txt")]

dataframesFF = {}

for file_name in txt_files:
    file_path = os.path.join(folder_path, file_name)

    df = pd.read_csv(file_path, sep=";")

    # Dictionary key: "<second token>_<last token>" of the underscore-separated
    # file name (extension removed).
    file_base = os.path.splitext(file_name)[0]
    parts = file_base.split("_")
    key = f"{parts[1]}_{parts[-1]}"

    dataframesFF[key] = df

In [None]:
# Bundle the four per-folder frame dictionaries under their data-type key.
dataDictionary = dict(
    TU=dataframesTU,
    RR=dataframesRR,
    P0=dataframesP0,
    FF=dataframesFF,
)

In [None]:
# Column names assigned to the fixed-width station description file.
columnNames = ["Stations_id", "von_datum", "bis_datum", "Stationshoehe","geoBreite", "geoLaenge", "Stationsname", "Bundesland", "Abgabe"]

# Path built with os.path.join: the former backslash literal relied on invalid
# escape sequences ("\d", "\M", "\T") and only resolved on Windows.
stationListPath = os.path.join("..", "data", "Metha List", "TU_Stundenwerte_Beschreibung_Stationen.txt")

# The first two lines of the file are skipped and the names above are used instead.
methaDataTUtxt = pd.read_fwf(stationListPath, encoding="iso-8859-1", skiprows=2, names=columnNames)

# Keep only the station id, elevation, coordinates and name.
# read_fwf already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
methaDataTU = methaDataTUtxt.drop(columns=["von_datum", "bis_datum", "Bundesland", "Abgabe"])

In [None]:
#Datatype conversion

In [None]:
# Parse the timestamp column, order each frame chronologically and remove the
# end-of-record marker plus all "QN_*" columns.
for dataKey, frameMap in dataDictionary.items():
    for key in frameMap:
        frame = frameMap[key]

        # Converted in place on the loaded frame; 'MESS_DATUM' is parsed as YYYYMMDDHH.
        frame['MESS_DATUM'] = pd.to_datetime(frame['MESS_DATUM'].astype(str), format='%Y%m%d%H')

        qualityCols = [col for col in frame.columns if col.startswith("QN_")]
        frameMap[key] = (
            frame
            .sort_values('MESS_DATUM')
            .drop('eor', axis=1)
            .drop(columns=qualityCols)
        )

In [None]:
#Zur vollen Timeline auffüllen

In [None]:
# Complete hourly grid from startZeit to endZeit.
# The lowercase alias 'h' replaces 'H', which pandas deprecated in version 2.2.
timeFrame = pd.date_range(start=startZeit, end=endZeit, freq='h')
dfRange = pd.DataFrame({'MESS_DATUM': timeFrame})

# Left-join every frame onto the grid: hours without a record become rows of
# NaN, and records outside the window are dropped.
for dataKey, dataFrames in dataDictionary.items():
    for key, frames in dataFrames.items():
        dataFrames[key] = dfRange.merge(frames, on='MESS_DATUM', how='left')

In [None]:
# Report, per loaded file, how many rows contain at least one missing value.
for frameMap in dataDictionary.values():
    for key, frames in frameMap.items():
        rowsWithGaps = frames.isna().any(axis=1).sum()
        print(key, rowsWithGaps)

In [None]:
#Pro Station Mergen

In [None]:
# Collect, per station id, the (data type, frame) pairs of all data types.
stationGroups = defaultdict(list)

for dataKey in dataDictionary:
    for key, frames in dataDictionary[dataKey].items():
        # The frame key has the form "<token>_<station id>".
        stationId = key.split("_")[1]
        stationGroups[stationId].append((dataKey, frames))

In [None]:
# Merge all data types of a station into one frame on the shared timestamp.
stationDataDictionary = {}

for stationId, framesList in stationGroups.items():
    # Only the "FF" frame keeps the plain STATIONS_ID column; the other data
    # types get theirs prefixed with the data-type key so the merge does not clash.
    renamedFrames = [
        frames.copy() if dataKey == "FF"
        else frames.rename(columns={"STATIONS_ID": f"{dataKey}_STATIONS_ID"})
        for dataKey, frames in framesList
    ]

    dfMerged = reduce(
        lambda left, right: pd.merge(left, right, on='MESS_DATUM', how='outer'),
        renamedFrames,
    )

    # Move STATIONS_ID to the second position.
    cols = dfMerged.columns.tolist()
    cols.remove("STATIONS_ID")
    cols.insert(1, "STATIONS_ID")
    stationDataDictionary[stationId] = dfMerged[cols]

In [None]:
# Remove the prefixed "<data type>_STATIONS_ID" helper columns from every station frame.
for key in stationDataDictionary:
    stationFrame = stationDataDictionary[key]
    stationIdColsToDrop = [col for col in stationFrame.columns if col.endswith('_STATIONS_ID')]
    stationDataDictionary[key] = stationFrame.drop(columns=stationIdColsToDrop)

In [None]:
# Fill STATIONS_ID with its most frequent value and attach the station metadata.
for key in stationDataDictionary:
    stationFrame = stationDataDictionary[key]

    # Every row gets the column's mode, which also fills rows left empty by the merges.
    stationFrame['STATIONS_ID'] = stationFrame['STATIONS_ID'].mode().iloc[0]

    withMetadata = pd.merge(
        stationFrame,
        methaDataTU,
        left_on='STATIONS_ID',
        right_on='Stations_id',
        how='left',
    )
    # The right-hand join key duplicates STATIONS_ID and is dropped again.
    stationDataDictionary[key] = withMetadata.drop(columns=["Stations_id"])

In [None]:
# Add calendar columns and their cyclic sine/cosine encodings to every station frame.
for stationFrame in stationDataDictionary.values():
    timestamps = stationFrame['MESS_DATUM']

    stationFrame['hour'] = timestamps.dt.hour
    stationFrame['day'] = timestamps.dt.day
    stationFrame['month'] = timestamps.dt.month

    # Angle on a 24-hour / 12-month circle.
    hourAngle = 2 * np.pi * stationFrame['hour'] / 24
    monthAngle = 2 * np.pi * stationFrame['month'] / 12

    stationFrame['hour_sin'] = np.sin(hourAngle)
    stationFrame['hour_cos'] = np.cos(hourAngle)
    stationFrame['month_sin'] = np.sin(monthAngle)
    stationFrame['month_cos'] = np.cos(monthAngle)

In [None]:
# List the columns of every merged station frame.
for stationKey in stationDataDictionary:
    print(stationKey, stationDataDictionary[stationKey].columns)

In [None]:
# Debug output: print the whole station dictionary (station id -> merged DataFrame).
print(stationDataDictionary)

In [None]:
#Unrealistische Werte als Missingvalues deklarieren

In [None]:
# Replace values outside the plausible range of their column with NaN.
for stationFrame in stationDataDictionary.values():
    for spalte, (min_wert, max_wert) in werteBereicheDictionary.items():
        if spalte not in stationFrame.columns:
            continue
        # between() is inclusive on both ends; NaN entries are never in range.
        inRange = stationFrame[spalte].between(min_wert, max_wert)
        stationFrame[spalte] = stationFrame[spalte].where(inRange)

In [None]:
#Stationen mit zu vielen fehlenden Werten herausfiltern

In [None]:
# Remove every station whose identifying / metadata columns contain missing values.
# The loop runs over a snapshot of the items: deleting from the dictionary while
# iterating over it directly raises
# "RuntimeError: dictionary changed size during iteration".
for key, frame in list(stationDataDictionary.items()):
    for col in stationFeatures:
        if col in frame.columns:
            missingIndices = frame[frame[col].isna()].index.tolist()
            if missingIndices:
                print(f"Spalte '{col}' hat {len(missingIndices)} fehlende Werte an den Zeilen:")
                print(missingIndices)
                del stationDataDictionary[key]
                # One incomplete column is enough; stop checking this station.
                break
        else:
            print(f"Spalte '{col}' ist nicht im DataFrame.")

In [None]:
# Analyse the gaps of every feature column per station.
# Stations where any feature exceeds maxShareMissingValues are removed;
# for the remaining stations, missingValueDictionary maps "<station><column>"
# to (station, column, missing percentage, gap lengths in descending order).
keysToDelete = []
missingValueDictionary= {}

for key, frame in stationDataDictionary.items():
    print(f"\n--- Analyse für Station: {key} ---")

    # Entries are staged per station and only committed if the station is kept.
    # Otherwise columns analysed before the rejecting column would leave entries
    # for a station that no longer exists in stationDataDictionary.
    stationEntries = {}
    stationRejected = False

    for col in features:
        if col in frame.columns:
            totalLen = len(frame)
            missingMask = frame[col].isna()
            numMissing = missingMask.sum()

            if numMissing > 0:
                missingPct = (numMissing / totalLen) * 100

                # Count the lengths of contiguous NaN runs
                gaps = []
                count = 0
                for val in missingMask:
                    if val:
                        count += 1
                    elif count > 0:
                        gaps.append(count)
                        count = 0
                # A run that reaches the end of the column is closed here
                if count > 0:
                    gaps.append(count)

                gaps_sorted = sorted(gaps, reverse=True)

                print(f"\nSpalte '{col}':")
                print(f"- Fehlend: {numMissing} von {totalLen} Werten ({missingPct:.2f}%)")
                print(f"- Alle Längen (absteigend): {gaps_sorted}")
                print(f"- Anzahl Lücken: {len(gaps_sorted)}")

                if missingPct > maxShareMissingValues * 100:
                    print("\n🚨🚨🚨 STATION GELÖSCHT wegen zu vielen fehlenden Werten! 🚨🚨🚨")
                    keysToDelete.append(key)
                    stationRejected = True
                    break

                stationEntries[str(key) + str(col)]= (key, col, missingPct, gaps_sorted)
            else:
                print(f"\nSpalte '{col}' hat keine fehlenden Werte.")
        else:
            print(f"\nSpalte '{col}' ist nicht im DataFrame '{key}'.")

    if not stationRejected:
        missingValueDictionary.update(stationEntries)

# Deletion happens after the loop so the dictionary is not resized while iterating.
for key in keysToDelete:
    del stationDataDictionary[key]

In [None]:
#Dataframe Combination

In [None]:
# Row-wise combination: stack all remaining station frames under a fresh 0..n-1 index.
stationFrames = list(stationDataDictionary.values())
rowwiseDf = pd.concat(stationFrames, ignore_index=True)

In [None]:
# Column-wise combination: place the station frames side by side. Passing the
# dictionary makes the station id the outer level of the column index.
colwiseDf = pd.concat(stationDataDictionary, axis=1)

# Flatten the (station, column) pairs to "<column>_<station>".
flatColumnNames = []
for stationId, columnName in colwiseDf.columns:
    flatColumnNames.append(f"{columnName}_{stationId}")
colwiseDf.columns = flatColumnNames

In [None]:
# Debug output: show the column-wise combined frame.
print(colwiseDf)

In [None]:
#Save Data

In [89]:
# Persist the prepared data as pickle files in the data folder.
# All paths are built with os.path.join: the former non-raw backslash literals
# relied on invalid escape sequences ("\d", "\s", "\m") and only resolved on Windows.
outputDir = os.path.join("..", "data")

with open(os.path.join(outputDir, "stationDataDictionary.pkl"), "wb") as f:
    pickle.dump(stationDataDictionary, f)

with open(os.path.join(outputDir, "missingValueDictionary.pkl"), "wb") as f:
    pickle.dump(missingValueDictionary, f)

rowwiseDf.to_pickle(os.path.join(outputDir, "rowwiseDf.pkl"))
colwiseDf.to_pickle(os.path.join(outputDir, "colwiseDf.pkl"))

In [None]:
#Missing Value Treatment

In [None]:
# Disabled draft kept as a bare string literal - the code inside is not executed.
# It walks every feature column of every station frame and records one
# (column, row, gap length) tuple per contiguous run of NaN values.
# NOTE(review): the stored `row` is the index just after the gap (or the column
# length for a trailing gap); the gap start held in `tmp` is never stored.
# Confirm which of the two is wanted before re-enabling.
"""missingSectionDictionay= {}

for key, frame in stationDataDictionary.items():
    missing= []

    for col in features:
        row= 0
        count= 0
        tmp= -1

        while row < len(frame[col]):
            if pd.isna(frame[col][row]):
                count+= 1
                if tmp == -1:
                    tmp= row
            elif tmp != -1:
                missing.append((col, row, count))
                count= 0
                tmp= -1
            else:
                count= 0
            
            row+= 1
        
        if count != 0:
            missing.append((col, row, count))

    missingSectionDictionay[key]= missing"""

In [None]:
# Disabled draft kept as a bare string literal - the code inside is not executed.
# For every recorded gap it looks up the rules in independendMeasurments and
# dependendMeasurments whose column matches and whose length range contains the
# gap length (an upper bound of -1 means "no upper bound"), and stores
# (column, row, gap length, method name) per station.
# NOTE(review): both rule lists are themselves commented out earlier in this
# notebook and must be restored before this cell can run.
"""independendImpulation= {}
dependendImpulation= {}

for key, liste in missingSectionDictionay.items():
    independendGapList= []
    dependendGapList= []
    for gap in liste:
        for measurment in independendMeasurments:
            if (gap[0] == measurment[0] and gap[2] >= measurment[1] and (gap[2] <= measurment[2] or measurment[2] == -1)):
                independendGapList.append((gap[0], gap[1], gap[2], measurment[3]))
    
        for measurment in dependendMeasurments:
            if (gap[0] == measurment[0] and gap[2] >= measurment[1] and (gap[2] <= measurment[2] or measurment[2] == -1)):
                dependendGapList.append((gap[0], gap[1], gap[2], measurment[3]))

    independendImpulation[key]= independendGapList
    dependendImpulation[key]= dependendGapList"""

In [None]:
#print(independendImpulation)
#print(dependendImpulation)

In [None]:
#print(independendMeasurments)

In [None]:
# Disabled draft kept as a bare string literal - the code inside is not executed.
# It applies the assigned method (forward fill, constant 0, linear interpolation
# or month/hour mean) to each gap listed in independendImpulation.
# NOTE(review): the guard `key != frameKey` runs the fills for every station
# except the one currently iterated, so each gap is processed once per other
# station and `frame` (used for the month/hour lookup) belongs to a different
# station than the one being written - this looks inverted; confirm before re-enabling.
# NOTE(review): `start` is read from the second tuple element and the range is
# taken as start..start+length; check that this matches how the gap tuples are built.
"""for frameKey, frame in stationDataDictionary.items():
    for key, liste in independendImpulation.items():
        if key != frameKey:
            for tripel in liste:
                start= tripel[1]
                end= tripel[1] + tripel[2]
                
                if tripel[3] == 'Forward Fill':
                    stationDataDictionary[key].loc[start:end, tripel[0]]= stationDataDictionary[key][tripel[0]].ffill()[start:end]

                elif tripel[3] == 'FillNA(0)':
                    stationDataDictionary[key].loc[start:end, tripel[0]]= 0

                elif tripel[3] == 'Zeitbasierte Interpolation':
                    stationDataDictionary[key].loc[start:end, tripel[0]]= stationDataDictionary[key][tripel[0]].interpolate(method= 'linear')[start:end]

                elif tripel[3] == 'Saisonale Mittelwerte':
                    seasonalMeans= stationDataDictionary[key].groupby(['month', 'hour'])[tripel[0]].mean()

                    for i in range(start, end + 1):
                        m= frame.loc[i, 'month']
                        h= frame.loc[i, 'hour']
                        meanVal= seasonalMeans.get((m, h), np.nan)

                        stationDataDictionary[key].loc[i, tripel[0]]= meanVal"""

In [None]:
# Disabled draft kept as a bare string literal - the code inside is not executed.
# It repeats the per-station gap report and the removal of stations above
# maxShareMissingValues, but does not fill missingValueDictionary.
"""keysToDelete = []

for key, frame in stationDataDictionary.items():
    print(f"\n--- Analyse für Station: {key} ---")
    for col in features:
        if col in frame.columns:
            totalLen = len(frame)
            missingMask = frame[col].isna()
            numMissing = missingMask.sum()
            
            if numMissing > 0:
                missingPct = (numMissing / totalLen) * 100

                # Zusammenhängende NaN-Blöcke zählen
                gaps = []
                count = 0
                for val in missingMask:
                    if val:
                        count += 1
                    elif count > 0:
                        gaps.append(count)
                        count = 0
                if count > 0:
                    gaps.append(count)

                gaps_sorted = sorted(gaps, reverse=True)

                print(f"\nSpalte '{col}':")
                print(f"- Fehlend: {numMissing} von {totalLen} Werten ({missingPct:.2f}%)")
                print(f"- Alle Längen (absteigend): {gaps_sorted}")
                print(f"- Anzahl Lücken: {len(gaps_sorted)}")

                if missingPct > maxShareMissingValues * 100:
                    print("\n🚨🚨🚨 STATION GELÖSCHT wegen zu vielen fehlenden Werten! 🚨🚨🚨")
                    keysToDelete.append(key)
                    break

            else:
                print(f"\nSpalte '{col}' hat keine fehlenden Werte.")
        else:
            print(f"\nSpalte '{col}' ist nicht im DataFrame '{key}'.")

for key in keysToDelete:
    del stationDataDictionary[key]"""

In [None]:
#missingSectionDictionay