In [94]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from collections import defaultdict
from functools import reduce
import os
import joblib
import pickle
from pathlib import Path
import torch
import gpytorch

from joblib import Parallel, delayed
from scipy.stats import multivariate_normal
import optuna
from optuna.samplers import TPESampler
from typing import Tuple

In [95]:
#Input Parameters
randomState= 21
nJobs= -1
nTrials= 20
topK= 3
splitPrecentage= 0.8
hyperparameterTrainingSubset= 0.15
notebook_dir = Path().resolve()
storagePath = notebook_dir.parent / "data" / "rowWiseGPModel"
Stations= ["Erfurt-Weimar", "Schmücke", "Eisenach", "Artern", "Neuhaus am Rennweg","Meiningen"]
ReducedJahre= 4

In [96]:
#Laden der Rowwise Daten da bessser geeignet
rowwiseDf = pd.read_pickle("..\\data\\rowwiseDf.pkl")

In [97]:
rowwiseDf.columns

Index(['MESS_DATUM', 'STATIONS_ID', 'TT_TU', 'RF_TU', '  R1', 'RS_IND', 'WRTR',
       '   P', '  P0', '   F', '   D', 'Stationshoehe', 'geoBreite',
       'geoLaenge', 'Stationsname', 'hour', 'day', 'month', 'hour_sin',
       'hour_cos', 'month_sin', 'month_cos', 'day_of_year_sin',
       'day_of_year_cos'],
      dtype='object')

In [98]:
distanceDf= rowwiseDf[['geoBreite', 'geoLaenge', 'Stationsname', 'STATIONS_ID']].drop_duplicates()

In [99]:
lat1 = None
lon1 = None
for _, row in distanceDf.iterrows():
    if row['Stationsname'] == 'Erfurt-Weimar':
        lat1 = np.radians(row['geoBreite'])
        lon1 = np.radians(row['geoLaenge'])
        break  # reicht, wenn wir den ersten passenden Eintrag haben

# 2. Distanz für alle anderen Stationen berechnen
distances = []
for _, row in distanceDf.iterrows():
    if row['Stationsname'] == 'Erfurt-Weimar':
        continue
    
    lat2 = np.radians(row['geoBreite'])
    lon2 = np.radians(row['geoLaenge'])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    distance = 2 * 6371 * np.arcsin(np.sqrt(a))  # in km
    
    distances.append({
        'Stationsname': row['Stationsname'],
        'distance_km': distance
    })

# 3. In DataFrame umwandeln und sortieren
distance_result = pd.DataFrame(distances).sort_values('distance_km')
print(distance_result)

            Stationsname  distance_km
11              Schmücke    38.908361
17              Eisenach    41.949378
9                 Artern    49.275834
15    Neuhaus am Rennweg    55.036985
12             Meiningen    62.326012
8             Leinefelde    64.219198
14             Osterfeld    68.707104
16               Schleiz    75.136254
0   Lautertal-Oberlauter    75.202795
13            Harzgerode    75.396944
10         Gera-Leumnitz    82.636570
1              Braunlage    86.024839
5          Hersfeld, Bad    86.966964
4            Wasserkuppe    89.710348
2              Göttingen    90.844191
7            Wernigerode    96.827360
3                    Hof    98.609755
6                 Plauen    99.376594


In [100]:
distanceDf

Unnamed: 0,geoBreite,geoLaenge,Stationsname,STATIONS_ID
0,50.3066,10.9679,Lautertal-Oberlauter,867.0
87648,51.7234,10.6021,Braunlage,656.0
175296,51.5002,9.9507,Göttingen,1691.0
262944,50.3123,11.876,Hof,2261.0
350592,50.4973,9.9427,Wasserkuppe,5371.0
438240,50.9829,10.9608,Erfurt-Weimar,1270.0
525888,50.852,9.7377,"Hersfeld, Bad",2171.0
613536,50.4818,12.13,Plauen,3946.0
701184,51.8454,10.7686,Wernigerode,5490.0
788832,51.3932,10.3123,Leinefelde,2925.0


In [101]:
#reduziere Daten größe damit Modell berechnbar bleibt
reducedDf= rowwiseDf.copy()
reducedDf['MESS_DATUM']= pd.to_datetime(reducedDf['MESS_DATUM'])

maxDate= reducedDf['MESS_DATUM'].max()
cutoffDate= maxDate - pd.DateOffset(years= ReducedJahre)

reducedDf= reducedDf[(reducedDf['Stationsname'].isin(Stations)) & (reducedDf['MESS_DATUM'] >= cutoffDate)]


In [102]:
reducedDf['time_hours'] = (reducedDf['MESS_DATUM'] - reducedDf['MESS_DATUM'].min()).dt.total_seconds() / 3600

In [103]:
#month wird gedropt da hoch koriliert mit day of year

In [104]:
reducedDf= reducedDf.drop(columns= ['MESS_DATUM', 'STATIONS_ID', 'RS_IND', 'WRTR','   P', '   D', 'Stationsname', 'hour', 'day', 'month', 'month_sin', 'month_cos'])

In [105]:
reducedDf.columns

Index(['TT_TU', 'RF_TU', '  R1', '  P0', '   F', 'Stationshoehe', 'geoBreite',
       'geoLaenge', 'hour_sin', 'hour_cos', 'day_of_year_sin',
       'day_of_year_cos', 'time_hours'],
      dtype='object')

In [106]:
X= reducedDf.values

In [107]:
#Nutzen eines Scalers für alle Datensätze führt zu Data Leakage aber das unvermeit bar wenn nicht jedesmal tueres hyperparameter tuning betrieben werden soll
scaler = StandardScaler()
xScaled= scaler.fit_transform(X)

In [108]:
metaFeaturetraingDf= reducedDf.sample(frac= hyperparameterTrainingSubset, random_state= randomState)

In [109]:
len(metaFeaturetraingDf.index)

31558

In [110]:
XMeta= metaFeaturetraingDf.values.astype(np.float32)

In [111]:
xScaledMeta= scaler.transform(XMeta)

In [112]:
y_proxy = metaFeaturetraingDf['TT_TU'].values.astype(np.float32)

In [113]:
X_train, X_val, y_train, y_val = train_test_split(xScaledMeta, y_proxy, train_size=splitPrecentage, random_state=randomState)

X_train = torch.tensor(X_train)
y_train = torch.tensor(y_train)
X_val = torch.tensor(X_val)
y_val = torch.tensor(y_val)

In [114]:
class ExactGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood, ard_num_dims, hyperparams):
        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)

        matern = gpytorch.kernels.MaternKernel(nu=hyperparams['nu'], ard_num_dims=ard_num_dims)
        periodic = gpytorch.kernels.PeriodicKernel(ard_num_dims=ard_num_dims)
        rbf = gpytorch.kernels.RBFKernel(ard_num_dims=ard_num_dims)

        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = matern + periodic + rbf

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

In [115]:
def objective(trial):
    # Hyperparameter-Raum
    nu = trial.suggest_categorical("nu", [0.5, 1.5, 2.5])
    lr = trial.suggest_loguniform("lr", 1e-4, 1e-2)
    training_iter = trial.suggest_int("training_iter", 50, 150)

    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = ExactGPModel(X_train, y_train, likelihood, ard_num_dims=X_train.shape[1],
                         hyperparams={'nu': nu})

    model.train()
    likelihood.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    # Training
    for i in range(training_iter):
        optimizer.zero_grad()
        output = model(X_train)
        loss = -mll(output, y_train)
        loss.backward()
        optimizer.step()

    # Validation Loss (Negative Log-Likelihood)
    model.eval()
    likelihood.eval()
    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        val_output = model(X_val)
        val_loss = -mll(val_output, y_val).item()

    return val_loss

In [116]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

[I 2025-08-11 23:20:22,117] A new study created in memory with name: no-name-a3fe86b5-e35b-4f59-8c05-3d377eed1397
  lr = trial.suggest_loguniform("lr", 1e-4, 1e-2)
[W 2025-08-11 23:21:20,160] Trial 0 failed with parameters: {'nu': 0.5, 'lr': 0.0003419327889161724, 'training_iter': 75} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\johan\AppData\Local\Programs\Python\Python310\lib\site-packages\optuna\study\_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\johan\AppData\Local\Temp\ipykernel_12452\3048491020.py", line 21, in objective
    loss = -mll(output, y_train)
  File "c:\Users\johan\AppData\Local\Programs\Python\Python310\lib\site-packages\gpytorch\module.py", line 82, in __call__
    outputs = self.forward(*inputs, **kwargs)
  File "c:\Users\johan\AppData\Local\Programs\Python\Python310\lib\site-packages\gpytorch\mlls\exact_marginal_log_likelihood.py", line 82, in forward
    res = o

KeyboardInterrupt: 

In [None]:
print("Beste Hyperparameter:", study.best_params)