In [3]:
import numpy as np
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
from functools import reduce
import os
import joblib
import pickle
from pathlib import Path

from joblib import Parallel, delayed
from scipy.stats import multivariate_normal
import optuna
from optuna.samplers import TPESampler
from typing import Tuple

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
#Input Parameters
randomState= 21
nComponents = list(range(1, 3)) #sinvolle Werte 1-20
nInit = list(range(1, 11)) #sinvolle Werte 1-10
nIter = [50, 100, 150, 200, 250, 300] #sinvolle Werte 50-300
nInitFirstStep= 5
nIterFirstStep= 150
nJobs= -1
nTrials= 20
topK= 3
notebook_dir = Path().resolve()
storagePath = notebook_dir.parent / "data" / "rowWiseModel"


In [None]:
#Laden der Rowwise Daten da bessser geeignet
rowwiseDf = pd.read_pickle("..\\data\\rowwiseDf.pkl")

In [6]:
rowwiseDf= rowwiseDf.drop(columns= ['MESS_DATUM', 'STATIONS_ID', 'RS_IND', 'WRTR','   P', '   D', 'Stationsname', 'hour', 'day', 'month'])

In [7]:
rowwiseDf.columns

Index(['TT_TU', 'RF_TU', '  R1', '  P0', '   F', 'Stationshoehe', 'geoBreite',
       'geoLaenge', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos',
       'day_of_year_sin', 'day_of_year_cos'],
      dtype='object')

In [10]:
rowwiseDf.isnull().sum().sum()

np.int64(0)

In [8]:
X = rowwiseDf.values

In [None]:
scaler= StandardScaler()
xScaled= scaler.fit_transform(X)

In [None]:
os.makedirs(storagePath, exist_ok=True)

In [None]:
def fit_gmm_and_save(X, n_components, modelDir):
    model = GaussianMixture(n_components=n_components, n_init= nInitFirstStep, max_iter= nIterFirstStep, covariance_type='full', random_state=randomState)
    model.fit(X)

    modelPath = os.path.join(modelDir, f"gmm_{n_components}_components.pkl")
    with open(modelPath, 'wb') as f:
        pickle.dump(model, f)

    return {
        "n_components": n_components,
        "bic": model.bic(X),
        "aic": model.aic(X),
        "weights": model.weights_,
        "converged": model.converged_,
        "n_iter": model.n_iter_,
        "model_path": modelPath
    }

In [None]:
def train_gmm_parallel(X, componentRange, modelDir="gmm_models", n_jobs= -1):
    os.makedirs(modelDir, exist_ok=True)

    results = Parallel(n_jobs=n_jobs)(
        delayed(fit_gmm_and_save)(X, nComp, modelDir)
        for nComp in componentRange
    )

    return pd.DataFrame(results)

In [None]:
if os.path.exists(storagePath/"gmm_results.csv"):
    resultsDfExisting = pd.read_csv(storagePath/"gmm_results.csv")
else:
    resultsDfExisting = pd.DataFrame()

existingModels = set()
for f in os.listdir(storagePath):
    if f.endswith(".pkl") and "components" in f:
        parts = f.split("_")
        try:
            # Erwartet: 'gmm_1_components.pkl' => parts[0] = 'gmm', parts[1] = '1', parts[2] = 'components.pkl'
            n_comp = int(parts[1])
            existingModels.add(n_comp)
        except (IndexError, ValueError):
            continue
pending = [n for n in nComponents if n not in existingModels]

if pending:
    resultsDfNew  = train_gmm_parallel(X, pending, modelDir= storagePath)
else:
    print("Alle Modelle bereits trainiert.")

results_df = pd.concat([resultsDfExisting, resultsDfNew]).drop_duplicates("n_components")
results_df.to_csv(storagePath/"gmm_results.csv", index=False)


In [None]:
topComponents = results_df.nsmallest(topK, "bic")["n_components"].tolist()

In [None]:
def objective(trial, X, n_components, model_dir):
    n_init = trial.suggest_categorical("n_init", nInit)
    max_iter = trial.suggest_categorical("max_iter", nIter)

    model_filename = f"gmm_{n_components}_opt_{n_init}_{max_iter}.pkl"
    model_path = os.path.join(model_dir, model_filename)

    if os.path.exists(model_path):
        print(f"⚠️ Modell bereits vorhanden: {model_filename} — wird übersprungen.")
        raise optuna.TrialPruned()

    model = GaussianMixture(n_components=n_components, 
                            n_init=n_init, 
                            max_iter=max_iter, 
                            random_state=42)
    model.fit(X)
    
    # Modell speichern
    model_path = os.path.join(model_dir, f"gmm_{n_components}_opt_{n_init}_{max_iter}.pkl")
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

    return model.bic(X)

In [None]:
studies= []
for i, n_comp in enumerate(topComponents):
    study_name = f"gmm_{n_comp}_opt"
    storage_path = f"sqlite:///{os.path.join(storagePath, f'{study_name}.db')}"
    
    studies[i] = optuna.create_study(direction="minimize", study_name=study_name, 
                                storage=storage_path, load_if_exists=True)

    studies[i].optimize(lambda trial: objective(trial, X, n_comp, storagePath), 
                   n_trials=20, n_jobs= nJobs,
                   catch=(Exception,))

[I 2025-07-02 22:43:59,431] A new study created in RDB with name: gmm_2_opt


In [None]:
for study in studies:
    params= study.best_params

ValueError: Record does not exist.

In [None]:
"""model = GaussianMixture(
    n_components=params["n_components"],
    n_init=params["n_init"],
    max_iter=params["max_iter"]
)
model.fit(xScaled)

# BIC ausgeben
best_bic = model.bic(xScaled)
print("Bester BIC:", best_bic)"""

Bester BIC: 15452547.706960948


In [None]:
"""modelPath = os.path.join("..", "data", "gmm_model.pkl")
studyPath = os.path.join("..", "data", "gmm_study.pkl")

joblib.dump({'scaler': scaler, 'gmm': model}, modelPath)
joblib.dump(study, studyPath)"""

['..\\data\\gmm_study.pkl']

In [None]:
"""tmp = pd.DataFrame(X, columns= rowwiseDf.columns)
real_df = tmp[(tmp['Stationshoehe'] == 316) & (tmp['geoBreite'].sub(50.9829).abs() < 1e-4) & (tmp['geoLaenge'].sub(10.9608).abs() < 1e-4)]
fake_df = pd.DataFrame(samplesOriginal, columns= rowwiseDf.columns)

comparison = pd.DataFrame({
    'mean_real': real_df.mean(),
    'mean_fake': fake_df.mean(),
    'std_real': real_df.std(),
    'std_fake': fake_df.std(),
    'max_real': real_df.max(),
    'max_fake': fake_df.max(),
    'min_real': real_df.min(),
    'min_fake': fake_df.min()
})

print(comparison)"""

                    mean_real   mean_fake      std_real      std_fake  \
TT_TU            9.964315e+00    9.564061  7.902395e+00  7.647740e+00   
RF_TU            7.612188e+01   77.373365  1.719759e+01  1.740857e+01   
  R1             5.507807e-02    0.054244  4.708126e-01  3.819002e-01   
  P0             9.781357e+02  978.811000  8.728157e+00  8.147721e+00   
   F             4.209897e+00    3.241531  2.452403e+00  1.912893e+00   
Stationshoehe    3.160000e+02  316.000000  0.000000e+00  0.000000e+00   
geoBreite        5.098290e+01   50.982900  2.131640e-14  1.421797e-14   
geoLaenge        1.096080e+01   10.960800  3.552734e-15  3.554491e-15   
hour_sin        -1.838212e-17    0.015084  7.071108e-01  7.079012e-01   
hour_cos        -5.543008e-17    0.038608  7.071108e-01  7.078157e-01   
month_sin       -4.903879e-03   -0.035284  7.057371e-01  7.181424e-01   
month_cos       -2.097550e-03    0.026735  7.084618e-01  6.949487e-01   
day_of_year_sin  9.426811e-06   -0.034772  7.069173

In [None]:
#get the components stuff for explainebilty jeder component ist ein Mean
#PCA für visulastion (funtkioniert nicht diskutieren)
#Regresision within sample as possiblity (funktioniert gut)
#maybe classification but not sure 
#should i try to generate hole day data (yes)
#should i look at time sieres evaluation or just show my prelimenary results (prelimenary is fine)
#Station conditional generation for central station one random on the edge and 1 or 2 in between (batches)
#Gaussian Process as Timesieres Generalistation of GMM (auch noch Ankucken)
#Mixture Factor Analyzer Model (MFA) aufjedenfall ankucken
#Eher fokus auf auswertung oder noch Daussian Processes
#LDA first

#Lef the station out Important
#Regolissian for Ridge

#Power Transformation für Regen und Wind 

#Gaussian Process und Hiden Markov models nutzen wobei anzahl zustände gleich anzahl optimaler cluster aus erster analyse ist