In [1]:
import sys
import os

# Einen Ordner hochgehen (du bist in /notebooks, Modell liegt eine Ebene dar√ºber)
os.chdir("..")

# src/ zum Python-Pfad hinzuf√ºgen
sys.path.append(os.getcwd())

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
import itertools
import os
import joblib

In [None]:
# ============================================
# üî• FULL WORKING LIGHTGBM TUNING PIPELINE üî•
# Kein Preprocessor n√∂tig ‚Äì alles hier drin.
# ============================================

from src.data.load_data import load_data
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor

import pandas as pd
import numpy as np
import itertools
import joblib
import os

# =============================
# Konstanten
# =============================
TARGET_END = "Auftragsende_IST"
START_COL = "Auftragseingang"
DATE_COLS = [
    "Auftragseingang",
    "Auftragsende_SOLL",
    "AFO_Start_SOLL",
    "AFO_Ende_SOLL",
    "AFO_Start_IST",
    "AFO_Ende_IST",
]

# =============================
# 1) Daten laden
# =============================
data = load_data()

# Datumsfelder ‚Üí Datetime
for col in DATE_COLS + [TARGET_END]:
    data[col] = pd.to_datetime(data[col], errors="coerce")

# g√ºltige rows filtern
mask = (~data[TARGET_END].isna()) & (~data[START_COL].isna())
data = data[mask].copy()

start_dt = data[START_COL]

# Dauer in Tagen als Target
y = (data[TARGET_END] - start_dt).dt.total_seconds() / 86400.0
y = y.astype("float32")

# =============================
# 2) Date-Features erstellen
# =============================
for col in DATE_COLS:
    data[f"{col}_dow"] = data[col].dt.dayofweek
    data[f"{col}_hour"] = data[col].dt.hour
    data[f"{col}_day"]  = data[col].dt.day
    data[f"{col}_month"] = data[col].dt.month
    data[f"{col}_week"] = data[col].dt.isocalendar().week.astype(int)

# Originaldatum remove
data = data.drop(columns=DATE_COLS + [TARGET_END])

# IDs entfernen
for col in ["AuftragsID", "BauteilID", "MaschinenID"]:
    if col in data.columns:
        data = data.drop(columns=[col])

# =============================
# 3) Preprocessor definieren
# =============================
categorical = data.select_dtypes(include=["object"]).columns.tolist()
numeric     = data.select_dtypes(include=[np.number]).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]), categorical),

        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
        ]), numeric),
    ]
)

# =============================
# 4) Train/Test Split
# =============================
X_train, X_test, y_train, y_test = train_test_split(
    data, y, test_size=0.35, random_state=32
)

# =============================
# 5) GRID-TUNING
# =============================
param_grid = {
    "n_estimators": [350, 470],         # 2 Werte
    "num_leaves": [96, 128, 256],       # 3 Werte
    "min_child_samples": [30, 60],      # 2 Werte
    "subsample": [0.85, 0.9],           # 2 Werte
    "colsample_bytree": [0.85, 0.9],    # 2 Werte
}

combos = list(itertools.product(
    param_grid["n_estimators"],
    param_grid["num_leaves"],
    param_grid["min_child_samples"],
    param_grid["subsample"],
    param_grid["colsample_bytree"],
))

print(f"\nüîç Starte Tuning √ºber {len(combos)} Kombinationen...\n")

best_mae = 99999
best_model = None
best_params = None

for i, (N, L, M, SUB, COL) in enumerate(combos, 1):

    print(f"üîÅ Test {i}/{len(combos)}")

    model = LGBMRegressor(
        n_estimators=N,
        learning_rate=0.03,
        num_leaves=L,
        min_child_samples=M,
        subsample=SUB,
        colsample_bytree=COL,
        n_jobs=-1,
        random_state=42
    )

    pipe_tmp = Pipeline([
        ("prep", preprocessor),
        ("model", model)
    ])

    pipe_tmp.fit(X_train, y_train)
    preds = pipe_tmp.predict(X_test)
    mae = mean_absolute_error(y_test, preds)

    print(f"‚û°Ô∏è MAE {mae:.3f} | n={N}, leaves={L}, min_child={M}, subs={SUB}, col={COL}")

    if mae < best_mae:
        best_mae = mae
        best_params = (N, L, M, SUB, COL)
        best_model = pipe_tmp

# =============================
# 6) Ergebnis
# =============================
print("\n==============================")
print("üèÜ BESTE PARAMETER GEFUNDEN")
print("==============================")
print("MAE:", best_mae)
print("n_estimators:", best_params[0])
print("num_leaves:", best_params[1])
print("min_child_samples:", best_params[2])
print("subsample:", best_params[3])
print("colsample_bytree:", best_params[4])

# Speichern
os.makedirs("models/lightgbm/pipeline", exist_ok=True)
joblib.dump(best_model, "models/lightgbm/pipeline/best_lgbm_pipeline.pkl")

print("\nüì¶ Modells gespeichert!")

Spalten im DataFrame:
['AuftragsID', 'BauteilID', 'Bauteilbezeichnung', 'Auftragseingang', 'Priorit√§t', 'Auftragsende_SOLL', 'Arbeitsschritt', 'Arbeitsschrittbezeichnung', 'AFO_Start_SOLL', 'AFO_Ende_SOLL', 'AFO_Start_IST', 'AFO_Ende_IST', 'MaschinenID', 'Maschinenbezeichnung', 'Auftragsende_IST']

üîç Starte Tuning √ºber 192 Kombinationen...

üîÅ Test 1/192
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015056 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 734
[LightGBM] [Info] Number of data points in the train set: 914443, number of used features: 52
[LightGBM] [Info] Start training from score 265.735938




‚û°Ô∏è MAE 33.824 | n=350, leaves=64, min_child=20, subs=0.8, col=0.8
üîÅ Test 2/192
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013813 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 734
[LightGBM] [Info] Number of data points in the train set: 914443, number of used features: 52
[LightGBM] [Info] Start training from score 265.735938




‚û°Ô∏è MAE 33.487 | n=350, leaves=64, min_child=20, subs=0.8, col=0.9
üîÅ Test 3/192
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013690 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 734
[LightGBM] [Info] Number of data points in the train set: 914443, number of used features: 52
[LightGBM] [Info] Start training from score 265.735938




‚û°Ô∏è MAE 33.824 | n=350, leaves=64, min_child=20, subs=0.9, col=0.8
üîÅ Test 4/192
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013941 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 734
[LightGBM] [Info] Number of data points in the train set: 914443, number of used features: 52
[LightGBM] [Info] Start training from score 265.735938




‚û°Ô∏è MAE 33.487 | n=350, leaves=64, min_child=20, subs=0.9, col=0.9
üîÅ Test 5/192
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015137 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 734
[LightGBM] [Info] Number of data points in the train set: 914443, number of used features: 52
[LightGBM] [Info] Start training from score 265.735938




‚û°Ô∏è MAE 33.878 | n=350, leaves=64, min_child=40, subs=0.8, col=0.8
üîÅ Test 6/192
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014318 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 734
[LightGBM] [Info] Number of data points in the train set: 914443, number of used features: 52
[LightGBM] [Info] Start training from score 265.735938




‚û°Ô∏è MAE 33.061 | n=350, leaves=64, min_child=40, subs=0.8, col=0.9
üîÅ Test 7/192
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014482 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 734
[LightGBM] [Info] Number of data points in the train set: 914443, number of used features: 52
[LightGBM] [Info] Start training from score 265.735938




‚û°Ô∏è MAE 33.878 | n=350, leaves=64, min_child=40, subs=0.9, col=0.8
üîÅ Test 8/192
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014784 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 734
[LightGBM] [Info] Number of data points in the train set: 914443, number of used features: 52
[LightGBM] [Info] Start training from score 265.735938




‚û°Ô∏è MAE 33.061 | n=350, leaves=64, min_child=40, subs=0.9, col=0.9
üîÅ Test 9/192
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014447 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 734
[LightGBM] [Info] Number of data points in the train set: 914443, number of used features: 52
[LightGBM] [Info] Start training from score 265.735938




‚û°Ô∏è MAE 33.833 | n=350, leaves=64, min_child=60, subs=0.8, col=0.8
üîÅ Test 10/192
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015006 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 734
[LightGBM] [Info] Number of data points in the train set: 914443, number of used features: 52
[LightGBM] [Info] Start training from score 265.735938


KeyboardInterrupt: 