In [94]:
[col for col in df.columns if "IST" in col or "SOLL" in col or "dauer" in col.lower()]

['Auftragsende_SOLL',
 'AFO_Start_SOLL',
 'AFO_Ende_SOLL',
 'AFO_Start_IST',
 'AFO_Ende_IST',
 'Auftragsende_IST']

In [95]:
import sys, os
sys.path.append(os.path.abspath(".."))

In [96]:
import os
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from lightgbm import LGBMRegressor

In [97]:
df = pd.read_csv("../data/processed/data_cleaned_3.csv")
print(df.shape)
df.head()

(1393700, 15)


Unnamed: 0,AuftragsID,BauteilID,Bauteilbezeichnung,Auftragseingang,Priorit√§t,Auftragsende_SOLL,Arbeitsschritt,Arbeitsschrittbezeichnung,AFO_Start_SOLL,AFO_Ende_SOLL,AFO_Start_IST,AFO_Ende_IST,MaschinenID,Maschinenbezeichnung,Auftragsende_IST
0,1,1,Steuerventilmodul,2013-10-29,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,2014-01-01 07:00:00,2014-01-01 07:01:00,,,2014-05-07 08:39:00
1,2,1,Steuerventilmodul,2013-08-16,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,2014-01-01 07:00:00,2014-01-01 07:01:00,,,2014-05-07 09:04:00
2,3,1,Steuerventilmodul,2013-08-05,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,2014-01-01 07:00:00,2014-01-01 07:01:00,,,2014-05-07 11:25:00
3,4,1,Steuerventilmodul,2013-10-12,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,2014-01-01 07:00:00,2014-01-01 07:01:00,,,2014-05-07 09:19:00
4,5,1,Steuerventilmodul,2013-10-03,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,2014-01-01 07:00:00,2014-01-01 07:01:00,,,2014-05-07 09:07:00


In [98]:
def aggregate_orders(df):

    agg = df.groupby("AuftragsID").agg({
        "Auftragseingang": "min",
        "BauteilID": "first",
        "Bauteilbezeichnung": "first",
        "Priorit√§t": "first",
        "Arbeitsschritt": ["nunique", "max"],
        "MaschinenID": "first",
        "Maschinenbezeichnung": "first",
        "Auftragsende_IST": "max"   # TARGET
    })

    agg.columns = [
        "Auftragseingang",
        "BauteilID",
        "Bauteilbezeichnung",
        "Priorit√§t",
        "Anzahl_Arbeitsschritte",
        "Max_Arbeitsschritt",
        "MaschinenID",
        "Maschinenbezeichnung",
        "Auftragsende_IST"
    ]

    return agg.reset_index()

In [99]:
df_orders = aggregate_orders(df)
df_orders.head()

Unnamed: 0,AuftragsID,Auftragseingang,BauteilID,Bauteilbezeichnung,Priorit√§t,Anzahl_Arbeitsschritte,Max_Arbeitsschritt,MaschinenID,Maschinenbezeichnung,Auftragsende_IST
0,1,2013-10-29,1,Steuerventilmodul,1,6,999,1.0,EWM,2014-05-07 08:39:00
1,2,2013-08-16,1,Steuerventilmodul,1,6,999,1.0,EWM,2014-05-07 09:04:00
2,3,2013-08-05,1,Steuerventilmodul,1,7,999,1.0,EWM,2014-05-07 11:25:00
3,4,2013-10-12,1,Steuerventilmodul,1,6,999,1.0,EWM,2014-05-07 09:19:00
4,5,2013-10-03,1,Steuerventilmodul,1,6,999,1.0,EWM,2014-05-07 09:07:00


In [100]:
df_orders["Auftragsende_IST"] = pd.to_datetime(df_orders["Auftragsende_IST"])
df_orders["Auftragseingang"] = pd.to_datetime(df_orders["Auftragseingang"])

# target in int64
y = df_orders["Auftragsende_IST"].astype("int64")

drop_cols = ["Auftragsende_IST", "AuftragsID"]
X = df_orders.drop(columns=drop_cols)

# remove datetime except start
for col in X.select_dtypes(include=["datetime64[ns]"]):
    if col != "Auftragseingang":
        X = X.drop(columns=[col])

In [101]:
# ============================
# ZELLE 6 ‚Äì Train/Test Split
# ============================

# Auftragseingang sicher zu datetime parsen
data["Auftragseingang"] = pd.to_datetime(data["Auftragseingang"], errors="coerce")

# Target zu datetime ‚Üí int64
data["Auftragsende_IST"] = pd.to_datetime(data["Auftragsende_IST"], errors="coerce")
data = data.dropna(subset=["Auftragsende_IST"]).copy()

# Feature/Target trennen
X = data.drop(columns=["Auftragsende_IST"])
y = data["Auftragsende_IST"].astype("int64")   # datetime ‚Üí ns-int

# === FIX: Datum korrekt in int64 umwandeln ===
X["AE_ts"] = data["Auftragseingang"].view("int64")

# Alte datetime-Spalte entfernen
X = X.drop(columns=["Auftragseingang"])

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

print("Train:", X_train.shape, "Test:", X_test.shape)

  X["AE_ts"] = data["Auftragseingang"].view("int64")


Train: (1114960, 14) Test: (278740, 14)


In [102]:
# ============================
# ZELLE 7 ‚Äì Preprocessing + Model
# ============================

# Spalten bestimmen (von X, nicht von X_train!)
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print("Kategorial:", cat_cols)
print("Numerisch:", num_cols)

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), cat_cols),
        ("num", SimpleImputer(strategy="median"), num_cols)
    ]
)

model = LGBMRegressor(
    n_estimators=300,
    learning_rate=0.03,
    num_leaves=31,
    max_depth=-1,
    min_child_samples=40,
    subsample=0.9,
    colsample_bytree=0.9,
    n_jobs=-1,
    random_state=42
)

pipe = Pipeline([
    ("prep", preprocessor),
    ("model", model)
])

print("üîµ LightGBM Training startet...")
pipe.fit(X_train, y_train)

preds = pipe.predict(X_test)
print("Training fertig!")

Kategorial: ['Bauteilbezeichnung', 'Auftragsende_SOLL', 'Arbeitsschrittbezeichnung', 'AFO_Start_SOLL', 'AFO_Ende_SOLL', 'AFO_Start_IST', 'AFO_Ende_IST', 'Maschinenbezeichnung']
Numerisch: ['AuftragsID', 'BauteilID', 'Priorit√§t', 'Arbeitsschritt', 'MaschinenID', 'AE_ts']
üîµ LightGBM Training startet...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 8.347567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 60135
[LightGBM] [Info] Number of data points in the train set: 1114960, number of used features: 29804
[LightGBM] [Info] Start training from score 1552882093555626240.000000




Training fertig!


In [103]:
NS_PER_DAY = 24 * 60 * 60 * 1e9

mae = mean_absolute_error(y_test, preds) / NS_PER_DAY
rmse = np.sqrt(mean_squared_error(y_test, preds)) / NS_PER_DAY
r2 = r2_score(y_test, preds)

print("MAE (Tage):", mae)
print("RMSE (Tage):", rmse)
print("R¬≤:", r2)

MAE (Tage): 22.190289968136394
RMSE (Tage): 51.727691271554804
R¬≤: 0.9975433672361068


In [104]:
MODEL_PATH = "../models/lightgbm/NEWlgbm_fast_pipeline.pkl"

import joblib
joblib.dump(pipe, MODEL_PATH)

print("üì¶ Modell gespeichert unter:", MODEL_PATH)

üì¶ Modell gespeichert unter: ../models/lightgbm/NEWlgbm_fast_pipeline.pkl


In [105]:
print("\nüîç Pr√ºfe Datentyp-Probleme...")
for col in expected_cols:
    if col in df_eval.columns:
        print(f"{col}: {df_eval[col].dtype}")


üîç Pr√ºfe Datentyp-Probleme...
AuftragsID: int64
BauteilID: int64
Bauteilbezeichnung: object
Priorit√§t: int64
Auftragsende_SOLL: object
Arbeitsschritt: int64
Arbeitsschrittbezeichnung: object
AFO_Start_SOLL: object
AFO_Ende_SOLL: object
AFO_Start_IST: object
AFO_Ende_IST: float64
MaschinenID: object
Maschinenbezeichnung: object
AE_ts: object


In [106]:
print("üîé Feature-√úbersicht:")
print(" - Kategoriale Features:", len(cat_cols))
print(" - Numerische Features:", len(num_cols))
print(" - Gesamt:", len(cat_cols) + len(num_cols))

print("\nüìã Namen der Features:")
print("Numerische:", num_cols)
print("Kategoriale:", cat_cols)

üîé Feature-√úbersicht:
 - Kategoriale Features: 8
 - Numerische Features: 6
 - Gesamt: 14

üìã Namen der Features:
Numerische: ['AuftragsID', 'BauteilID', 'Priorit√§t', 'Arbeitsschritt', 'MaschinenID', 'AE_ts']
Kategoriale: ['Bauteilbezeichnung', 'Auftragsende_SOLL', 'Arbeitsschrittbezeichnung', 'AFO_Start_SOLL', 'AFO_Ende_SOLL', 'AFO_Start_IST', 'AFO_Ende_IST', 'Maschinenbezeichnung']


In [None]:
# ============================
# FINAL SUBMISSION PIPELINE
# ============================

import pandas as pd
import numpy as np
import joblib
from datetime import datetime

# -----------------------------
# Pfade
# -----------------------------
MODEL_PATH = "../models/lightgbm/NEWlgbm_fast_pipeline.pkl"
PUBLIC_PATH = "../data/raw/df_eval_public_2025-11-03.csv"
PRIVATE_PATH = "../data/raw/df_eval_private_2025-11-03.csv"
IDS_PATH = "../data/raw/df_IDs_for_eval_2025-11-03.csv"

OUTPUT_PATH = "MAIN_submission_lgbm.csv"


# -----------------------------
# Modell + Feature-Info laden
# -----------------------------
print("üì¶ Lade Modell...")
model = joblib.load(MODEL_PATH)

prep = model.named_steps["prep"]
expected_cols = prep.feature_names_in_.tolist()
print("üîç Modell erwartet Features:", len(expected_cols))


# -----------------------------
# Eval-Daten laden
# -----------------------------
print("üì• Lade Eval-Daten...")

df_public = pd.read_csv(PUBLIC_PATH)
df_private = pd.read_csv(PRIVATE_PATH)
df_ids = pd.read_csv(IDS_PATH)

df_eval = pd.concat([df_public, df_private], ignore_index=True)
print("Eval AFO-Level:", df_eval.shape)


# -----------------------------
# Reihenfolge √ºber IDs sicherstellen
# -----------------------------
df_eval = df_ids.merge(df_eval, on="AuftragsID", how="left")
print("Eval nach Merge:", df_eval.shape)


# -----------------------------
# 1) DATETIME ‚Üí INT64 TIMESTAMP
# -----------------------------
for col in df_eval.columns:
    if pd.api.types.is_datetime64_any_dtype(df_eval[col]):
        df_eval[col] = df_eval[col].astype("int64")
    # falls String-Daten, die ein Datum sind ‚Üí konvertieren
    elif df_eval[col].dtype == object:
        try:
            tmp = pd.to_datetime(df_eval[col], errors="coerce")
            if tmp.notna().sum() > 0:     # enth√§lt Datumswerte
                df_eval[col] = tmp.astype("int64")
        except:
            pass


# Falls Auftragseingang als datetime kam ‚Üí extra fix
if "Auftragseingang" in df_eval.columns:
    df_eval["Auftragseingang"] = pd.to_datetime(df_eval["Auftragseingang"], errors="coerce")
    df_eval["AE_ts"] = df_eval["Auftragseingang"].astype("int64")
    df_eval = df_eval.drop(columns=["Auftragseingang"], errors="ignore")


# -----------------------------
# 2) Fehlende Spalten erg√§nzen
# -----------------------------
for col in expected_cols:
    if col not in df_eval.columns:
        df_eval[col] = np.nan


# -----------------------------
# 3) ALLE NUMERISCHEN SPALTEN ‚Üí NUMERISCH
# -----------------------------
for col in expected_cols:
    # nur casten wenn n√∂tig
    if not pd.api.types.is_numeric_dtype(df_eval[col]):
        df_eval[col] = pd.to_numeric(df_eval[col], errors="coerce")


# -----------------------------
# 4) Finales Feature-Set
# -----------------------------
X_submit = df_eval[expected_cols]
print("üî¢ Finale Featureshape:", X_submit.shape)


# -----------------------------
# 5) Prediction
# -----------------------------
print("üîÆ Starte Prediction...")

preds_ns = model.predict(X_submit)  

preds_dates = pd.to_datetime(
    np.round(preds_ns).astype("int64"),
    errors="coerce"
).dt.strftime("%Y-%m-%d")


# -----------------------------
# 6) Finale Submission erstellen
# -----------------------------
submission = pd.DataFrame({
    "ID": np.arange(1, len(df_ids) + 1),
    "AuftragsID": df_ids["AuftragsID"],
    "Auftragsende": preds_dates
})

submission.to_csv(OUTPUT_PATH, index=False)

print("‚úÖ Submission erstellt:", OUTPUT_PATH)
submission.head()

üì¶ Lade Modell...
üîç Modell erwartet Features: 14
üì• Lade Eval-Daten...
Eval AFO-Level: (8546, 13)
Eval nach Merge: (8546, 13)
üî¢ Finale Featureshape: (8546, 14)
üîÆ Starte Prediction...


  tmp = pd.to_datetime(df_eval[col], errors="coerce")
  tmp = pd.to_datetime(df_eval[col], errors="coerce")
  tmp = pd.to_datetime(df_eval[col], errors="coerce")
  tmp = pd.to_datetime(df_eval[col], errors="coerce")


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''