In [22]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from lightgbm import LGBMRegressor
import joblib
import os

In [23]:
df_hist = pd.read_csv(
    "../data/processed/data_feature_zeit_3_gesamt.csv",
    parse_dates=[
        "Auftragseingang","Auftragsende_SOLL","AFO_Start_SOLL","AFO_Ende_SOLL",
        "AFO_Start_IST","AFO_Ende_IST","Auftragsende_IST"
    ],
    low_memory=False
)

df_ids = pd.read_csv("../data/raw/df_IDs_for_eval_2025-11-03.csv")
df_hist.head()

Unnamed: 0,AuftragsID,BauteilID,Bauteilbezeichnung,Auftragseingang,Priorit√§t,Auftragsende_SOLL,Arbeitsschritt,Arbeitsschrittbezeichnung,AFO_Start_SOLL,AFO_Ende_SOLL,...,Auftrags_Laufzeit_Abweichung_Tage,Wartezeit_vor_Beginn_Tage,Pufferzeit_geplant_Tage,AFO_Start_Wochentag_Num,AFO_Start_Stunde,AFO_Kalenderwoche,AFO_Jahr,AFO_Ende_Stunde,AFO_Schicht,Lieferabweichung_Stunden
0,1,1,Steuerventilmodul,2013-10-29,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.879861,64.291667,64.291667,2,7,1,2014,7,Fr√ºh,-4.516667
1,2,1,Steuerventilmodul,2013-08-16,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.897222,138.291667,138.291667,2,7,1,2014,7,Fr√ºh,-4.516667
2,3,1,Steuerventilmodul,2013-08-05,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.995139,149.291667,149.291667,2,7,1,2014,7,Fr√ºh,-4.516667
3,4,1,Steuerventilmodul,2013-10-12,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.907639,81.291667,81.291667,2,7,1,2014,7,Fr√ºh,-4.516667
4,5,1,Steuerventilmodul,2013-10-03,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.899306,90.291667,90.291667,2,7,1,2014,7,Fr√ºh,-4.516667


In [24]:
df_orders = (
    df_hist.sort_values(["AuftragsID", "AFO_Ende_IST"])
    .groupby("AuftragsID")
    .agg({
        "BauteilID": "first",
        "Bauteilbezeichnung": "first",
        "Priorit√§t": "first",
        "Auftragseingang": "first",
        "Auftragsende_SOLL": "first",
        "Auftragsende_IST": "max",
        "Arbeitsschritt": "max",
        "AFO_Start_IST": "min",
        "AFO_Ende_IST": "max",
        "AFO_Dauer_IST_Stunde": "sum"
    })
    .reset_index()
)

In [25]:
df_orders["target_days"] = (
    df_orders["Auftragsende_IST"] - df_orders["Auftragseingang"]
).dt.total_seconds() / 86400

df_orders.head()

Unnamed: 0,AuftragsID,BauteilID,Bauteilbezeichnung,Priorit√§t,Auftragseingang,Auftragsende_SOLL,Auftragsende_IST,Arbeitsschritt,AFO_Start_IST,AFO_Ende_IST,AFO_Dauer_IST_Stunde,target_days
0,1,1,Steuerventilmodul,1,2013-10-29,2014-01-01 11:32:00,2014-05-07 08:39:00,999,2014-01-01 07:00:00,2014-05-07 08:39:00,4.89,190.360417
1,2,1,Steuerventilmodul,1,2013-08-16,2014-01-01 11:32:00,2014-05-07 09:04:00,999,2014-01-01 07:00:00,2014-05-07 09:04:00,5.01,264.377778
2,3,1,Steuerventilmodul,1,2013-08-05,2014-01-01 11:32:00,2014-05-07 11:25:00,999,2014-01-01 07:00:00,2014-05-07 11:25:00,7.82,275.475694
3,4,1,Steuerventilmodul,1,2013-10-12,2014-01-01 11:32:00,2014-05-07 09:19:00,999,2014-01-01 07:00:00,2014-05-07 09:19:00,5.12,207.388194
4,5,1,Steuerventilmodul,1,2013-10-03,2014-01-01 11:32:00,2014-05-07 09:07:00,999,2014-01-01 07:00:00,2014-05-07 09:07:00,5.41,216.379861


In [26]:
df_train = df_orders.dropna(subset=["target_days"])

X = df_train.drop(columns=["target_days", "Auftragsende_IST"])
y = df_train["target_days"]

In [27]:
num_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

print("Numerisch:", num_cols)
print("Kategorisch:", cat_cols)

Numerisch: ['AuftragsID', 'BauteilID', 'Priorit√§t', 'Arbeitsschritt', 'AFO_Dauer_IST_Stunde']
Kategorisch: ['Bauteilbezeichnung']


In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### model train

In [29]:
# =====================================
# 1) Datetime ‚Üí float
# =====================================
dt_cols = X.columns[X.dtypes == "datetime64[ns]"].tolist()
print("Datetime-Spalten:", dt_cols)

for col in dt_cols:
    X[col] = pd.to_datetime(X[col], errors="coerce").astype("int64") / 86400e9

print("√úbrig datetime:", X.dtypes[X.dtypes=="datetime64[ns]"])

# =====================================
# 2) Object ‚Üí Category (f√ºr LightGBM)
# =====================================
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
print("Object-Spalten:", cat_cols)

for col in cat_cols:
    X[col] = X[col].astype("category")

print("√úbrig object:", X.dtypes[X.dtypes=="object"])

# =====================================
# 3) Train/Test Split NACH DER KONVERTIERUNG
# =====================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# =====================================
# 4) LightGBM trainieren
# =====================================
lgbm = LGBMRegressor(
    n_estimators=400,
    learning_rate=0.05,
    num_leaves=31,
    random_state=42
)

lgbm.fit(
    X_train,
    y_train,
    categorical_feature=cat_cols
)

preds = lgbm.predict(X_test)
mae = mean_absolute_error(y_test, preds)

print("üìâ MAE:", mae)

Datetime-Spalten: ['Auftragseingang', 'Auftragsende_SOLL', 'AFO_Start_IST', 'AFO_Ende_IST']
√úbrig datetime: Series([], dtype: object)
Object-Spalten: ['Bauteilbezeichnung']
√úbrig object: Series([], dtype: object)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000621 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1542
[LightGBM] [Info] Number of data points in the train set: 113457, number of used features: 9
[LightGBM] [Info] Start training from score 262.910876
üìâ MAE: 8.980093842538821


In [30]:
preds = lgbm.predict(X_test)

mae = mean_absolute_error(y_test, preds)
print("üìâ LightGBM BASELINE ‚Äî MAE:", mae)

üìâ LightGBM BASELINE ‚Äî MAE: 8.980093842538821


In [31]:
os.makedirs("../models", exist_ok=True)

MODEL_PATH = "../models/lightgbm_baseline.pkl"
joblib.dump((lgbm, num_cols, cat_cols), MODEL_PATH)

print("üíæ Modell gespeichert unter:", MODEL_PATH)

üíæ Modell gespeichert unter: ../models/lightgbm_baseline.pkl


### submission

In [32]:
# Evaluation-Daten laden
df_public = pd.read_csv("../data/raw/df_eval_public_2025-11-03.csv")
df_private = pd.read_csv("../data/raw/df_eval_private_2025-11-03.csv")

df_eval = pd.concat([df_public, df_private], ignore_index=True)
df_eval.shape

(8546, 13)

In [33]:
df_submit = df_ids.merge(df_orders, on="AuftragsID", how="left")

In [46]:
# 0. Datetime zu numeric f√ºr df_submit
for col in df_submit.select_dtypes(include="datetime"):
    df_submit[col + "_num"] = df_submit[col].view("int64") // 86400000000000


# 1. fehlende Spalten korrekt erg√§nzen (NICHT die Datetime-Spalten √ºberschreiben!)
for col in X.columns:
    if col not in df_submit.columns:
        print(f"‚ö†Ô∏è Erg√§nze fehlende Spalte: {col}")

        # Falls im Training eine Datumsspalte ‚Üí als numeric erg√§nzen
        if col.endswith("_num"):
            df_submit[col] = 0   # numeric OK

        # Falls die Original-Datetime-Spalten fehlen ‚Üí wiederherstellen NICHT m√∂glich!
        elif col in ["Auftragseingang", "Auftragsende_SOLL", 
                     "AFO_Start_IST", "AFO_Ende_IST"]:
            raise ValueError(f"‚ùå FATAL: Datetime-Spalte '{col}' fehlt in Eval-Daten!")

        # Kategorisch ‚Üí NaN
        elif col in cat_cols:
            df_submit[col] = pd.Series([np.nan] * len(df_submit), dtype="category")

        # Numerisch ‚Üí NaN
        else:
            df_submit[col] = np.nan


# 2. Kategorien synchronisieren
for col in cat_cols:
    X[col] = X[col].astype("category")
    df_submit[col] = df_submit[col].astype("category")
    df_submit[col] = df_submit[col].cat.set_categories(X[col].cat.categories)


# 3. Predict
pred_days = lgbm.predict(
    df_submit[X.columns],
    validate_features=False
)


# 4. Datum berechnen
df_submit["Auftragsende_PREDICTED"] = (
    pd.to_datetime(df_submit["Auftragseingang"], errors="coerce")
    + pd.to_timedelta(pred_days, unit="D")
).dt.strftime("%Y-%m-%d")

In [47]:
df_submit["ID"] = np.arange(1, len(df_submit) + 1)

submission = df_submit[["ID", "AuftragsID", "Auftragsende_PREDICTED"]]

In [48]:
os.makedirs("../submissions", exist_ok=True)

submission_path = "../submissions/lightgbm_basic.csv"
submission.to_csv(submission_path, index=False)

print("‚úÖ Submission gespeichert:", submission_path)
submission.head()

‚úÖ Submission gespeichert: ../submissions/lightgbm_basic.csv


Unnamed: 0,ID,AuftragsID,Auftragsende_PREDICTED
0,1,144502,1971-03-01
1,2,147886,1971-02-06
2,3,135024,1971-02-18
3,4,135000,1971-02-25
4,5,146714,1971-02-08
