In [53]:
import os

import joblib
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


In [54]:
# Timestamp columns of the processed history file; parsed to datetime on load.
hist_date_cols = [
    "Auftragseingang",
    "Auftragsende_SOLL",
    "AFO_Start_SOLL",
    "AFO_Ende_SOLL",
    "AFO_Start_IST",
    "AFO_Ende_IST",
    "Auftragsende_IST",
]

# Processed history table. low_memory=False reads the file in one pass so
# column dtypes are inferred from the whole column rather than per chunk.
df_hist = pd.read_csv(
    "../data/processed/data_feature_zeit_3_gesamt.csv",
    parse_dates=hist_date_cols,
    low_memory=False,
)

# IDs of the orders used for the submission further down.
df_ids = pd.read_csv("../data/raw/df_IDs_for_eval_2025-11-03.csv")

for label, frame in (("History:", df_hist), ("Eval IDs:", df_ids)):
    print(label, frame.shape)

df_hist.head()

History: (1465664, 40)
Eval IDs: (8546, 1)


Unnamed: 0,AuftragsID,BauteilID,Bauteilbezeichnung,Auftragseingang,Priorit√§t,Auftragsende_SOLL,Arbeitsschritt,Arbeitsschrittbezeichnung,AFO_Start_SOLL,AFO_Ende_SOLL,...,Auftrags_Laufzeit_Abweichung_Tage,Wartezeit_vor_Beginn_Tage,Pufferzeit_geplant_Tage,AFO_Start_Wochentag_Num,AFO_Start_Stunde,AFO_Kalenderwoche,AFO_Jahr,AFO_Ende_Stunde,AFO_Schicht,Lieferabweichung_Stunden
0,1,1,Steuerventilmodul,2013-10-29,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.879861,64.291667,64.291667,2,7,1,2014,7,Fr√ºh,-4.516667
1,2,1,Steuerventilmodul,2013-08-16,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.897222,138.291667,138.291667,2,7,1,2014,7,Fr√ºh,-4.516667
2,3,1,Steuerventilmodul,2013-08-05,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.995139,149.291667,149.291667,2,7,1,2014,7,Fr√ºh,-4.516667
3,4,1,Steuerventilmodul,2013-10-12,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.907639,81.291667,81.291667,2,7,1,2014,7,Fr√ºh,-4.516667
4,5,1,Steuerventilmodul,2013-10-03,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.899306,90.291667,90.291667,2,7,1,2014,7,Fr√ºh,-4.516667


In [55]:
# Collapse the history to one row per AuftragsID.
# "first" is taken after sorting by AFO_Ende_IST within each order; min/max/sum
# summarise the timestamps, step numbers and actual durations of the order.
order_aggregations = {
    "BauteilID": "first",
    "Bauteilbezeichnung": "first",
    "Priorit√§t": "first",
    "Auftragseingang": "first",
    "Auftragsende_SOLL": "first",
    "Auftragsende_IST": "max",
    "Arbeitsschritt": "max",
    "AFO_Start_IST": "min",
    "AFO_Ende_IST": "max",
    "AFO_Dauer_IST_Stunde": "sum",
}

hist_sorted = df_hist.sort_values(["AuftragsID", "AFO_Ende_IST"])
df_orders = hist_sorted.groupby("AuftragsID").agg(order_aggregations).reset_index()

# Regression target: days between order intake and actual order end.
# Orders without an Auftragsende_IST get a NaN target.
seconds_per_day = 86400
order_lead_time = df_orders["Auftragsende_IST"] - df_orders["Auftragseingang"]
df_orders["target_days"] = order_lead_time.dt.total_seconds() / seconds_per_day

print(df_orders.shape)
df_orders.head()

(150368, 12)


Unnamed: 0,AuftragsID,BauteilID,Bauteilbezeichnung,Priorit√§t,Auftragseingang,Auftragsende_SOLL,Auftragsende_IST,Arbeitsschritt,AFO_Start_IST,AFO_Ende_IST,AFO_Dauer_IST_Stunde,target_days
0,1,1,Steuerventilmodul,1,2013-10-29,2014-01-01 11:32:00,2014-05-07 08:39:00,999,2014-01-01 07:00:00,2014-05-07 08:39:00,4.89,190.360417
1,2,1,Steuerventilmodul,1,2013-08-16,2014-01-01 11:32:00,2014-05-07 09:04:00,999,2014-01-01 07:00:00,2014-05-07 09:04:00,5.01,264.377778
2,3,1,Steuerventilmodul,1,2013-08-05,2014-01-01 11:32:00,2014-05-07 11:25:00,999,2014-01-01 07:00:00,2014-05-07 11:25:00,7.82,275.475694
3,4,1,Steuerventilmodul,1,2013-10-12,2014-01-01 11:32:00,2014-05-07 09:19:00,999,2014-01-01 07:00:00,2014-05-07 09:19:00,5.12,207.388194
4,5,1,Steuerventilmodul,1,2013-10-03,2014-01-01 11:32:00,2014-05-07 09:07:00,999,2014-01-01 07:00:00,2014-05-07 09:07:00,5.41,216.379861


In [56]:
# Only orders with a known target can be used for fitting.
has_target = df_orders["target_days"].notna()
df_train = df_orders[has_target]

# Auftragsende_IST is the end point of the target, so it is removed from the
# features together with the target itself.
y = df_train["target_days"]
X = df_train.drop(columns=["target_days", "Auftragsende_IST"])

# Columns are routed to the preprocessing branches purely by dtype.
# NOTE(review): the datetime columns of X are neither "number" nor "object",
# so they end up in neither list.
# NOTE(review): AuftragsID is numeric and is therefore used as a feature;
# confirm that this is intended.
# NOTE(review): Arbeitsschritt (max) and AFO_Dauer_IST_Stunde (sum) are
# aggregated over the recorded history; presumably they are not comparable
# between finished and unfinished orders - TODO confirm.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(include="object").columns

num_cols, cat_cols

(Index(['AuftragsID', 'BauteilID', 'Priorit√§t', 'Arbeitsschritt',
        'AFO_Dauer_IST_Stunde'],
       dtype='object'),
 Index(['Bauteilbezeichnung'], dtype='object'))

In [57]:
# Numeric branch: fill gaps with the column median.
numeric_prep = SimpleImputer(strategy="median")

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_prep = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# No `remainder` is passed, so columns listed in neither num_cols nor cat_cols
# are not forwarded to the model.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_prep, num_cols),
        ("cat", categorical_prep, cat_cols),
    ]
)

# 75 % / 25 % random hold-out split with a fixed seed.
# NOTE(review): the orders are time-stamped; a random split may be optimistic
# compared with a chronological one - TODO confirm the evaluation protocol.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

X_train.shape, X_test.shape

((106366, 10), (35456, 10))

In [58]:
# Base estimator; the seed keeps the forest reproducible.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

# Candidate values per random-forest hyperparameter. The "model__" prefix
# added below routes each one to the "model" step of the pipeline.
rf_candidates = {
    "n_estimators": [80, 120, 180],
    "max_depth": [8, 12, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", 0.5],
}
param_dist = {f"model__{name}": values for name, values in rf_candidates.items()}

# Preprocessing and model in one estimator, so the imputers and the encoder
# are re-fitted inside every CV fold.
pipe = Pipeline(steps=[("prep", preprocess), ("model", rf)])

# 8 randomly drawn configurations, 2-fold CV each, scored by negative MAE.
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=8,
    cv=2,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

print("üîµ Starte RandomizedSearchCV ‚Ä¶")
search.fit(X_train, y_train)
print("‚úÖ Hyperparameter Suche fertig!")

üîµ Starte RandomizedSearchCV ‚Ä¶
Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV] END model__max_depth=12, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=120; total time=   4.2s
[CV] END model__max_depth=12, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=120; total time=   4.4s
[CV] END model__max_depth=None, model__max_features=0.5, model__min_samples_leaf=4, model__min_samples_split=5, model__n_estimators=180; total time=   6.8s
[CV] END model__max_depth=12, model__max_features=0.5, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=120; total time=   6.0s
[CV] END model__max_depth=12, model__max_features=0.5, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=120; total time=   6.1s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimator

In [59]:
# Report the winning configuration of the search.
print("BEST PARAMS:", search.best_params_)

# Pipeline with the best parameters, as exposed by the finished search.
best_model = search.best_estimator_

# Hold-out error in the unit of the target (days).
preds = best_model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=preds)

print("üìâ TEST MAE:", mae)

BEST PARAMS: {'model__n_estimators': 120, 'model__min_samples_split': 5, 'model__min_samples_leaf': 2, 'model__max_features': 0.5, 'model__max_depth': 12}
üìâ TEST MAE: 52.06895290141735


In [60]:
# Persist the tuned pipeline (preprocessing + random forest) so it can be
# reloaded later without repeating the hyperparameter search.
model_dir = "../models/tree/pipeline"
os.makedirs(model_dir, exist_ok=True)  # create the target folder if missing

model_path = os.path.join(model_dir, "best_random_forest_pipeline.pkl")
# joblib writes a pickle: only load this file from a trusted location.
joblib.dump(best_model, model_path)

print("üíæ Modell gespeichert unter:", model_path)

üíæ Modell gespeichert unter: ../models/tree/pipeline/best_random_forest_pipeline.pkl


### submission

In [61]:
# Attach the per-order aggregates to every evaluation ID. The left join keeps
# all rows of df_ids, including IDs that have no match in df_orders.
df_submit_base = pd.merge(df_ids, df_orders, how="left", on="AuftragsID")
print(df_submit_base.shape)
df_submit_base.head()

(8546, 12)


Unnamed: 0,AuftragsID,BauteilID,Bauteilbezeichnung,Priorit√§t,Auftragseingang,Auftragsende_SOLL,Auftragsende_IST,Arbeitsschritt,AFO_Start_IST,AFO_Ende_IST,AFO_Dauer_IST_Stunde,target_days
0,144502,1,Steuerventilmodul,1,2023-09-10,2023-10-02 11:52:00,NaT,100,2023-10-02 07:00:00,2023-10-04 08:40:00,13.25,
1,147886,1,Steuerventilmodul,1,2023-12-06,2023-12-27 12:22:00,NaT,100,2023-12-27 07:00:00,2023-12-27 11:52:00,3.62,
2,135024,1,Steuerventilmodul,1,2023-02-22,2023-03-01 12:42:00,NaT,100,2023-03-01 07:00:00,2023-11-17 14:49:00,8.9,
3,135000,2,Schwenkzylinder,1,2022-10-30,2023-02-28 14:02:00,NaT,100,2023-02-28 07:00:00,2023-11-21 09:16:00,14.77,
4,146714,2,Schwenkzylinder,1,2023-08-23,2023-12-01 13:12:00,NaT,100,2023-12-01 07:00:00,2023-12-01 11:58:00,4.97,


In [62]:
# Feature matrix for the evaluation orders, restricted to the training columns.
X_submit = df_submit_base[X.columns]

# Evaluation IDs that found no match in the left merge have no intake date.
# The pipeline would still impute and predict for them, but the predicted date
# would be NaN in the submission, so fail loudly instead.
missing_intake = df_submit_base["Auftragseingang"].isna()
if missing_intake.any():
    raise ValueError(
        f"{int(missing_intake.sum())} evaluation orders have no Auftragseingang "
        "(no match in df_orders); cannot derive a predicted end date for them."
    )

# Predicted lead time in days, counted from order intake.
pred_days_submit = best_model.predict(X_submit)

predicted_end = df_submit_base["Auftragseingang"] + pd.to_timedelta(
    pred_days_submit, unit="D"
)

# An order cannot be finished before its last recorded operation ended, so the
# latest observed AFO_Ende_IST is used as a lower bound where one exists.
# NOTE(review): assumes the order end is never earlier than a recorded
# operation end - confirm against the data definition.
last_observed_end = df_submit_base["AFO_Ende_IST"]
predicted_end = predicted_end.where(
    last_observed_end.isna() | (predicted_end >= last_observed_end),
    last_observed_end,
)

# The submission carries the date only (no time of day).
df_submit_base["Auftragsende_PREDICTED"] = predicted_end.dt.strftime("%Y-%m-%d")

In [63]:
# Running 1-based row ID in the current row order of df_submit_base.
df_submit_base["ID"] = np.arange(1, len(df_submit_base) + 1)

df_submission = df_submit_base[["ID", "AuftragsID", "Auftragsende_PREDICTED"]]

# Do not write a submission with missing predicted dates.
n_missing_pred = int(df_submission["Auftragsende_PREDICTED"].isna().sum())
if n_missing_pred:
    raise ValueError(
        f"{n_missing_pred} rows have no Auftragsende_PREDICTED; "
        "refusing to write an incomplete submission."
    )

# Derive the folder from the file path so the two cannot drift apart.
sub_path = "../submissions/submission_rf_tuned.csv"
os.makedirs(os.path.dirname(sub_path), exist_ok=True)

df_submission.to_csv(sub_path, index=False)

print("‚úÖ Submission gespeichert:", sub_path)
df_submission.head()

‚úÖ Submission gespeichert: ../submissions/submission_rf_tuned.csv


Unnamed: 0,ID,AuftragsID,Auftragsende_PREDICTED
0,1,144502,2024-08-18
1,2,147886,2024-11-03
2,3,135024,2024-03-03
3,4,135000,2023-11-21
4,5,146714,2024-08-18
