In [1]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from lightgbm import LGBMRegressor

# Paths (vom notebooks-Ordner aus)
HIST_PATH = "../data/processed/data_feature_zeit_3_gesamt.csv"
IDS_PATH = "../data/raw/df_IDs_for_eval_2025-11-03.csv"
PUBLIC_PATH = "../data/raw/df_eval_public_2025-11-03.csv"
PRIVATE_PATH = "../data/raw/df_eval_private_2025-11-03.csv"

MODEL_DIR = "../models/lightgbm"
SUBMISSION_DIR = "../submissions"
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(SUBMISSION_DIR, exist_ok=True)

In [2]:
date_cols = [
    "Auftragseingang",
    "Auftragsende_SOLL",
    "AFO_Start_SOLL",
    "AFO_Ende_SOLL",
    "AFO_Start_IST",
    "AFO_Ende_IST",
    "Auftragsende_IST",
]

df_hist = pd.read_csv(
    HIST_PATH,
    parse_dates=date_cols,
    low_memory=False
)

df_ids = pd.read_csv(IDS_PATH)

print("df_hist shape:", df_hist.shape)
df_hist.head()

df_hist shape: (1465664, 40)


Unnamed: 0,AuftragsID,BauteilID,Bauteilbezeichnung,Auftragseingang,Priorit√§t,Auftragsende_SOLL,Arbeitsschritt,Arbeitsschrittbezeichnung,AFO_Start_SOLL,AFO_Ende_SOLL,...,Auftrags_Laufzeit_Abweichung_Tage,Wartezeit_vor_Beginn_Tage,Pufferzeit_geplant_Tage,AFO_Start_Wochentag_Num,AFO_Start_Stunde,AFO_Kalenderwoche,AFO_Jahr,AFO_Ende_Stunde,AFO_Schicht,Lieferabweichung_Stunden
0,1,1,Steuerventilmodul,2013-10-29,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.879861,64.291667,64.291667,2,7,1,2014,7,Fr√ºh,-4.516667
1,2,1,Steuerventilmodul,2013-08-16,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.897222,138.291667,138.291667,2,7,1,2014,7,Fr√ºh,-4.516667
2,3,1,Steuerventilmodul,2013-08-05,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.995139,149.291667,149.291667,2,7,1,2014,7,Fr√ºh,-4.516667
3,4,1,Steuerventilmodul,2013-10-12,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.907639,81.291667,81.291667,2,7,1,2014,7,Fr√ºh,-4.516667
4,5,1,Steuerventilmodul,2013-10-03,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.899306,90.291667,90.291667,2,7,1,2014,7,Fr√ºh,-4.516667


In [3]:
# Aggregation auf Auftrags-Ebene (wie zuvor bei RF)
df_orders = (
    df_hist.sort_values(["AuftragsID", "AFO_Ende_IST"])
    .groupby("AuftragsID")
    .agg({
        "BauteilID": "first",
        "Bauteilbezeichnung": "first",
        "Priorit√§t": "first",
        "Auftragseingang": "first",
        "Auftragsende_SOLL": "first",
        "Auftragsende_IST": "max",
        "Arbeitsschritt": "max",
        "AFO_Start_IST": "min",
        "AFO_Ende_IST": "max",
        "AFO_Dauer_IST_Stunde": "sum"
    })
    .reset_index()
)

print("df_orders shape:", df_orders.shape)
df_orders.head()

df_orders shape: (150368, 11)


Unnamed: 0,AuftragsID,BauteilID,Bauteilbezeichnung,Priorit√§t,Auftragseingang,Auftragsende_SOLL,Auftragsende_IST,Arbeitsschritt,AFO_Start_IST,AFO_Ende_IST,AFO_Dauer_IST_Stunde
0,1,1,Steuerventilmodul,1,2013-10-29,2014-01-01 11:32:00,2014-05-07 08:39:00,999,2014-01-01 07:00:00,2014-05-07 08:39:00,4.89
1,2,1,Steuerventilmodul,1,2013-08-16,2014-01-01 11:32:00,2014-05-07 09:04:00,999,2014-01-01 07:00:00,2014-05-07 09:04:00,5.01
2,3,1,Steuerventilmodul,1,2013-08-05,2014-01-01 11:32:00,2014-05-07 11:25:00,999,2014-01-01 07:00:00,2014-05-07 11:25:00,7.82
3,4,1,Steuerventilmodul,1,2013-10-12,2014-01-01 11:32:00,2014-05-07 09:19:00,999,2014-01-01 07:00:00,2014-05-07 09:19:00,5.12
4,5,1,Steuerventilmodul,1,2013-10-03,2014-01-01 11:32:00,2014-05-07 09:07:00,999,2014-01-01 07:00:00,2014-05-07 09:07:00,5.41


In [4]:
# Target: Dauer in Tagen
df_orders["target_days"] = (
    df_orders["Auftragsende_IST"] - df_orders["Auftragseingang"]
).dt.total_seconds() / 86400

# Nur abgeschlossene Auftr√§ge (Target vorhanden)
df_train = df_orders.dropna(subset=["target_days"]).copy()

print("Train-Auftr√§ge:", df_train.shape[0])
df_train[["AuftragsID", "target_days"]].head()

Train-Auftr√§ge: 141822


Unnamed: 0,AuftragsID,target_days
0,1,190.360417
1,2,264.377778
2,3,275.475694
3,4,207.388194
4,5,216.379861


In [5]:
# Hilfsfunktion: Datetime ‚Üí Tag als float (seit Epoch)
def add_datetime_numeric_features(df, cols):
    for c in cols:
        if c in df.columns:
            df[c + "_num"] = df[c].view("int64") / 86400e9  # Tage seit 1970
    return df

datetime_base_cols = [
    "Auftragseingang",
    "Auftragsende_SOLL",
    "AFO_Start_IST",
    "AFO_Ende_IST",
    "Auftragsende_IST",
]

df_train = add_datetime_numeric_features(df_train, datetime_base_cols)
df_orders = add_datetime_numeric_features(df_orders, datetime_base_cols)  # wichtig f√ºr Submission

df_train.head()

  df[c + "_num"] = df[c].view("int64") / 86400e9  # Tage seit 1970
  df[c + "_num"] = df[c].view("int64") / 86400e9  # Tage seit 1970
  df[c + "_num"] = df[c].view("int64") / 86400e9  # Tage seit 1970
  df[c + "_num"] = df[c].view("int64") / 86400e9  # Tage seit 1970
  df[c + "_num"] = df[c].view("int64") / 86400e9  # Tage seit 1970
  df[c + "_num"] = df[c].view("int64") / 86400e9  # Tage seit 1970
  df[c + "_num"] = df[c].view("int64") / 86400e9  # Tage seit 1970
  df[c + "_num"] = df[c].view("int64") / 86400e9  # Tage seit 1970
  df[c + "_num"] = df[c].view("int64") / 86400e9  # Tage seit 1970
  df[c + "_num"] = df[c].view("int64") / 86400e9  # Tage seit 1970


Unnamed: 0,AuftragsID,BauteilID,Bauteilbezeichnung,Priorit√§t,Auftragseingang,Auftragsende_SOLL,Auftragsende_IST,Arbeitsschritt,AFO_Start_IST,AFO_Ende_IST,AFO_Dauer_IST_Stunde,target_days,Auftragseingang_num,Auftragsende_SOLL_num,AFO_Start_IST_num,AFO_Ende_IST_num,Auftragsende_IST_num
0,1,1,Steuerventilmodul,1,2013-10-29,2014-01-01 11:32:00,2014-05-07 08:39:00,999,2014-01-01 07:00:00,2014-05-07 08:39:00,4.89,190.360417,16007.0,16071.480556,16071.291667,16197.360417,16197.360417
1,2,1,Steuerventilmodul,1,2013-08-16,2014-01-01 11:32:00,2014-05-07 09:04:00,999,2014-01-01 07:00:00,2014-05-07 09:04:00,5.01,264.377778,15933.0,16071.480556,16071.291667,16197.377778,16197.377778
2,3,1,Steuerventilmodul,1,2013-08-05,2014-01-01 11:32:00,2014-05-07 11:25:00,999,2014-01-01 07:00:00,2014-05-07 11:25:00,7.82,275.475694,15922.0,16071.480556,16071.291667,16197.475694,16197.475694
3,4,1,Steuerventilmodul,1,2013-10-12,2014-01-01 11:32:00,2014-05-07 09:19:00,999,2014-01-01 07:00:00,2014-05-07 09:19:00,5.12,207.388194,15990.0,16071.480556,16071.291667,16197.388194,16197.388194
4,5,1,Steuerventilmodul,1,2013-10-03,2014-01-01 11:32:00,2014-05-07 09:07:00,999,2014-01-01 07:00:00,2014-05-07 09:07:00,5.41,216.379861,15981.0,16071.480556,16071.291667,16197.379861,16197.379861


In [6]:
# Spalten, die wir NICHT als Features verwenden
drop_cols = [
    "target_days",
    "Auftragsende_IST",         # enth√§lt Zielinfo
    "Auftragsende_IST_num",     # leakt Ziel direkt ‚Äì also raus!
]

# Auch die reinen Datetime-Spalten droppen (wir haben ja *_num)
drop_cols += datetime_base_cols

X = df_train.drop(columns=drop_cols, errors="ignore")
y = df_train["target_days"]

print("Feature-Spalten:", X.columns.tolist())
print("X shape:", X.shape)
print("y shape:", y.shape)

Feature-Spalten: ['AuftragsID', 'BauteilID', 'Bauteilbezeichnung', 'Priorit√§t', 'Arbeitsschritt', 'AFO_Dauer_IST_Stunde', 'Auftragseingang_num', 'Auftragsende_SOLL_num', 'AFO_Start_IST_num', 'AFO_Ende_IST_num']
X shape: (141822, 10)
y shape: (141822,)


In [7]:
# Numerische und kategoriale Spalten bestimmen
num_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

print("Numerische Spalten:", num_cols)
print("Kategoriale Spalten:", cat_cols)

Numerische Spalten: ['AuftragsID', 'BauteilID', 'Priorit√§t', 'Arbeitsschritt', 'AFO_Dauer_IST_Stunde', 'Auftragseingang_num', 'Auftragsende_SOLL_num', 'AFO_Start_IST_num', 'AFO_Ende_IST_num']
Kategoriale Spalten: ['Bauteilbezeichnung']


In [8]:
# Train/Test Split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42
)

X_train.shape, X_val.shape

((106366, 10), (35456, 10))

In [10]:
# Preprocessing-Pipelines
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_cols),
        ("cat", cat_transformer, cat_cols),
    ]
)

# LightGBM-Basis
lgbm = LGBMRegressor(
    objective="regression",
    random_state=42,
    n_jobs=-1
)

pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", lgbm),
])

# Hyperparameter-Suchraum (moderat, damit dein M4 nicht stirbt)
param_dist = {
    "model__n_estimators": [200, 400, 600],
    "model__learning_rate": [0.05, 0.03, 0.01],
    "model__num_leaves": [31, 63, 127],
    "model__max_depth": [-1, 8, 12, 16],
    "model__min_child_samples": [20, 50, 100],
    "model__subsample": [0.7, 0.9, 1.0],
    "model__colsample_bytree": [0.7, 0.9, 1.0],
    "model__reg_lambda": [0.0, 0.1, 1.0],
    "model__reg_alpha": [0.0, 0.1, 1.0],
}

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=5,                     
    cv=2,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
    verbose=2,
    random_state=42
)

print("üîµ Starte LightGBM RandomizedSearchCV ‚Ä¶")
search.fit(X_train, y_train)
print("‚úÖ Hyperparameter-Suche fertig!")

üîµ Starte LightGBM RandomizedSearchCV ‚Ä¶
Fitting 2 folds for each of 5 candidates, totalling 10 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003753 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1544
[LightGBM] [Info] Number of data points in the train set: 53183, number of used features: 11
[LightGBM] [Info] Start training from score 263.166883
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000722 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1544
[LightGBM] [Info] Number of data points in the train set: 53183, number of used features: 11
[LightGBM] [Info] Start training from score 262.283027
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000394 seconds.
You can set `force_row_wise=true` to rem



[CV] END model__colsample_bytree=0.9, model__learning_rate=0.01, model__max_depth=8, model__min_child_samples=100, model__n_estimators=200, model__num_leaves=31, model__reg_alpha=0.0, model__reg_lambda=0.0, model__subsample=0.7; total time=   6.2s
[CV] END model__colsample_bytree=0.9, model__learning_rate=0.01, model__max_depth=8, model__min_child_samples=100, model__n_estimators=200, model__num_leaves=31, model__reg_alpha=0.0, model__reg_lambda=0.0, model__subsample=0.7; total time=   6.2s




[CV] END model__colsample_bytree=0.7, model__learning_rate=0.03, model__max_depth=16, model__min_child_samples=50, model__n_estimators=200, model__num_leaves=63, model__reg_alpha=0.1, model__reg_lambda=1.0, model__subsample=1.0; total time=  10.3s
[CV] END model__colsample_bytree=1.0, model__learning_rate=0.01, model__max_depth=-1, model__min_child_samples=50, model__n_estimators=400, model__num_leaves=31, model__reg_alpha=0.0, model__reg_lambda=0.0, model__subsample=1.0; total time=  10.4s




[CV] END model__colsample_bytree=0.7, model__learning_rate=0.03, model__max_depth=16, model__min_child_samples=50, model__n_estimators=200, model__num_leaves=63, model__reg_alpha=0.1, model__reg_lambda=1.0, model__subsample=1.0; total time=  11.4s
[CV] END model__colsample_bytree=1.0, model__learning_rate=0.01, model__max_depth=-1, model__min_child_samples=50, model__n_estimators=400, model__num_leaves=31, model__reg_alpha=0.0, model__reg_lambda=0.0, model__subsample=1.0; total time=  11.7s




[CV] END model__colsample_bytree=0.7, model__learning_rate=0.05, model__max_depth=8, model__min_child_samples=20, model__n_estimators=400, model__num_leaves=63, model__reg_alpha=1.0, model__reg_lambda=0.1, model__subsample=1.0; total time=  13.8s




[CV] END model__colsample_bytree=0.7, model__learning_rate=0.05, model__max_depth=8, model__min_child_samples=20, model__n_estimators=400, model__num_leaves=63, model__reg_alpha=1.0, model__reg_lambda=0.1, model__subsample=1.0; total time=  14.9s




[CV] END model__colsample_bytree=1.0, model__learning_rate=0.03, model__max_depth=8, model__min_child_samples=50, model__n_estimators=600, model__num_leaves=63, model__reg_alpha=0.0, model__reg_lambda=0.0, model__subsample=1.0; total time=  16.1s
[CV] END model__colsample_bytree=1.0, model__learning_rate=0.03, model__max_depth=8, model__min_child_samples=50, model__n_estimators=600, model__num_leaves=63, model__reg_alpha=0.0, model__reg_lambda=0.0, model__subsample=1.0; total time=  16.2s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000490 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1544
[LightGBM] [Info] Number of data points in the train set: 106366, number of used features: 11
[LightGBM] [Info] Start training from score 262.724955
‚úÖ Hyperparameter-Suche fertig!


In [11]:
best_lgbm = search.best_estimator_
print("üí° Beste Parameter:")
print(search.best_params_)

üí° Beste Parameter:
{'model__subsample': 1.0, 'model__reg_lambda': 0.1, 'model__reg_alpha': 1.0, 'model__num_leaves': 63, 'model__n_estimators': 400, 'model__min_child_samples': 20, 'model__max_depth': 8, 'model__learning_rate': 0.05, 'model__colsample_bytree': 0.7}


In [12]:
# Evaluation auf Validation-Set
val_preds = best_lgbm.predict(X_val)

mae = mean_absolute_error(y_val, val_preds)
rmse = np.sqrt(mean_squared_error(y_val, val_preds))
r2 = r2_score(y_val, val_preds)

print(f"üìâ Validation MAE (Tage): {mae:.3f}")
print(f"üìâ Validation RMSE (Tage): {rmse:.3f}")
print(f"üìà Validation R¬≤: {r2:.3f}")

üìâ Validation MAE (Tage): 8.354
üìâ Validation RMSE (Tage): 11.139
üìà Validation R¬≤: 0.995




In [13]:
# Bestes Param-Set extrahieren
best_params = search.best_params_

final_lgbm = LGBMRegressor(
    objective="regression",
    random_state=42,
    n_jobs=-1,
    # Params aus Suche:
    n_estimators=best_params["model__n_estimators"],
    learning_rate=best_params["model__learning_rate"],
    num_leaves=best_params["model__num_leaves"],
    max_depth=best_params["model__max_depth"],
    min_child_samples=best_params["model__min_child_samples"],
    subsample=best_params["model__subsample"],
    colsample_bytree=best_params["model__colsample_bytree"],
    reg_lambda=best_params["model__reg_lambda"],
    reg_alpha=best_params["model__reg_alpha"],
)

final_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", final_lgbm),
])

print("üîÅ Trainiere finales LightGBM-Modell auf ALLEN Trainingsdaten ‚Ä¶")
final_pipe.fit(X, y)
print("‚úÖ Finales Modell trainiert.")

model_path = os.path.join(MODEL_DIR, "lgbm_tuned_pipeline.pkl")
joblib.dump(final_pipe, model_path)
print(f"üíæ Modell gespeichert unter: {model_path}")

üîÅ Trainiere finales LightGBM-Modell auf ALLEN Trainingsdaten ‚Ä¶
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000720 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1544
[LightGBM] [Info] Number of data points in the train set: 141822, number of used features: 11
[LightGBM] [Info] Start training from score 262.640894
‚úÖ Finales Modell trainiert.
üíæ Modell gespeichert unter: ../models/lightgbm/lgbm_tuned_pipeline.pkl


### submission

In [14]:
# Eval-Daten laden (nur f√ºr evtl. sp√§tere Analysen ‚Äì f√ºr Submission reicht df_ids + df_orders)
df_public = pd.read_csv(PUBLIC_PATH)
df_private = pd.read_csv(PRIVATE_PATH)

print("df_public shape:", df_public.shape)
print("df_private shape:", df_private.shape)
print("df_ids shape:", df_ids.shape)

df_public shape: (4273, 13)
df_private shape: (4273, 13)
df_ids shape: (8546, 1)


In [15]:
# Merge: alle IDs, f√ºr die wir vorhersagen sollen, mit df_orders (Features)
df_submit = df_ids.merge(df_orders, on="AuftragsID", how="left")

print("df_submit shape:", df_submit.shape)
df_submit.head()

df_submit shape: (8546, 17)


Unnamed: 0,AuftragsID,BauteilID,Bauteilbezeichnung,Priorit√§t,Auftragseingang,Auftragsende_SOLL,Auftragsende_IST,Arbeitsschritt,AFO_Start_IST,AFO_Ende_IST,AFO_Dauer_IST_Stunde,target_days,Auftragseingang_num,Auftragsende_SOLL_num,AFO_Start_IST_num,AFO_Ende_IST_num,Auftragsende_IST_num
0,144502,1,Steuerventilmodul,1,2023-09-10,2023-10-02 11:52:00,NaT,100,2023-10-02 07:00:00,2023-10-04 08:40:00,13.25,,19610.0,19632.494444,19632.291667,19634.361111,-106751.991167
1,147886,1,Steuerventilmodul,1,2023-12-06,2023-12-27 12:22:00,NaT,100,2023-12-27 07:00:00,2023-12-27 11:52:00,3.62,,19697.0,19718.515278,19718.291667,19718.494444,-106751.991167
2,135024,1,Steuerventilmodul,1,2023-02-22,2023-03-01 12:42:00,NaT,100,2023-03-01 07:00:00,2023-11-17 14:49:00,8.9,,19410.0,19417.529167,19417.291667,19678.617361,-106751.991167
3,135000,2,Schwenkzylinder,1,2022-10-30,2023-02-28 14:02:00,NaT,100,2023-02-28 07:00:00,2023-11-21 09:16:00,14.77,,19295.0,19416.584722,19416.291667,19682.386111,-106751.991167
4,146714,2,Schwenkzylinder,1,2023-08-23,2023-12-01 13:12:00,NaT,100,2023-12-01 07:00:00,2023-12-01 11:58:00,4.97,,19592.0,19692.55,19692.291667,19692.498611,-106751.991167


In [16]:
# X-Spalten aus dem Training
feature_cols = X.columns.tolist()

# Manche Auftr√§ge in df_ids k√∂nnten theoretisch nicht in df_orders vorkommen ‚Üí check
missing_rows = df_submit[feature_cols].isna().all(axis=1).sum()
print("Anzahl Zeilen ohne bekannte Features (alles NaN):", missing_rows)

Anzahl Zeilen ohne bekannte Features (alles NaN): 0


In [17]:
# Features f√ºr Submission
X_submit = df_submit[feature_cols].copy()

# Predictions (Tage)
pred_days = final_pipe.predict(X_submit)

# Fertigstellungsdatum = Auftragseingang + vorhergesagte Dauer in Tagen
# Achtung: Auftragseingang muss datetime sein
df_submit["Auftragseingang"] = pd.to_datetime(df_submit["Auftragseingang"], errors="coerce")

df_submit["Auftragsende_PREDICTED"] = (
    df_submit["Auftragseingang"] + pd.to_timedelta(pred_days, unit="D")
).dt.strftime("%Y-%m-%d")

# Kaggle verlangt: ID, AuftragsID, Auftragsende_PREDICTED
submission = df_submit[["AuftragsID", "Auftragsende_PREDICTED"]].copy()
submission["ID"] = np.arange(1, len(submission) + 1)

submission = submission[["ID", "AuftragsID", "Auftragsende_PREDICTED"]]

submission_path = os.path.join(SUBMISSION_DIR, "submission_lgbm_tuned.csv")
submission.to_csv(submission_path, index=False)

submission.head(), submission_path



(   ID  AuftragsID Auftragsende_PREDICTED
 0   1      144502             2024-04-09
 1   2      147886             2024-06-18
 2   3      135024             2023-12-16
 3   4      135000             2023-11-28
 4   5      146714             2024-04-13,
 '../submissions/submission_lgbm_tuned.csv')