# Bibliotheken

In [None]:
# Eigene Funktionen
from core.data import clean_data, drop_columns, engineer_features, load_competition_from_kaggle, memory_data, TopNCategoriesTransformer

# Datenmanipulation
import pandas as pd
import numpy as np
import joblib
import os

# Visualisierung
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from xgboost import XGBClassifier
from sklearn.inspection import PartialDependenceDisplay

# Imbalanced-learn
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

# Hyperparameter Optimierung
from skopt import BayesSearchCV
from skopt.space import Integer, Real

# Silence warning categories that would otherwise clutter the notebook output
import warnings

from sklearn.exceptions import DataConversionWarning

# Registered in this order: DataConversionWarning first, then FutureWarning
warnings.filterwarnings("ignore", category=DataConversionWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# User Input

In [None]:
# Switch for the hyperparameter search: 1 runs BayesSearchCV, 0 skips it
run_bayes_search = 0

# Directory and file name of the persisted model pipeline
SAVE_DIR = "../data/processed/"
MODEL_FILENAME = "best_model_pipeline.pkl"

# Full path of the persisted model pipeline
MODEL_PATH = os.path.join(SAVE_DIR, MODEL_FILENAME)

# Preparation

**Gather Data**

In [None]:
# Download the data (Kaggle competition)
competition_name = "DontGetKicked"
destination = "../data/raw"

files = load_competition_from_kaggle(
    competition_name=competition_name,
    destination=destination,
)

# Pick the first file whose name contains "training" (case-insensitive).
# Fail with an explicit message instead of a bare IndexError when none matches.
train_file = next((f for f in files if "training" in f.lower()), None)
if train_file is None:
    raise FileNotFoundError(
        f"No file containing 'training' found for competition "
        f"'{competition_name}'; available files: {files}"
    )

# Read the training data
df = pd.read_csv("/".join([destination, competition_name, train_file]))
print(df.shape)
df.head()

# Explorative Datenanalyse (EDA)

**Understand Data**


In [None]:
# First look at the data: dimensions, summary statistics, duplicate rows
summary_statistics = df.describe().round(2).T
duplicate_count = df.duplicated().sum()
display(
    "Shape", df.shape,
    "Description", summary_statistics,
    "Duplicates", duplicate_count,
)

# Per-column overview: dtype, missing values, distinct values, three sampled values
column_overview = {
    "Data Types": df.dtypes,
    "Missing Values": df.isnull().sum(),
    "Unique Values": df.nunique(),
    "Sample Values": [
        df[col].sample(3, random_state=42).tolist() for col in df.columns
    ],
}
pd.DataFrame(column_overview)

**Outlier Detection**

In [None]:
# Categorical features: distribution split by the target variable
categorical_features = [
    "Auction",
    "Transmission",
    "WheelTypeID",
    "WheelType",
    "Nationality",
    "TopThreeAmericanName",
    "PRIMEUNIT",
    "AUCGUART",
    "IsOnlineSale",
]

# One colour per target class
palette = ["#009292", "#074650"]

for categorical_feature in categorical_features:
    # Label missing values explicitly as "Missing"
    df_col = df[categorical_feature].fillna("Missing")
    category_order = df_col.value_counts().index

    # Cross-tabulation against the target
    print(f"\n=== {categorical_feature} ===")
    print(pd.crosstab(df_col, df["IsBadBuy"]))

    # Proportion plot, categories ordered by frequency
    plt.figure(figsize=(16, 6))
    sns.countplot(
        data=df,
        x=df_col,
        hue="IsBadBuy",
        stat="proportion",
        order=category_order,
        palette=palette,
    )
    plt.title(f"Distribution of {categorical_feature}")
    plt.xlabel(None)
    plt.ylabel("Proportion")
    plt.show()


In [None]:
# High-cardinality categorical features: distribution split by the target variable
high_cardinality_categorical_features = [
    "Make",
    "Color",
    "Size",
    "VNST",
    "Model",
    "Trim",
    "SubModel",
    "BYRNO",
    "VNZIP1",
]
top_n = 10

for categorical_feature in high_cardinality_categorical_features:
    # Label missing values as "Missing" BEFORE casting to string:
    # astype(str) turns NaN into the literal "nan", after which fillna
    # would no longer find anything to fill.
    df_col = df[categorical_feature].fillna("Missing").astype("str")

    # Keep the top-N categories and collapse everything else into "Other"
    top_n_categories = df_col.value_counts().head(top_n).index
    df_col = df_col.where(df_col.isin(top_n_categories), other="Other")

    # Cross-tabulation against the target
    print(pd.crosstab(df_col, df["IsBadBuy"]))

    # Proportion plot, categories ordered by frequency
    plt.figure(figsize=(16, 6))
    palette = ["#009292", "#074650"]
    sns.countplot(
        x=df_col,
        data=df,
        hue="IsBadBuy",
        stat="proportion",
        order=df_col.value_counts().index,
        palette=palette,
    )
    plt.title(f"Distribution of {categorical_feature} (Top {top_n} + Other)")
    plt.xlabel(None)
    plt.ylabel("Proportion")
    plt.show()


In [None]:
# Numerical features: overall distribution and distribution by target class
numerical_features = [
    "MMRAcquisitionAuctionAveragePrice",
    "MMRAcquisitionAuctionCleanPrice",
    "MMRAcquisitionRetailAveragePrice",
    "MMRAcquisitonRetailCleanPrice",
    "MMRCurrentAuctionAveragePrice",
    "MMRCurrentAuctionCleanPrice",
    "MMRCurrentRetailAveragePrice",
    "MMRCurrentRetailCleanPrice",
    "VehBCost",
    "WarrantyCost",
    "VehOdo",
    "VehicleAge",
]

# One colour per target class; the histogram uses the first colour
palette = ["#009292", "#074650"]

for numerical_feature in numerical_features:
    fig, axes = plt.subplots(1, 2, figsize=(16, 4))
    hist_ax, box_ax = axes

    # Left panel: histogram with KDE over all rows
    sns.histplot(x=df[numerical_feature], kde=True, ax=hist_ax, color=palette[0])

    # Right panel: one box per target class
    sns.boxplot(
        data=df,
        x="IsBadBuy",
        y=numerical_feature,
        hue="IsBadBuy",
        ax=box_ax,
        palette=palette,
    )

    hist_ax.set_title(f"Histogram of {numerical_feature}")
    box_ax.set_title(f"Boxplot of {numerical_feature}")
    plt.show()


In [None]:
# Correlation heatmap of the numeric columns
correlation_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="viridis",
    center=0,
    fmt=".2f",
)
plt.title("Korrelationsmatrix (numerische Features)")
plt.show()

# Relative class frequencies of the target
target_distribution = df["IsBadBuy"].value_counts(normalize=True)
print("Verteilung des Targets (gesamt):")
print(target_distribution)

# Split

**Train-Test-Split**

In [None]:
# Train/test split, stratified because the target classes are imbalanced
target_col = "IsBadBuy"
target = df[target_col]
features = df.drop(columns=target_col)

features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.1, stratify=target, random_state=42
)

# Sanity check of the split: shapes and class proportions per subset
print("Dimensionen der Trainingsdaten (Features):", features_train.shape)
print("Dimensionen der Testdaten (Features):", features_test.shape)

print("\nVerteilung des Targets im Trainings-Set:")
print(target_train.value_counts(normalize=True))

print("\nVerteilung des Targets im Test-Set:")
print(target_test.value_counts(normalize=True))

# Data Preparation

**Datatype Transformation**

In [None]:
# Wrap clean_data so it can be used as a step of the model pipeline
clean_step = FunctionTransformer(clean_data, validate=False)

# Clean the training features (only the training set is cleaned here)
features_train_clean = clean_data(features_train)

# Compare the dtypes before and after cleaning
dtypes_before = features_train.dtypes
dtypes_after = features_train_clean.dtypes
dtype_comparison = pd.DataFrame(
    {
        "dtypes_before": dtypes_before,
        "dtypes_after": dtypes_after,
    }
)
display(dtype_comparison)

**Data Imputation**

In [None]:
# Apply feature engineering to the cleaned training features
features_train_engineered = engineer_features(features_train_clean)

# Split the columns into numerical and categorical (object dtype) ones
numerical_features = (
    features_train_engineered.select_dtypes(include=np.number).columns.tolist()
)
categorical_features = (
    features_train_engineered.select_dtypes(include="object").columns.tolist()
)

# Candidate high-cardinality columns
high_cardinality_categorical_features = [
    "Make",
    "Color",
    "Size",
    "VNST",
    "Model",
    "Trim",
    "SubModel",
    "BYRNO",
    "VNZIP1",
]

# High-cardinality columns that are actually present as categorical columns;
# every other categorical column counts as low-cardinality
high_card_features = [
    col for col in high_cardinality_categorical_features if col in categorical_features
]
low_card_features = [
    col for col in categorical_features if col not in high_card_features
]

# Low-cardinality columns are imputed either with the mode or with "Missing"
mode_features = ["Transmission", "IsOnlineSale"]
low_card_mode_features = [col for col in low_card_features if col in mode_features]
low_card_missing_features = [
    col for col in low_card_features if col not in low_card_mode_features
]

# Imputation only (no encoding), to verify that no missing values remain
imputation_transformers = [
    ("num", SimpleImputer(strategy="median"), numerical_features),
    ("cat_mode", SimpleImputer(strategy="most_frequent"), low_card_mode_features),
    (
        "cat_missing",
        SimpleImputer(strategy="constant", fill_value="Missing"),
        low_card_missing_features + high_card_features,
    ),
]
imputer = ColumnTransformer(
    transformers=imputation_transformers,
    remainder="drop",
    verbose_feature_names_out=False,
)
imputer = imputer.set_output(transform="pandas")

features_train_imputed = imputer.fit_transform(features_train_engineered)

# Compare the missing-value counts before and after imputation
missing_values_before = features_train_engineered.isna().sum()
missing_values_after = features_train_imputed.isna().sum()
missing_values_comparison = pd.DataFrame(
    {
        "missing_values_before": missing_values_before,
        "missing_values_after": missing_values_after,
    }
)
display(missing_values_comparison)

# Modeling

**Feature Engineering**

In [None]:
# Wrap engineer_features so it can be used as a step of the model pipeline
feature_engineering_step = FunctionTransformer(engineer_features, validate=False)

**Feature Selection**

In [None]:
# Columns removed before modelling, grouped by the reason given for dropping them
feature_drop = [
    # redundant
    "PurchDate",
    "VehYear",
    "WheelTypeID",
    "VNZIP1",
    # buyer id
    "BYRNO",
    # feature importance / EDA
    "IsOnlineSale",
    "Transmission",
    "Nationality",
]

# Wrap drop_columns so it can be used as a step of the model pipeline
feature_selection_step = FunctionTransformer(
    drop_columns,
    kw_args={"cols_to_drop": feature_drop},
    validate=False,
)

# Numerical and categorical columns that remain after the drop
final_num_features = [
    col for col in numerical_features if col not in feature_drop
]
final_cat_features = [
    col for col in categorical_features if col not in feature_drop
]

**Data Scaling + Dimensionality Reduction**

In [None]:
# Columns routed through PCA (the eight MMR price columns)
pca_cols = [
    "MMRAcquisitionAuctionAveragePrice",
    "MMRAcquisitionAuctionCleanPrice",
    "MMRAcquisitionRetailAveragePrice",
    "MMRAcquisitonRetailCleanPrice",
    "MMRCurrentAuctionAveragePrice",
    "MMRCurrentAuctionCleanPrice",
    "MMRCurrentRetailAveragePrice",
    "MMRCurrentRetailCleanPrice",
]

# PCA columns that survived feature selection, in pca_cols order
pca_numeric_features = [
    col for col in pca_cols if col in final_num_features
]

# All remaining numerical columns, in final_num_features order
non_pca_numeric_features = [
    col for col in final_num_features if col not in pca_numeric_features
]

**OHE Encoding**

In [None]:
# Candidate high-cardinality columns
high_cardinality_categorical_features = [
    "Make",
    "Color",
    "Size",
    "VNST",
    "Model",
    "Trim",
    "SubModel",
    "BYRNO",
    "VNZIP1",
]

# High-cardinality columns still present after feature selection;
# every other remaining categorical column counts as low-cardinality
high_card_features = [
    col for col in high_cardinality_categorical_features if col in final_cat_features
]
low_card_features = [
    col for col in final_cat_features if col not in high_card_features
]

# Low-cardinality columns are imputed either with the mode or with "Missing"
mode_features = ["Transmission", "IsOnlineSale"]
low_card_mode_features = [
    col for col in low_card_features if col in mode_features
]
low_card_missing_features = [
    col for col in low_card_features if col not in low_card_mode_features
]

**Arbeitsspeicher Optimierung**

In [None]:
# Prepare the memory-optimisation step for the model pipeline
def apply_memory_data(X: pd.DataFrame) -> pd.DataFrame:
    """Delegate to ``memory_data`` so it can serve as a pipeline step.

    Args:
        X: Frame handed over by the preceding pipeline step.

    Returns:
        The result of ``memory_data(X)``, unchanged.
    """
    optimized = memory_data(X)
    return optimized

memory_step = FunctionTransformer(func=apply_memory_data, validate=False)

# Modeling

**Baseline Model**

In [None]:
# Preprocessing and model pipeline (including PCA and resampling)

# Numerical columns outside the PCA group: median imputation, then scaling
numeric_transformer_non_pca = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# PCA group: median imputation, scaling, then PCA with n_components=0.95
numeric_transformer_pca = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=0.95, random_state=42)),
    ]
)

# Low-cardinality categoricals imputed with the most frequent value
categorical_transformer_low_card_mode = Pipeline(
    [
        ("imputer_mode", SimpleImputer(strategy="most_frequent")),
        ("encoder_mode", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Low-cardinality categoricals imputed with the constant "Missing"
categorical_transformer_low_card_missing = Pipeline(
    [
        ("imputer_missing", SimpleImputer(strategy="constant", fill_value="Missing")),
        ("encoder_missing", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# High-cardinality categoricals: "Missing" imputation, top-N transformer, one-hot
categorical_transformer_high_card = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
        ("top_n", TopNCategoriesTransformer(top_n=19)),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Route each column group to its transformer; unlisted columns are dropped
column_routes = [
    ("num", numeric_transformer_non_pca, non_pca_numeric_features),
    ("pca", numeric_transformer_pca, pca_numeric_features),
    ("cat_high_card", categorical_transformer_high_card, high_card_features),
    ("cat_mode", categorical_transformer_low_card_mode, low_card_mode_features),
    ("cat_missing", categorical_transformer_low_card_missing, low_card_missing_features),
]
preprocessor = ColumnTransformer(
    transformers=column_routes,
    remainder="drop",
    verbose_feature_names_out=False,
)
preprocessor = preprocessor.set_output(transform="pandas")

# Baseline: random forest trained on randomly undersampled data
baseline_steps = [
    ("clean", clean_step),
    ("feature_engineering", feature_engineering_step),
    ("preprocess", preprocessor),
    ("memory", memory_step),
    ("sampler", RandomUnderSampler(random_state=42)),
    ("model", RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)),
]
model_pipeline = ImbPipeline(steps=baseline_steps)

model_pipeline

**Modell Evaluierung**

In [None]:
# Compare several candidate models on the same preprocessing pipeline
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(random_state=42),
}

# Metrics reported per model, in column order
metric_functions = {
    "Precision": precision_score,
    "Recall": recall_score,
    "F1": f1_score,
}

results = []

for model_name, model in models.items():
    model_pipeline = ImbPipeline(
        [
            ("clean", clean_step),
            ("feature_engineering", feature_engineering_step),
            ("feature_selection", feature_selection_step),
            ("preprocess", preprocessor),
            ("sampler", RandomUnderSampler(random_state=42)),
            ("model", model),
        ]
    )

    # Fit on the training split, score on the held-out test split
    model_pipeline.fit(features_train, target_train)
    target_test_pred = model_pipeline.predict(features_test)

    scores = {"Model": model_name}
    for metric_name, metric_function in metric_functions.items():
        scores[metric_name] = metric_function(target_test, target_test_pred)
    results.append(scores)

# Rank the models by recall, best first
results_df = pd.DataFrame(results).sort_values(by="Recall", ascending=False)
display(results_df)

**Optimierung der Hyperparameter mit BayesSearchCV**

In [None]:
if run_bayes_search == 1:
    # Final pipeline: cleaning, feature engineering, feature selection,
    # preprocessing, undersampling and the XGBoost classifier
    pipeline_final = ImbPipeline(
        steps=[
            ("clean", clean_step),
            ("feature_engineering", feature_engineering_step),
            ("feature_selection", feature_selection_step),
            ("preprocess", preprocessor),
            ("sampler", RandomUnderSampler(random_state=42)),
            ("model", XGBClassifier(random_state=42)),
        ]
    )

    # Hyperparameter search space
    search_space_xgb = {
        # Number of trees
        "model__n_estimators": Integer(50, 500),
        # Maximum tree depth
        "model__max_depth": Integer(3, 15),
        # Learning rate
        "model__learning_rate": Real(0.01, 0.3, prior="log-uniform"),
        # Instance weights (min_child_weight)
        "model__min_child_weight": Integer(1, 10),
        # Subsample ratio of the training instances (against overfitting)
        "model__subsample": Real(0.5, 1.0),
        # Subsample ratio of the columns per tree
        "model__colsample_bytree": Real(0.5, 1.0),
        # Minimum loss reduction (regularisation)
        "model__gamma": Real(0, 5),
    }

    # Stratified k-fold cross-validation
    cv_stratified = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Bayesian optimisation with BayesSearchCV, scored by F1
    bayes_model_final = BayesSearchCV(
        estimator=pipeline_final,
        search_spaces=search_space_xgb,
        n_iter=100,
        scoring="f1",
        cv=cv_stratified,
        n_jobs=-1,
        random_state=42,
    )

    # Fit on the training data
    bayes_model_final.fit(features_train, target_train)

    # Best pipeline found by the search
    best_model_final = bayes_model_final.best_estimator_

    # Predict on the held-out test data
    target_test_pred_final = best_model_final.predict(features_test)

    # Report the class that was actually tuned; the message used to name
    # RandomForestClassifier although the pipeline holds an XGBClassifier.
    tuned_model_name = type(best_model_final.named_steps["model"]).__name__
    print(f"Optimierte Hyperparameter für {tuned_model_name} :\n{bayes_model_final.best_params_}")
    print("\nClassification report")
    report_final = classification_report(target_test, target_test_pred_final)
    print(report_final)

    # Persist the best model (complete pipeline). Create the target
    # directory first so a long search is not lost on a missing folder.
    os.makedirs(SAVE_DIR, exist_ok=True)
    joblib.dump(best_model_final, MODEL_PATH)
    print("Modell gespeichert.")

else:
    # NOTE(review): joblib.load unpickles the file and can therefore execute
    # arbitrary code; only load a model file that was produced by this notebook.
    best_model_final = joblib.load(MODEL_PATH)

# Anwendung des Modells auf Zieldaten

In [None]:
# Pick the first file whose name contains "test" (case-insensitive).
# Fail with an explicit message instead of a bare IndexError when none matches.
test_file = next((f for f in files if "test" in f.lower()), None)
if test_file is None:
    raise FileNotFoundError(
        f"No file containing 'test' found for competition "
        f"'{competition_name}'; available files: {files}"
    )

# Read the data
df_aim = pd.read_csv("/".join([destination, competition_name, test_file]))
print(df_aim.shape)

# Predict and store the result as a new column
df_aim["IsBadBuy"] = best_model_final.predict(df_aim)

print("\nVerteilung des Targets 'IsBadBuy':\n")
print(df_aim["IsBadBuy"].value_counts(),"\n")
print(df_aim["IsBadBuy"].value_counts(normalize=True),"\n")
display(df_aim.head())

# Model Interpretation

In [None]:
# Feature importance of the final model
final_model = best_model_final.named_steps["model"]
final_preprocessor = best_model_final.named_steps["preprocess"]

# Output column names of the fitted preprocessing step
feature_names = final_preprocessor.get_feature_names_out()

if hasattr(final_model, "feature_importances_"):
    importances = final_model.feature_importances_

    # Keep the 20 most important features, highest first
    fi_df = (
        pd.DataFrame({"feature": feature_names, "importance": importances})
        .sort_values("importance", ascending=False)
        .head(20)
    )

    plt.figure(figsize=(8, 6))
    # Use a single `color`: passing `palette` without `hue` is deprecated in
    # seaborn, and a two-colour list would merely cycle over the 20 bars.
    sns.barplot(data=fi_df, x="importance", y="feature", color="#009292")
    plt.title("Top 20 Feature Importances")
    plt.tight_layout()
    plt.show()

    display(fi_df)
else:
    print("Das finale Modell unterstützt keine feature_importances_.")


In [None]:
# Partial dependence (with ICE lines) for VehicleAge and VehOdo
features_pdp = ["VehicleAge", "VehOdo"]
sample = features_train.sample(5000, random_state=42)

fig, ax = plt.subplots(2, 1, figsize=(16, 10))

# One stacked panel per feature, drawn top to bottom in features_pdp order
for pdp_axis, pdp_feature in zip(ax, features_pdp):
    PartialDependenceDisplay.from_estimator(
        best_model_final,
        X=sample,
        features=[pdp_feature],
        ax=pdp_axis,
        kind="both",
        ice_lines_kw={"color": "#009292", "alpha": 0.15},
        pd_line_kw={"color": "#074650", "linewidth": 2.0},
    )

plt.tight_layout()
plt.show()

In [None]:
# Two-way partial dependence: interaction between VehicleAge and VehOdo
interaction_pair = ("VehicleAge", "VehOdo")

fig, ax = plt.subplots(figsize=(6, 5))

PartialDependenceDisplay.from_estimator(
    best_model_final,
    X=sample,
    features=[interaction_pair],
    ax=ax,
    kind="average",
    subsample=1,
)

plt.tight_layout()
plt.show()