In [None]:
# Eigene Funktionen
from core.data import clean_data, engineer_features, load_competition_from_kaggle, memory_data, TopNCategoriesTransformer

# Datenmanipulation
import pandas as pd
import numpy as np

# Visualisierung
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier

# Imbalanced-learn
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

# Suppress selected warning categories so cell output stays readable
import warnings
from sklearn.exceptions import DataConversionWarning
# Ignore scikit-learn's DataConversionWarning for the rest of the session.
warnings.filterwarnings(action="ignore", category=DataConversionWarning)
# NOTE(review): this ignores every FutureWarning from any library, which also
# hides upcoming deprecations — confirm that this blanket filter is intended.
warnings.filterwarnings(action="ignore", category=FutureWarning)



# Preparation

**Gather Data**

In [None]:
# Download the data (Kaggle competition) into the raw-data folder
competition_name = "DontGetKicked"
destination = "../data/raw"

files = load_competition_from_kaggle(
    competition_name=competition_name,
    destination=destination,
)

# Locate the training file. Fail with an explicit message instead of a bare
# IndexError when none of the returned file names contains "training".
train_file = next((f for f in files if "training" in f.lower()), None)
if train_file is None:
    raise FileNotFoundError(
        f"No file with 'training' in its name was returned for competition "
        f"{competition_name!r}; available files: {list(files)}"
    )

# Read the training data
df = pd.read_csv("/".join([destination, competition_name, train_file]))
print(df.shape)
df.head()

# Explorative Datenanalyse (EDA)

**Understand Data**


In [None]:
# First overview of the data: dimensions, summary statistics, duplicate rows
display(
    "Shape",
    df.shape,
    "Description",
    df.describe().round(2).transpose(),
    "Duplicates",
    df.duplicated().sum(),
)

# Per-column overview: dtype, missing values, distinct values and three
# reproducibly sampled example values (fixed random_state)
pd.DataFrame(
    {
        "Data Types": df.dtypes,
        "Missing Values": df.isna().sum(),
        "Unique Values": df.nunique(),
        "Sample Values": [
            df[column].sample(n=3, random_state=42).tolist()
            for column in df.columns
        ],
    }
)

**Outlier Detection**

In [None]:
# Categorical features: distribution broken down by the target variable
categorical_features = [
    "Auction",
    "Transmission",
    "WheelTypeID",
    "WheelType",
    "Nationality",
    "TopThreeAmericanName",
    "PRIMEUNIT",
    "AUCGUART",
    "IsOnlineSale",
]

# Two-colour palette shared by all plots (one colour per target class)
palette = ["#009292", "#074650"]

for categorical_feature in categorical_features:
    # Give missing values their own "Missing" category
    df_col = df[categorical_feature].fillna("Missing")

    # Cross-tabulation of category vs. target
    print(f"\n=== {categorical_feature} ===")
    print(pd.crosstab(df_col, df["IsBadBuy"]))

    # Proportion plot, bars ordered by category frequency
    plt.figure(figsize=(16, 6))
    sns.countplot(
        data=df,
        x=df_col,
        hue="IsBadBuy",
        stat="proportion",
        palette=palette,
        order=df_col.value_counts().index,
    )
    plt.title(f"Distribution of {categorical_feature}")
    plt.xlabel(None)
    plt.ylabel("Proportion")
    plt.show()


In [None]:
# High-cardinality categorical features: distribution broken down by the target
high_cardinality_categorical_features = [
    "Make",
    "Color",
    "Size",
    "VNST",
    "Model",
    "Trim",
    "SubModel",
    "BYRNO",
    "VNZIP1",
]
top_n = 10

# Two-colour palette shared by all plots (one colour per target class)
palette = ["#009292", "#074650"]

for categorical_feature in high_cardinality_categorical_features:
    # Convert to strings and label missing values as "Missing".
    # The mask comes from the ORIGINAL column: astype("str") can turn NaN into
    # the literal string "nan", after which fillna("Missing") never matches
    # and missing values would show up as a "nan" category.
    df_col = (
        df[categorical_feature]
        .astype("str")
        .where(df[categorical_feature].notna(), other="Missing")
    )

    # Keep the top-N categories and collapse everything else into "Other"
    top_n_categories = df_col.value_counts().head(top_n).index
    df_col = df_col.where(df_col.isin(top_n_categories), other="Other")

    # Cross-tabulation of category vs. target (header added so each table is
    # attributable to its feature, matching the low-cardinality loop)
    print(f"\n=== {categorical_feature} ===")
    print(pd.crosstab(df_col, df["IsBadBuy"]))

    # Proportion plot, bars ordered by category frequency
    plt.figure(figsize=(16, 6))
    sns.countplot(
        x=df_col,
        data=df,
        hue="IsBadBuy",
        stat="proportion",
        order=df_col.value_counts().index,
        palette=palette,
    )
    plt.title(f"Distribution of {categorical_feature} (Top {top_n} + Other)")
    plt.xlabel(None)
    plt.ylabel("Proportion")
    plt.show()


In [None]:
# Numerical features: overall distribution and distribution per target class
numerical_features = [
    "MMRAcquisitionAuctionAveragePrice",
    "MMRAcquisitionAuctionCleanPrice",
    "MMRAcquisitionRetailAveragePrice",
    "MMRAcquisitonRetailCleanPrice",
    "MMRCurrentAuctionAveragePrice",
    "MMRCurrentAuctionCleanPrice",
    "MMRCurrentRetailAveragePrice",
    "MMRCurrentRetailCleanPrice",
    "VehBCost",
    "WarrantyCost",
    "VehOdo",
    "VehicleAge",
]

# Two-colour palette shared by all plots (one colour per target class)
palette = ["#009292", "#074650"]

for numerical_feature in numerical_features:
    fig, axes = plt.subplots(1, 2, figsize=(16, 4))

    # Left panel: histogram with KDE over all rows
    sns.histplot(x=df[numerical_feature], kde=True, ax=axes[0], color=palette[0])
    axes[0].set_title(f"Histogram of {numerical_feature}")

    # Right panel: one box per target class
    sns.boxplot(
        data=df,
        x="IsBadBuy",
        y=numerical_feature,
        hue="IsBadBuy",
        palette=palette,
        ax=axes[1],
    )
    axes[1].set_title(f"Boxplot of {numerical_feature}")

    plt.show()


In [None]:
# Pairwise correlations of the numeric columns, shown as an annotated heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(
    df.corr(numeric_only=True),
    annot=True,
    fmt=".2f",
    cmap="viridis",
    center=0,
)
plt.title("Korrelationsmatrix (numerische Features)")
plt.show()

# Relative class frequencies of the target over the full data set
print("Verteilung des Targets (gesamt):")
print(df["IsBadBuy"].value_counts(normalize=True))

# Split

**Train-Test-Split**

In [None]:
# Train/test split, stratified on the target because of class imbalance
target_col = "IsBadBuy"
target = df[target_col]
features = df.drop(columns=target_col)

features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.1,
    stratify=target,
    random_state=42,
)

# Sanity check of the split: shapes and class balance in both partitions
print("Dimensionen der Trainingsdaten (Features):", features_train.shape)
print("Dimensionen der Testdaten (Features):", features_test.shape)

print("\nVerteilung des Targets im Trainings-Set:")
print(target_train.value_counts(normalize=True))

print("\nVerteilung des Targets im Test-Set:")
print(target_test.value_counts(normalize=True))

# Data Preparation

**Datatype Transformation**

In [None]:
# Prepare clean_data for use inside the model pipeline
def apply_clean(X: pd.DataFrame) -> pd.DataFrame:
    """Delegate to ``clean_data`` so it can be wrapped in a FunctionTransformer.

    Args:
        X: Feature frame to clean.

    Returns:
        The result of ``clean_data(X)``.
    """
    cleaned = clean_data(X)
    return cleaned


clean_step = FunctionTransformer(func=apply_clean, validate=False)

# Cleaning and dtype conversion, applied here to the training features only
features_train_clean = apply_clean(features_train)

# Side-by-side comparison of the dtypes before and after cleaning
dtypes_before = features_train.dtypes
dtypes_after = features_train_clean.dtypes
dtype_comparison = pd.DataFrame(
    {
        "dtypes_before": dtypes_before,
        "dtypes_after": dtypes_after,
    }
)
display(dtype_comparison)

**Data Imputation**

In [None]:
# Apply feature engineering to a copy of the cleaned training data
features_train_engineered = engineer_features(features_train_clean.copy())

# Split the columns by dtype into numeric and categorical (object) features
numeric_features = (
    features_train_engineered.select_dtypes(include=np.number).columns.tolist()
)
categorical_features = (
    features_train_engineered.select_dtypes(include="object").columns.tolist()
)

# Define the categorical feature groups
high_cardinality_categorical_features = [
    "Make",
    "Color",
    "Size",
    "VNST",
    "Model",
    "Trim",
    "SubModel",
    "BYRNO",
    "VNZIP1",
]

# High-cardinality candidates that are actually present as object columns
high_card_features = [
    name
    for name in high_cardinality_categorical_features
    if name in categorical_features
]
# Every other object column counts as low-cardinality
low_card_features = [
    name for name in categorical_features if name not in high_card_features
]

# Low-cardinality columns imputed with the mode; the rest get a "Missing" label
mode_features = ["Transmission", "IsOnlineSale"]
low_card_mode_features = [
    name for name in low_card_features if name in mode_features
]
low_card_missing_features = [
    name for name in low_card_features if name not in low_card_mode_features
]

# Try out the imputation on its own (no encoding) to verify missing values
imputer = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), numeric_features),
        (
            "cat_mode",
            SimpleImputer(strategy="most_frequent"),
            low_card_mode_features,
        ),
        (
            "cat_missing",
            SimpleImputer(strategy="constant", fill_value="Missing"),
            low_card_missing_features + high_card_features,
        ),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)
imputer = imputer.set_output(transform="pandas")

features_train_imputed = imputer.fit_transform(features_train_engineered)

# Compare the missing-value counts per column before and after imputation
missing_values_before = features_train_engineered.isna().sum()
missing_values_after = features_train_imputed.isna().sum()
missing_values_comparison = pd.DataFrame(
    {
        "missing_values_before": missing_values_before,
        "missing_values_after": missing_values_after,
    }
)
display(missing_values_comparison)

# Modeling

**Feature Engineering**

In [None]:
# Prepare feature engineering for use inside the model pipeline
def apply_feature_engineering(X: pd.DataFrame) -> pd.DataFrame:
    """Delegate to ``engineer_features`` so it can be wrapped in a FunctionTransformer.

    Args:
        X: Feature frame to transform.

    Returns:
        The result of ``engineer_features(X)``.
    """
    engineered = engineer_features(X)
    return engineered


feature_engineering_step = FunctionTransformer(
    func=apply_feature_engineering,
    validate=False,
)

**Data Scaling + Dimensionality Reduction**

In [None]:
# Columns intended for PCA (the eight MMR price columns); all remaining
# numeric features are handled without PCA
pca_cols = [
    "MMRAcquisitionAuctionAveragePrice",
    "MMRAcquisitionAuctionCleanPrice",
    "MMRAcquisitionRetailAveragePrice",
    "MMRAcquisitonRetailCleanPrice",
    "MMRCurrentAuctionAveragePrice",
    "MMRCurrentAuctionCleanPrice",
    "MMRCurrentRetailAveragePrice",
    "MMRCurrentRetailCleanPrice",
]

# Only PCA columns that are actually among the numeric features
pca__numeric_features = [name for name in pca_cols if name in numeric_features]
# The complement: numeric features that bypass PCA
non_pca_numeric_features = [
    name for name in numeric_features if name not in pca__numeric_features
]

**One-Hot Encoding (OHE)**

In [None]:
# Define the categorical feature groups used by the encoders
high_cardinality_categorical_features = [
    "Make",
    "Color",
    "Size",
    "VNST",
    "Model",
    "Trim",
    "SubModel",
    "BYRNO",
    "VNZIP1",
]

# High-cardinality candidates that are actually present as categorical columns
high_card_features = [
    name
    for name in high_cardinality_categorical_features
    if name in categorical_features
]
# Every other categorical column counts as low-cardinality
low_card_features = [
    name for name in categorical_features if name not in high_card_features
]

# Low-cardinality columns imputed with the mode; the rest get a "Missing" label
mode_features = ["Transmission", "IsOnlineSale"]
low_card_mode_features = [
    name for name in low_card_features if name in mode_features
]
low_card_missing_features = [
    name for name in low_card_features if name not in low_card_mode_features
]

**Arbeitsspeicher-Optimierung**

In [None]:
# Prepare the memory optimisation for use inside the model pipeline
def apply_memory_data(X: pd.DataFrame) -> pd.DataFrame:
    """Delegate to ``memory_data`` so it can be wrapped in a FunctionTransformer.

    Args:
        X: Feature frame to process.

    Returns:
        The result of ``memory_data(X)``.
    """
    optimized = memory_data(X)
    return optimized


memory_step = FunctionTransformer(func=apply_memory_data, validate=False)

# Modeling

**Baseline Model**

In [None]:
# Preprocessing and model pipeline (including PCA and resampling)

# Numeric columns without PCA: median imputation, then standardisation
numeric_transformer_non_pca = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Numeric PCA columns: impute, scale, then keep the components explaining
# 95% of the variance
numeric_transformer_pca = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=0.95, random_state=42)),
    ]
)

# Low-cardinality categoricals imputed with the most frequent value
categorical_transformer_low_card_mode = Pipeline(
    [
        ("imputer_mode", SimpleImputer(strategy="most_frequent")),
        ("encoder_mode", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Low-cardinality categoricals where missing values become a "Missing" category
categorical_transformer_low_card_missing = Pipeline(
    [
        ("imputer_missing", SimpleImputer(strategy="constant", fill_value="Missing")),
        ("encoder_missing", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# High-cardinality categoricals: "Missing" label, project-specific top-N
# transformer (top_n=19), then one-hot encoding
categorical_transformer_high_card_missing = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
        ("top_n", TopNCategoriesTransformer(top_n=19)),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Route each column group to its transformer; unlisted columns are dropped
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer_non_pca, non_pca_numeric_features),
        ("pca", numeric_transformer_pca, pca__numeric_features),
        ("cat_high_card", categorical_transformer_high_card_missing, high_card_features),
        ("cat_mode", categorical_transformer_low_card_mode, low_card_mode_features),
        ("cat_missing", categorical_transformer_low_card_missing, low_card_missing_features),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)
preprocessor = preprocessor.set_output(transform="pandas")

# Baseline: clean -> engineer -> preprocess -> memory step -> undersample -> random forest
model_pipeline = ImbPipeline(
    [
        ("clean", clean_step),
        ("feature_engineering", feature_engineering_step),
        ("preprocess", preprocessor),
        ("memory", memory_step),
        ("sampler", RandomUnderSampler(random_state=42)),
        ("model", RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)),
    ]
)

model_pipeline

In [None]:
# Compare several classifiers on the same preprocessing pipeline
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "KNeighbors": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(random_state=42),
}

results = []

for model_name, model in models.items():
    # NOTE(review): unlike the baseline pipeline, no "memory" step is used
    # here — confirm that this is intended.
    model_pipeline = ImbPipeline(
        [
            ("clean", clean_step),
            ("feature_engineering", feature_engineering_step),
            ("preprocess", preprocessor),
            ("sampler", RandomUnderSampler(random_state=42)),
            ("model", model),
        ]
    )

    # Fit on the training split, then predict on the held-out test split
    model_pipeline.fit(features_train, target_train)
    target_test_pred = model_pipeline.predict(features_test)

    # ROC-AUC needs positive-class probabilities; fall back to NaN otherwise
    if hasattr(model_pipeline, "predict_proba"):
        target_test_proba = model_pipeline.predict_proba(features_test)[:, 1]
        roc_auc = roc_auc_score(target_test, target_test_proba)
    else:
        target_test_proba = None
        roc_auc = np.nan

    print(f"\n=== {model_name} ===")
    print(classification_report(target_test, target_test_pred))
    if target_test_proba is not None:
        print("ROC-AUC:", roc_auc)

    # Collect the headline metrics for the summary table
    results.append(
        {
            "Model": model_name,
            "ROC-AUC": roc_auc,
            "Accuracy": (target_test == target_test_pred).mean(),
        }
    )

results_df = pd.DataFrame(results)
display(results_df)

# Model Interpretation