In [None]:
# Datenmanipulation
import pandas as pd
import numpy as np

# Visualisierung
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer

# Imbalanced-learn
from imblearn.under_sampling import RandomUnderSampler

# Daten laden
from core.data import clean_data, load_competition_from_kaggle

# Preparation

**Gather Data**

In [None]:
# Download the data (Kaggle competition)
competition_name = "DontGetKicked"
destination = "../data/raw"

files = load_competition_from_kaggle(
    competition_name=competition_name,
    destination=destination,
)

# Locate the training file: the first entry whose name contains "training"
# (case-insensitive). Fail with a descriptive error instead of an opaque
# IndexError when nothing matches.
train_file = next((f for f in files if "training" in f.lower()), None)
if train_file is None:
    raise FileNotFoundError(
        f"No file containing 'training' found for competition {competition_name!r}; "
        f"available files: {list(files)}"
    )

# Read the data; the path is built as <destination>/<competition_name>/<train_file>
df = pd.read_csv("/".join([destination, competition_name, train_file]))
print(df.shape)
df.head()

# Explorative Datenanalyse (EDA)

**Understand Data**


In [None]:
# First look at the data: dimensions, descriptive statistics, duplicate rows
summary_stats = df.describe().round(2).T
n_duplicate_rows = df.duplicated().sum()
display(
    "Shape", df.shape,
    "Description", summary_stats,
    "Duplicates", n_duplicate_rows,
)

# Per-column overview: dtype, missing count, distinct count and three sampled
# values (fixed seed, so the samples are reproducible)
sample_values = [
    df[column].sample(3, random_state=42).tolist() for column in df.columns
]
column_overview = pd.DataFrame(
    {
        "Data Types": df.dtypes,
        "Missing Values": df.isna().sum(),
        "Unique Values": df.nunique(),
        "Sample Values": sample_values,
    }
)
column_overview

**Outlier Detection**

In [None]:
# Categorical features: distribution by target variable
categorical_features = [
    "Auction",
    "Transmission",
    "WheelTypeID",
    "WheelType",
    "Nationality",
    "TopThreeAmericanName",
    "PRIMEUNIT",
    "AUCGUART",
    "IsOnlineSale",
]

# Colours for the hue levels; identical for every plot, so defined once
palette = ["#009292", "#074650"]

for feature_name in categorical_features:
    # Treat missing values as their own "Missing" category
    filled = df[feature_name].fillna("Missing")

    # Cross-tabulation of category vs. target
    print(f"\n=== {feature_name} ===")
    print(pd.crosstab(filled, df["IsBadBuy"]))

    # Grouped bar chart; bars are ordered by descending category frequency
    fig, ax = plt.subplots(figsize=(16, 6))
    sns.countplot(
        data=df,
        x=filled,
        hue="IsBadBuy",
        stat="proportion",
        order=filled.value_counts().index,
        palette=palette,
        ax=ax,
    )
    ax.set_title(f"Distribution of {feature_name}")
    ax.set_xlabel(None)
    ax.set_ylabel("Proportion")
    plt.show()


In [None]:
# High-cardinality categorical features: distribution by target variable
high_cardinality_categorical_features = [
    "Make",
    "Color",
    "Size",
    "VNST",
    "Model",
    "Trim",
    "SubModel",
    "BYRNO",
    "VNZIP1",
]
top_n = 10

# Colours for the hue levels; loop-invariant, so defined once
palette = ["#009292", "#074650"]

for categorical_feature in high_cardinality_categorical_features:
    # Cast to object before filling so the string labels "Missing"/"Other"
    # can be stored regardless of the column's original dtype
    df_col = df[categorical_feature].astype("object").fillna("Missing")

    # Keep the top-N most frequent categories, lump everything else into "Other"
    top_n_categories = df_col.value_counts().head(top_n).index
    df_col = df_col.where(df_col.isin(top_n_categories), other="Other")

    # Cross-tabulation, with the same section header the plain categorical
    # cell prints so the outputs of both cells read alike
    print(f"\n=== {categorical_feature} ===")
    print(pd.crosstab(df_col, df["IsBadBuy"]))

    # Grouped bar chart; bars are ordered by descending frequency of the
    # bucketed categories
    fig, ax = plt.subplots(figsize=(16, 6))
    sns.countplot(
        x=df_col,
        data=df,
        hue="IsBadBuy",
        stat="proportion",
        order=df_col.value_counts().index,
        palette=palette,
        ax=ax,
    )
    ax.set_title(f"Distribution of {categorical_feature} (Top {top_n} + Other)")
    ax.set_xlabel(None)
    ax.set_ylabel("Proportion")
    plt.show()


In [None]:
# Numerical features: overall distribution and distribution by target variable
numerical_features = [
    "MMRAcquisitionAuctionAveragePrice",
    "MMRAcquisitionAuctionCleanPrice",
    "MMRAcquisitionRetailAveragePrice",
    "MMRAcquisitonRetailCleanPrice",
    "MMRCurrentAuctionAveragePrice",
    "MMRCurrentAuctionCleanPrice",
    "MMRCurrentRetailAveragePrice",
    "MMRCurrentRetailCleanPrice",
    "VehBCost",
    "WarrantyCost",
    "VehOdo",
    "VehicleAge",
]

# Colours shared by all plots; the histogram uses the first one
palette = ["#009292", "#074650"]

for feature_name in numerical_features:
    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(16, 4))

    # Left panel: histogram of the feature with a KDE overlay
    sns.histplot(x=df[feature_name], kde=True, color=palette[0], ax=ax_hist)
    ax_hist.set_title(f"Histogram of {feature_name}")

    # Right panel: one box per IsBadBuy value
    sns.boxplot(
        data=df,
        x="IsBadBuy",
        y=feature_name,
        hue="IsBadBuy",
        palette=palette,
        ax=ax_box,
    )
    ax_box.set_title(f"Boxplot of {feature_name}")

    plt.show()


In [None]:
# Pairwise correlations of the numeric columns, shown as an annotated heatmap
correlation_matrix = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="viridis",
    center=0,
    ax=ax,
)
ax.set_title("Korrelationsmatrix (numerische Features)")
plt.show()

# Relative class frequencies of the target over the whole dataset
target_shares = df["IsBadBuy"].value_counts(normalize=True)
print("Verteilung des Targets (gesamt):")
print(target_shares)

# Split

**Train-Test-Split**

In [None]:
# Train/test split, stratified on the target so both parts keep the same
# class ratio (the original author notes the classes are imbalanced)
target_col = "IsBadBuy"
target = df[target_col]
features = df.drop(columns=target_col)

features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.1,
    stratify=target,
    random_state=42,
)

# Sanity check of the split: shapes and class ratios of both parts
print("Dimensionen der Trainingsdaten (Features):", features_train.shape)
print("Dimensionen der Testdaten (Features):", features_test.shape)
for header, target_part in (
    ("\nVerteilung des Targets im Trainings-Set:", target_train),
    ("\nVerteilung des Targets im Test-Set:", target_test),
):
    print(header)
    print(target_part.value_counts(normalize=True))

# Data Preparation

**Datatype Transformation**

In [None]:
# Adapter so the cleaning routine can be plugged into a pipeline
def apply_clean(X: pd.DataFrame) -> pd.DataFrame:
    """Run ``clean_data`` on ``X`` and return whatever it returns.

    Exists so the imported cleaning function can be handed to
    ``FunctionTransformer`` (and called directly) under a notebook-local name.
    """
    cleaned = clean_data(X)
    return cleaned


# Transformer step wrapping the adapter; input validation is switched off
clean_step = FunctionTransformer(func=apply_clean, validate=False)

In [None]:
# Clean and convert dtypes of the training and the test features
# (training first, then test)
features_train_clean, features_test_clean = (
    apply_clean(feature_part) for feature_part in (features_train, features_test)
)

# Side-by-side comparison of the column dtypes before and after cleaning.
# NOTE(review): the "before" dtypes are read after cleaning has already run;
# this assumes clean_data does not modify its input in place — confirm.
dtype_comparison = pd.DataFrame(
    {
        "dtypes_before": features_train.dtypes,
        "dtypes_after": features_train_clean.dtypes,
    }
)
display(dtype_comparison)

**Data Imputation**

In [None]:
# Split the cleaned features into numeric, categorical (object dtype) and
# datetime columns
numeric_features = features_train_clean.select_dtypes(include=np.number).columns.tolist()
categorical_features = features_train_clean.select_dtypes(include="object").columns.tolist()
datetime_features = features_train_clean.select_dtypes(include="datetime").columns.tolist()

# Columns matched by none of the three selectors above (e.g. bool or category
# dtype) are discarded by remainder="drop" below. Report them so that losing
# a feature this way does not go unnoticed.
selected_features = set(numeric_features) | set(categorical_features) | set(datetime_features)
dropped_features = [col for col in features_train_clean.columns if col not in selected_features]
if dropped_features:
    print("Spalten ohne passenden Transformer (werden verworfen):", dropped_features)

# Split the categorical columns by imputation strategy: the columns listed in
# mode_cols get the most frequent value, all others get a "Missing" category
mode_cols = ["Transmission", "IsOnlineSale"]
cat_mode_cols = [col for col in categorical_features if col in mode_cols]
cat_missing_cols = [col for col in categorical_features if col not in mode_cols]

# Numeric columns: fill gaps with the column median
numeric_transformer = SimpleImputer(strategy="median")

# Categorical columns (mode group): most-frequent imputation, then one-hot
# encoding; categories unseen during fit are ignored at transform time
categorical_transformer_mode = Pipeline(
    steps=[
        ("imputer_mode", SimpleImputer(strategy="most_frequent")),
        ("encoder_mode", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Categorical columns (remaining group): constant "Missing" imputation, then
# one-hot encoding
categorical_transformer_missing = Pipeline(
    steps=[
        ("imputer_missing", SimpleImputer(strategy="constant", fill_value="Missing")),
        ("encoder_missing", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Datetime columns are passed through unchanged; the output is a pandas
# DataFrame with un-prefixed feature names
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat_mode", categorical_transformer_mode, cat_mode_cols),
        ("cat_missing", categorical_transformer_missing, cat_missing_cols),
        ("date", "passthrough", datetime_features),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# Fit the preprocessor on the training data only, then apply it to the test data
features_train_preprocessed = preprocessor.fit_transform(features_train_clean)
features_test_preprocessed = preprocessor.transform(features_test_clean)

# Compare the number of missing values before and after imputation
print("Fehlende Werte vor Preprocessing", features_train_clean.isna().sum().sum())
print("Fehlende Werte nach Preprocessing", features_train_preprocessed.isna().sum().sum())

In [None]:
# Resample the training set with RandomUnderSampler (fixed seed); only the
# training data is resampled in this cell
resampler = RandomUnderSampler(random_state=42)
features_train_resampled, target_train_resampled = resampler.fit_resample(
    features_train_preprocessed,
    target_train,
)

# Class ratios of the target before and after resampling
for header, target_part in (
    ("Verteilung des Targets vor Resampling:", target_train),
    ("\nVerteilung des Targets nach Resampling:", target_train_resampled),
):
    print(header)
    print(target_part.value_counts(normalize=True))