In [None]:
# Laden der Bibliotheken

# Datenmanipulation
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Imbalanced-learn
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Vorverarbeitungsfunktionen
from core.data import clean_data

# Split


**Train-Test-Split**


In [None]:
# Read the cleaned data set from the processed-data directory.
destination_path = "../data/processed"
eda_csv_path = f"{destination_path}/eda_data.csv"
df = pd.read_csv(eda_csv_path)

In [None]:
# Separate the label column from the predictor columns.
# target_col is a list, so `target` stays a single-column DataFrame.
target_col = ["IsBadBuy"]
target = df[target_col]
features = df.drop(columns=target_col)

# Hold out 10 % of the rows as test data; stratifying on the target keeps the
# class proportions comparable in both partitions, and the fixed seed makes
# the split reproducible.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.1,
    stratify=target,
    random_state=42,
)

In [None]:
# Sanity-check the split: report the shape of both feature partitions ...
for shape_label, feature_frame in (
    ("Dimensionen der Trainingsdaten (Features):", features_train),
    ("Dimensionen der Testdaten (Features):", features_test),
):
    print(shape_label, feature_frame.shape)

# ... and the relative class frequencies of the target in each partition.
for heading, target_frame in (
    ("\nVerteilung des Targets im Trainings-Set:", target_train),
    ("\nVerteilung des Targets im Test-Set:", target_test),
):
    print(heading)
    print(target_frame.value_counts(normalize=True))

# Data Preparation


**Datatype Transformation**


In [None]:
# Run the project's cleaning / dtype-conversion helper on the training and
# the test features (training first, then test).
features_train_clean, features_test_clean = (
    clean_data(feature_frame) for feature_frame in (features_train, features_test)
)

# Put the column dtypes before and after cleaning side by side.
# NOTE(review): dtypes_before is read after clean_data has run; if clean_data
# modifies its argument in place, both columns would show the same dtypes —
# confirm that it returns a new frame.
dtypes_before = features_train.dtypes
dtypes_after = features_train_clean.dtypes
dtype_comparison = pd.DataFrame(
    {
        "dtypes_before": dtypes_before,
        "dtypes_after": dtypes_after,
    }
)
display(dtype_comparison)

**Data Imputation**


In [None]:
# Group the cleaned training columns by dtype: numeric, object (treated as
# categorical) and datetime.
select_by_dtype = features_train_clean.select_dtypes
numerical_features = select_by_dtype(include=np.number).columns.tolist()
categorical_features = select_by_dtype(include="object").columns.tolist()
datetime_features = select_by_dtype(include="datetime").columns.tolist()

# Categorical columns listed in mode_cols are filled with their most frequent
# value; every other categorical column gets the placeholder "Unknown".
mode_cols = ["Transmission", "IsOnlineSale"]
cat_mode_cols = [name for name in categorical_features if name in mode_cols]
cat_unknown_cols = [name for name in categorical_features if name not in mode_cols]

# One imputation rule per column group. Datetime columns are passed through
# unchanged; columns that belong to no group are dropped (remainder="drop").
imputation_steps = [
    ("num", SimpleImputer(strategy="median"), numerical_features),
    ("cat_unknown", SimpleImputer(strategy="constant", fill_value="Unknown"), cat_unknown_cols),
    ("cat_mode", SimpleImputer(strategy="most_frequent"), cat_mode_cols),
    ("date", "passthrough", datetime_features),
]
# set_output returns the transformer itself, so the call can be chained;
# transform="pandas" makes the transformer return DataFrames with the
# original (unprefixed) column names.
preprocessor_impute = ColumnTransformer(
    transformers=imputation_steps,
    remainder="drop",
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# Learn the fill values on the training data only, then apply them to the
# test data.
features_train_impute = preprocessor_impute.fit_transform(features_train_clean)
features_test_impute = preprocessor_impute.transform(features_test_clean)

# Compare the number of missing values per column before and after imputation.
missing_before = features_train_clean.isna().sum()
missing_after = features_train_impute.isna().sum()
missing_comparison = pd.DataFrame(
    {
        "missing_before": missing_before,
        "missing_after": missing_after,
    }
)
display(missing_comparison)

**Deal with Outliers**


In [None]:
# The numeric values look plausible, so no outlier treatment is applied.

**Resample**

In [None]:
# Instantiate the sampler; the fixed seed makes the random under-sampling
# reproducible.
resampler = RandomUnderSampler(random_state=42)

# Only the (imputed) training data is resampled; the test data is left as is.
features_train_resampled, target_train_resampled = resampler.fit_resample(features_train_impute, target_train)

# Show the class proportions of the training target before and after
# resampling. The second heading previously read "Test-Set" although the
# values printed are those of the resampled training target.
print("Verteilung des Targets im Trainings-Set:")
print(target_train.value_counts(normalize=True), "\n")
print("Verteilung des Targets im Trainings-Set nach dem Resampling:")
print(target_train_resampled.value_counts(normalize=True))

In [None]:
# Write the training and test partitions to CSV files (row index omitted).
# NOTE(review): the frames written here are the ones returned by
# train_test_split; the cleaned, imputed and resampled variants built above
# are not persisted — confirm that this is intended.
for split_frame, csv_path in (
    (features_train, f"{destination_path}/features_train.csv"),
    (features_test, f"{destination_path}/features_test.csv"),
    (target_train, f"{destination_path}/target_train.csv"),
    (target_test, f"{destination_path}/target_test.csv"),
):
    split_frame.to_csv(csv_path, index=False)