In [None]:
# Load libraries

# Standard library
from pathlib import Path

# Data manipulation
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Data loading
from core.data import load_competition_from_kaggle

In [None]:
# Download the Kaggle competition files into the raw-data folder
competition_name = "DontGetKicked"  # Kaggle competition name
destination = "../data/raw"

download_args = {"competition_name": competition_name, "destination": destination}
files = load_competition_from_kaggle(**download_args)

# Display what the loader returned
files

In [None]:
# First look at the training data.
# The path is built from `destination` (set in the download cell) instead of repeating the
# "../data/raw/" literal, whose trailing slash produced a doubled "//" in the joined path
# and which could silently drift away from the download location.
# NOTE(review): index 4 is presumably the training CSV in the list returned by the loader —
# confirm, or select the entry by file name so a changed ordering cannot load the wrong file.
train_path = f"{destination}/{competition_name}/{files[4]}"
df_train = pd.read_csv(train_path)
df_train.head()

In [None]:
# Dimensions, summary statistics and number of fully duplicated rows
display(
    "Shape",
    df_train.shape,
    "Description",
    df_train.describe().round(2).T,
    "Duplicates",
    df_train.duplicated().sum(),
)

# Per-column overview: data type, missing values, unique values and a few example values.
# The examples are drawn with a fixed seed so the table is identical on every run (with the
# same seed each column shows the same rows), and the sample size is capped at the row count
# so a frame with fewer than three rows does not raise.
n_examples = min(3, len(df_train))
pd.DataFrame(
    {
        "Data Types": df_train.dtypes,
        "Missing Values": df_train.isnull().sum(),
        "Unique Values": df_train.nunique(),
        "Sample Values": [
            df_train[col].sample(n_examples, random_state=42).tolist()
            for col in df_train.columns
        ],
    }
)

In [None]:
# Distribution of the categorical features, split by the target variable
categorical_features = [
    "Auction",
    "Transmission",
    "WheelTypeID",
    "WheelType",
    "Nationality",
    "TopThreeAmericanName",
    "PRIMEUNIT",
    "AUCGUART",
    "IsOnlineSale",
]
palette = ["#009292", "#074650"]  # colours for the IsBadBuy hue levels

for feature in categorical_features:
    # Treat missing values as a category of their own
    values = df_train[feature].fillna("Missing")

    # Cross-tabulation of category vs. target
    print(pd.crosstab(values, df_train["IsBadBuy"]))

    # Bar chart; bars follow the value_counts() order of the categories
    fig, ax = plt.subplots(figsize=(16, 6))
    sns.countplot(
        x=values,
        data=df_train,
        hue="IsBadBuy",
        stat="proportion",
        order=values.value_counts().index,
        palette=palette,
        ax=ax,
    )
    ax.set_title(f"Distribution of {feature}")
    ax.set_xlabel(None)
    ax.set_ylabel("Proportion")
    plt.show()

In [None]:
# Distribution of the high-cardinality categorical features, split by the target variable.
# Only the `top_n` most frequent categories of each feature are tabulated and plotted.
high_cardinality_categorical_features = ["Make", "Color", "Size", "VNST", "Model", "Trim", "SubModel", "BYRNO", "VNZIP1"]
top_n = 10
palette = ["#009292", "#074650"]  # loop-invariant, so defined once

# BYRNO and VNZIP1 are cast to strings so they are handled as labels rather than numbers.
# A plain astype(str) would turn NaN into the literal "nan", which the fillna("Missing")
# below could no longer catch; .where(...) restores NaN for the originally missing entries.
# Note: this overwrites the columns in df_train, so later cells see them as strings.
for label_column in ["BYRNO", "VNZIP1"]:
    df_train[label_column] = df_train[label_column].astype(str).where(df_train[label_column].notna())

for categorical_feature in high_cardinality_categorical_features:
    # Treat missing values as a category of their own, then keep only the top_n categories
    df_col = df_train[categorical_feature].fillna("Missing")
    top_n_categories = df_col.value_counts().head(top_n).index
    df_col = df_col[df_col.isin(top_n_categories)]

    # Cross-tabulation of category vs. target (rows are matched to df_train by index)
    print(pd.crosstab(df_col, df_train["IsBadBuy"]))

    # Bar chart; bars follow the value_counts() order of the remaining categories
    plt.figure(figsize=(16, 6))
    sns.countplot(x=df_col, data=df_train, hue="IsBadBuy", stat="proportion", order=df_col.value_counts().index, palette=palette)
    plt.title(f"Distribution of {categorical_feature}")
    plt.xlabel(None)
    plt.ylabel("Proportion")
    plt.show()

In [None]:
# Distribution of the numerical features, split by the target variable
numerical_features = [
    "MMRAcquisitionAuctionAveragePrice",
    "MMRAcquisitionAuctionCleanPrice",
    "MMRAcquisitionRetailAveragePrice",
    "MMRAcquisitonRetailCleanPrice",
    "MMRCurrentAuctionAveragePrice",
    "MMRCurrentAuctionCleanPrice",
    "MMRCurrentRetailAveragePrice",
    "MMRCurrentRetailCleanPrice",
    "VehBCost",
    "WarrantyCost",
    "VehOdo",
    "VehicleAge",
    "IsBadBuy",
]
palette = ["#009292", "#074650"]

# The last list entry (the target) is only used for grouping, so it gets no figure of its own
for feature in numerical_features[:-1]:
    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(16, 4))

    # Left panel: histogram with KDE over all rows
    sns.histplot(x=df_train[feature], kde=True, ax=ax_hist, color=palette[0])
    ax_hist.set_title(f"Histogram of {feature}")

    # Right panel: one box per IsBadBuy value
    sns.boxplot(data=df_train, x="IsBadBuy", y=feature, hue="IsBadBuy", ax=ax_box, palette=palette)
    ax_box.set_title(f"Boxplot of {feature}")

    plt.show()

In [None]:
# Pairwise correlations between the numeric columns
corr_matrix = df_train.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap="viridis", center=0, fmt=".2f", ax=ax);

In [None]:
# Class balance of the dataset: relative frequency of each IsBadBuy value
target_share = df_train["IsBadBuy"].value_counts(normalize=True)
print(target_share)

In [None]:
# Save the DataFrame as CSV
destination_path = "../data/processed"
# to_csv does not create missing directories, so make sure the target folder exists first
# (a no-op when it is already there).
Path(destination_path).mkdir(parents=True, exist_ok=True)
df_train.to_csv(f"{destination_path}/eda_data.csv", index=False)