In [24]:
# 02_features.ipynb – Feature Engineering

In [25]:
# 1. Bibliotheken
import pandas as pd
import numpy as np

In [26]:
# 2. Load the model input table and report its (rows, columns) shape
model_input_path = "../data/model_input.csv"
df = pd.read_csv(model_input_path)
print(df.shape)


(6274, 12)


In [27]:
# 3. Vehicle segment mapping
# The class -> segment assignment is based on Wikipedia:
# https://de.wikipedia.org/wiki/Fahrzeugklasse
# It is kept in an external file, vehicle_segment.csv (adjust the relative path if needed).
mapping_df = pd.read_csv("../data/vehicle_segment.csv")

# Build a plain dict lookup: 'Vehicle Class' value -> 'Segment' value
segment_map = mapping_df.set_index('Vehicle Class')['Segment'].to_dict()

# Apply the lookup; classes missing from the mapping become NaN
df['Vehicle Segment'] = df['Vehicle Class'].map(segment_map)

# Completeness check: count the rows that did not receive a segment
unmapped = df['Vehicle Segment'].isna().sum()
print(f"Unzugeordnete Fahrzeugklassen: {unmapped}")


Unzugeordnete Fahrzeugklassen: 0


In [28]:
# 4. Feature engineering: ratio of city to highway fuel consumption
city_consumption = df['Fuel Consumption City (L/100 km)']
highway_consumption = df['Fuel Consumption Hwy (L/100 km)']
df['consumption_ratio'] = city_consumption.div(highway_consumption)

# Number of gears: the first run of one or two digits in the transmission code
# (codes look like 'A6', 'M5', 'AV').
# NOTE(review): codes without a digit (e.g. 'AV') do not match and end up as NaN,
# and value_counts() leaves NaN out of its tally by default - confirm how the
# downstream steps handle the missing gear counts.
gear_digits = df['Transmission'].str.extract(r'(\d{1,2})', expand=False)
df['gear_count'] = gear_digits.astype(float)

# Quick sanity check of both new columns
for summary in (df['gear_count'].value_counts(), df['consumption_ratio'].describe()):
    print(summary)



gear_count
6.0     2784
8.0     1513
7.0      880
9.0      329
5.0      276
10.0     188
4.0       63
Name: count, dtype: int64
count    6274.000000
mean        1.382992
std         0.125396
min         0.857143
25%         1.315068
50%         1.384615
75%         1.456790
max         1.821053
Name: consumption_ratio, dtype: float64


In [29]:
# One-hot encode the derived vehicle segments, keeping one indicator column per segment.
# Note: the encoding replaces the 'Vehicle Segment' column, so this cell cannot be
# re-run on its own without re-creating that column first.
segment_columns = ['Vehicle Segment']
df = pd.get_dummies(df, drop_first=False, columns=segment_columns)

# Show the names of the generated indicator columns
segment_dummies = df.filter(like='Vehicle Segment_')
print(segment_dummies.columns)



Index(['Vehicle Segment_Kleinstwagen', 'Vehicle Segment_Kleinwagen',
       'Vehicle Segment_Kompaktklasse', 'Vehicle Segment_Kompaktklasse Kombi',
       'Vehicle Segment_Mittelklasse', 'Vehicle Segment_Mittelklasse Kombi',
       'Vehicle Segment_Nutzfahrzeug', 'Vehicle Segment_Nutzfahrzeug leicht',
       'Vehicle Segment_Oberklasse', 'Vehicle Segment_SUV gross',
       'Vehicle Segment_SUV kompakt', 'Vehicle Segment_Spezialfahrzeug',
       'Vehicle Segment_Sportwagen', 'Vehicle Segment_Van / MPV'],
      dtype='object')


In [30]:
# 5. One-hot encode the fuel type; drop_first=True omits the indicator of the
# first category, which then acts as the implicit baseline.
fuel_columns = ['Fuel Type']
df = pd.get_dummies(df, drop_first=True, columns=fuel_columns)

# Show the names of the generated indicator columns
fuel_dummies = df.filter(like='Fuel Type_')
print(fuel_dummies.columns)


Index(['Fuel Type_E', 'Fuel Type_N', 'Fuel Type_X', 'Fuel Type_Z'], dtype='object')


In [31]:
# 6. Binary feature: 1 when the transmission code contains the letter 'A', otherwise 0
has_letter_a = df['Transmission'].str.contains('A', regex=False)
df['is_automatic'] = has_letter_a.astype('int64')

# Add the complementary is_manual flag (unless such a column already exists) so that
# the feature list assembled later is complete
if 'is_manual' not in df:
    df['is_manual'] = 1 - df['is_automatic']

In [32]:
# 7. Define the target column and the predictor columns
target = 'Fuel Consumption Comb (L/100 km)'

# Hand-picked numeric and binary predictors
base_features = [
    'Engine Size(L)',
    'Cylinders',
    'is_automatic',
    'is_manual',
    'gear_count',
    'consumption_ratio',
]

# Every one-hot column produced by the encoding cells, in DataFrame column order
encoded_prefixes = ('Vehicle Segment_', 'Fuel Type_')
encoded_features = [name for name in df.columns if name.startswith(encoded_prefixes)]

features = base_features + encoded_features

In [33]:
# 8. Output: list the final feature names and the shapes of X and y
print("Finale Features für das Modell:")
print(features)

x_shape = df[features].shape
y_shape = df[target].shape
print("\nShape (X):", x_shape, "| Shape (y):", y_shape)


Finale Features für das Modell:
['Engine Size(L)', 'Cylinders', 'is_automatic', 'is_manual', 'gear_count', 'consumption_ratio', 'Vehicle Segment_Kleinstwagen', 'Vehicle Segment_Kleinwagen', 'Vehicle Segment_Kompaktklasse', 'Vehicle Segment_Kompaktklasse Kombi', 'Vehicle Segment_Mittelklasse', 'Vehicle Segment_Mittelklasse Kombi', 'Vehicle Segment_Nutzfahrzeug', 'Vehicle Segment_Nutzfahrzeug leicht', 'Vehicle Segment_Oberklasse', 'Vehicle Segment_SUV gross', 'Vehicle Segment_SUV kompakt', 'Vehicle Segment_Spezialfahrzeug', 'Vehicle Segment_Sportwagen', 'Vehicle Segment_Van / MPV', 'Fuel Type_E', 'Fuel Type_N', 'Fuel Type_X', 'Fuel Type_Z']

Shape (X): (6274, 24) | Shape (y): (6274,)


In [34]:
# Persist the final feature matrix and the target column as CSV files (row index omitted)
X, y = df[features], df[target]
for table, output_path in ((X, '../data/X_features.csv'), (y, '../data/y_target.csv')):
    table.to_csv(output_path, index=False)
