In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# 1. Load the combined fuel dataset from CSV
df = pd.read_csv("../data/fuel_combined.csv")

# 2. Target variable: the consumption column
y = df["consumption_l_per_100km"]

# 3. Raw (not yet encoded) feature matrix.
# The column names are kept in two lists so the categorical ones can be
# handed to the encoder below without repeating them.
categorical_columns = ["brand", "vehicle_class", "fuel_type"]
numeric_columns = ["engine_size_l", "cylinders", "year"]
X = df[categorical_columns + numeric_columns]

# 4. One-hot encoding of the categorical columns; drop_first=True drops the
# first dummy column of each encoded variable.
X_encoded = pd.get_dummies(
    X,
    columns=categorical_columns,
    drop_first=True,
)

# 5. Helper for looking up dummy columns
def get_encoded_features(prefix, df):
    """Return the column names of ``df`` that start with ``prefix + "_"``.

    Args:
        prefix: Name prefix to look for; the trailing underscore is added
            here, so pass e.g. ``"brand"`` rather than ``"brand_"``.
        df: Object exposing an iterable ``columns`` attribute of strings.

    Returns:
        A list of the matching column names, in the order they appear in
        ``df.columns``. Empty if nothing matches.
    """
    return list(
        filter(lambda column: column.startswith(prefix + "_"), df.columns)
    )

# 6. Feature selection as in iteration 6:
# the three numeric columns first, then the dummy columns grouped by their
# source variable in the order fuel_type, vehicle_class, brand.
features = ["engine_size_l", "cylinders", "year"] + [
    column
    for prefix in ("fuel_type", "vehicle_class", "brand")
    for column in get_encoded_features(prefix, X_encoded)
]

# 7. Train/test split on the selected features
# (20 % held out for testing; fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded[features],
    y,
    test_size=0.2,
    random_state=42,
)

# 8. Model 1: linear regression, fitted on the training split
# (fit() returns the estimator itself, so construction and fitting are chained)
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# 9. Model 2: random forest with a fixed seed, fitted on the same split
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# 10. Evaluation
# Predict on the training split as well, so that each model's train R² can be
# printed next to its test R² for comparison.
y_pred_train_lr = lr.predict(X_train)
y_pred_train_rf = rf.predict(X_train)

# Linear regression: test R², train R², test MSE
print("LINEAR REGRESSION")
print("R²:", r2_score(y_true=y_test, y_pred=y_pred_lr))
print("R² TRAIN (Linear):", r2_score(y_true=y_train, y_pred=y_pred_train_lr))
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred_lr))


# Random forest: same three metrics in the same order
print("RANDOM FOREST")
print("R²:", r2_score(y_true=y_test, y_pred=y_pred_rf))
print("R² TRAIN:", r2_score(y_true=y_train, y_pred=y_pred_train_rf))
print("MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred_rf))


# 11. Save the best model
# Pick the model by its test-set R² instead of hard-coding one, so the file
# written as "best_model.pkl" always agrees with the evaluation, even after the
# data or the models change. On a tie the random forest is kept, which matches
# the previous hard-coded choice.
best_model = (
    rf
    if r2_score(y_test, y_pred_rf) >= r2_score(y_test, y_pred_lr)
    else lr
)
joblib.dump(best_model, "../models/best_model.pkl")
print("Bestes Modell gespeichert: best_model.pkl")



LINEAR REGRESSION
R²: 0.7863592927485754
R² TRAIN (Linear): 0.7941341743801263
MSE: 2.0563568161652617
RANDOM FOREST
R²: 0.9021406753538596
R² TRAIN: 0.951886497203305
MSE: 0.9419257773968925
Bestes Modell gespeichert: best_model.pkl
