In [11]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
import pandas as pd
import numpy as np

## Daten einlesen

In [12]:

# Laden des Datensatzes
data_cleaned = "../data/cleaned_train_alt.csv"
data = pd.read_csv(data_cleaned, delimiter=",", encoding="latin", header=0, thousands=",", decimal='.', low_memory=False)

# Zielvariable und Features definieren
X = data.drop('Sales', axis=1)
y = data['Sales']

# Definiere die numerischen und kategorischen Features
numerical_features = ['year', 'month', 'day', 'week_of_year', 'lag_1', 'lag_7']

# Bereits encodierte Features
already_encoded_features = ['Open', 'Promo', 'promo2']

# Noch nicht encodierte kategorische Features
categorical_features_to_encode = ['Store', 'DayOfWeek', 'StoreType', 'StateHoliday','Assortment']


## Split

In [13]:

# 1. Datensatz aufteilen in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Metrik

In [14]:
# Angepasste RMSPE-Funktion, die Tage mit 0 Sales ignoriert
def rmspe(y_true, y_pred):
    # Nur Fälle berücksichtigen, bei denen y_true nicht 0 ist
    mask = y_true != 0
    y_true_filtered = y_true[mask]
    y_pred_filtered = y_pred[mask]
    
    return np.sqrt(np.mean(((y_true_filtered - y_pred_filtered) / y_true_filtered) ** 2))

# RMSPE als Scorer definieren
rmspe_scorer = make_scorer(rmspe, greater_is_better=False)

## Pre-Processing

In [15]:
# Erstelle den Preprocessor für numerische und kategorische Features (ohne Datumsextraktion)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features + already_encoded_features),  # Skalierung für numerische und bereits encodierte Features
        ('enc', 'passthrough', already_encoded_features),  # Bereits encodierte Features durchschleusen (keine weitere Transformation)
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_to_encode)  # Nur noch nicht encodierte Features encodieren
    ])


## Random Forest mit Pre-Processing Daten

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, train_test_split

# Erstelle die Pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=40, random_state=42))
])

# Optional: Cross-Validation mit RMSPE als Scorer
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring=rmspe_scorer)

print(f"RMSPE CV-Scores: {cv_scores}")
print(f"Mean RMSPE: {-np.mean(cv_scores)}")  # Negative Werte, da greater_is_better=False

RMSPE CV-Scores: [-0.16908628 -0.15583158 -0.30281814 -0.16420855 -0.24732529]
Mean RMSPE: 0.20785396741046566


### Evaluation

In [17]:
# Trainiere das Modell
pipeline.fit(X_train, y_train)

# Vorhersagen auf dem Testdatensatz
y_pred = pipeline.predict(X_test)

# RMSPE auf dem Testdatensatz berechnen
rmspe_score = rmspe(y_test, y_pred)
print(f"RMSPE auf dem Testdatensatz: {rmspe_score}")

RMSPE auf dem Testdatensatz: 0.14426840086304352


In [None]:
estimators = [10, 50, 100, 200, 300, 400, 500]
for e in estimators:
    # Erstelle die Pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(n_estimators=e, random_state=42))
    ])

    # Optional: Cross-Validation mit RMSPE als Scorer
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring=rmspe_scorer)
    print(f"n_estimators: {e}")
    print(f"RMSPE CV-Scores: {cv_scores}")
    print(f"Mean RMSPE: {-np.mean(cv_scores)}")  # Negative Werte, da greater_is_better=False
    # Trainiere das Modell
    pipeline.fit(X_train, y_train)

    # Vorhersagen auf dem Testdatensatz
    y_pred = pipeline.predict(X_test)

    # RMSPE auf dem Testdatensatz berechnen
    rmspe_score = rmspe(y_test, y_pred)
    print(f"RMSPE auf dem Testdatensatz: {rmspe_score}")

    # Ausgabe der Ergebnisse
    print("")


## Random Forest

In [8]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# RMSPE-Funktion
def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

# Datensatz laden
data = pd.read_csv('/Users/marieernst/Documents/Master/2.Semester/DataMining/Projekt2/data/cleaned_train.csv')

# Datum in datetime umwandeln
data['Date'] = pd.to_datetime(data['Date'])

# Durchschnittlicher Verkauf der letzten 7 Tage
data = data.sort_values(by=['Store', 'Date'])
data['Sales_Lag_7'] = data.groupby('Store')['Sales'].shift(7)
data['Sales_Rolling_7'] = data.groupby('Store')['Sales_Lag_7'].transform(lambda x: x.rolling(7).mean())

# Feature Engineering
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day
data['DayOfWeek'] = data['Date'].dt.dayofweek

# NaN-Werte füllen
data.fillna(0, inplace=True)

# Relevante Features definieren
features = ['Store', 'DayOfWeek', 'Promo', 'Year', 'Month', 'Day', 'Sales_Rolling_7']
X = data[features]
y = data['Sales']

# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modelltraining
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Vorhersagen
predictions = model.predict(X_test)

# Evaluation
error = rmspe(y_test, predictions)
print(f'Root Mean Square Percentage Error (RMSPE): {error}')


Root Mean Square Percentage Error (RMSPE): inf
