In [4]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
import pandas as pd
import numpy as np

## Daten einlesen

In [4]:

# Load the cleaned training data set
data_cleaned = "../data/cleaned_train.csv"
data = pd.read_csv(
    data_cleaned,
    sep=",",
    encoding="latin",
    header=0,
    thousands=",",
    decimal='.',
    low_memory=False,
)

# Separate the target column from the feature columns
y = data['Sales']
X = data.drop(columns='Sales')

# Column groups consumed by the preprocessing step further down.
# Numerical features:
numerical_features = ['year', 'month', 'day', 'week_of_year', 'lag_1', 'lag_7']

# Features that are already encoded:
already_encoded_features = ['Open', 'Promo', 'promo2']

# Categorical features that still have to be encoded:
categorical_features_to_encode = ['Store', 'DayOfWeek', 'StoreType', 'StateHoliday','Assortment']


## Split

In [5]:

# 1. Hold out 20% of the rows as test data (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Write each part of the split to its own CSV file, without the index column
for split_part, csv_name in (
    (X_train, 'X_train.csv'),
    (X_test, 'X_test.csv'),
    (y_train, 'y_train.csv'),
    (y_test, 'y_test.csv'),
):
    split_part.to_csv(csv_name, index=False)


## Metrik

In [7]:

# Adjusted RMSPE that ignores observations whose true value is 0
def rmspe(y_true, y_pred):
    """Root mean squared percentage error over the non-zero true values.

    Both inputs are converted to flat NumPy float arrays before any
    indexing, so they are compared strictly by position. This makes the
    function work for lists, NumPy arrays and pandas objects alike, and
    avoids index-alignment errors when ``y_true`` and ``y_pred`` are
    pandas objects carrying different indexes.

    Parameters
    ----------
    y_true : array-like
        Observed target values. Entries equal to 0 are skipped because
        the percentage error is undefined for them.
    y_pred : array-like
        Predicted values, in the same order and of the same length as
        ``y_true``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))`` over the entries
        with ``y_true != 0``; NaN when no such entry exists.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    # Only keep positions where the true value is non-zero
    nonzero = actual != 0
    if not nonzero.any():
        # Nothing to average over: report NaN explicitly instead of
        # relying on np.mean of an empty array (which also warns).
        return np.float64(np.nan)

    relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
    return np.sqrt(np.mean(relative_error ** 2))

In [8]:
# Expose RMSPE as a scikit-learn scorer; it is an error measure, so lower is better
rmspe_scorer = make_scorer(score_func=rmspe, greater_is_better=False)

## Pre-processing

In [9]:

# Build the preprocessor for numerical and categorical features (no date extraction).
# Each feature group is routed through exactly one transformer. The already
# encoded features are NOT added to the scaler as well, because they would then
# appear twice in the output (once scaled, once passed through), giving the
# linear model perfectly collinear columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),  # scale the numerical features only
        ('enc', 'passthrough', already_encoded_features),  # already encoded features pass through untransformed
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_to_encode)  # one-hot encode the remaining categorical features
    ])


## Lineare Regression

In [10]:

# Linear-regression pipeline: preprocessing first, then the estimator
pipeline_lr = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', LinearRegression()),
    ]
)
# Fit on the full training split; this fitted pipeline is what later cells predict with
pipeline_lr.fit(X_train, y_train)


In [11]:
# Shuffled 5-fold splitter with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Score the linear-regression pipeline on every fold with the RMSPE scorer
cv_scores = cross_val_score(
    estimator=pipeline_lr,
    X=X_train,
    y=y_train,
    scoring=rmspe_scorer,
    cv=kf,
    verbose=True,
)

In [12]:
# Average of the per-fold scores. The scorer was created with
# greater_is_better=False, so the value shown here is the negated RMSPE.
mean_rmspe = cv_scores.mean()
mean_rmspe

-0.31080392127400464

## Random Forest

In [None]:
# Random-forest pipeline: same preprocessing, followed by a random-forest regressor.
# The forest is seeded with the same value as the train/test split and the
# K-fold splitter, so repeated runs give the same model and scores.
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])
pipeline_rf.fit(X_train, y_train)

In [None]:
# Cross-validate the random-forest pipeline with the same folds and scorer
cv_scores = cross_val_score(
    estimator=pipeline_rf,
    X=X_train,
    y=y_train,
    scoring=rmspe_scorer,
    cv=kf,
    verbose=True,
)

NameError: name 'cross_val_score' is not defined

In [None]:
# Average of the per-fold scores currently held in cv_scores
mean_rmspe = cv_scores.mean()
mean_rmspe

## Evaluation auf den Testdaten

In [None]:
# 3. Predict on the held-out test set with the fitted linear-regression pipeline
y_pred = pipeline_lr.predict(X_test)

# RMSPE of those predictions against the true test targets
test_rmspe = rmspe(y_true=y_test, y_pred=y_pred)
test_rmspe

0.2547090768508728

In [8]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer
import numpy as np

# Adjusted RMSPE that ignores observations whose true value is 0
def rmspe(y_true, y_pred):
    """Root mean squared percentage error over the non-zero true values.

    Both inputs are converted to flat NumPy float arrays before any
    indexing, so they are compared strictly by position. This makes the
    function work for lists, NumPy arrays and pandas objects alike, and
    avoids index-alignment errors when ``y_true`` and ``y_pred`` are
    pandas objects carrying different indexes.

    Parameters
    ----------
    y_true : array-like
        Observed target values. Entries equal to 0 are skipped because
        the percentage error is undefined for them.
    y_pred : array-like
        Predicted values, in the same order and of the same length as
        ``y_true``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))`` over the entries
        with ``y_true != 0``; NaN when no such entry exists.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    # Only keep positions where the true value is non-zero
    nonzero = actual != 0
    if not nonzero.any():
        # Nothing to average over: report NaN explicitly instead of
        # relying on np.mean of an empty array (which also warns).
        return np.float64(np.nan)

    relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
    return np.sqrt(np.mean(relative_error ** 2))

# Expose RMSPE as a scikit-learn scorer; it is an error measure, so lower is better
rmspe_scorer = make_scorer(score_func=rmspe, greater_is_better=False)

# Column-wise preprocessing for the numerical and categorical features
preprocessor = ColumnTransformer(
    [
        # scale the numerical features
        ('num', StandardScaler(), numerical_features),
        # already encoded features pass through untransformed
        ('enc', 'passthrough', already_encoded_features),
        # one-hot encode the features that are not encoded yet
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_to_encode),
    ]
)

# KNN pipeline: preprocessing followed by a k-nearest-neighbours regressor (k=5)
pipeline_knn = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', KNeighborsRegressor(n_neighbors=5)),
    ]
)

# Fit the pipeline on the training split
pipeline_knn.fit(X_train, y_train)

# Shuffled 5-fold splitter with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Score the KNN pipeline on every fold with the RMSPE scorer
cv_scores_knn = cross_val_score(
    estimator=pipeline_knn,
    X=X_train,
    y=y_train,
    scoring=rmspe_scorer,
    cv=kf,
    verbose=True,
)

# Mean RMSPE across the folds.
# rmspe_scorer is built with greater_is_better=False, so cross_val_score
# returns the negated RMSPE for each fold; flip the sign back so the printed
# number is the actual (non-negative) error.
mean_rmspe_knn = -np.mean(cv_scores_knn)
print(f'Mean RMSPE for KNN: {mean_rmspe_knn}')