In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from itertools import product
import numpy as np

In [2]:
# Load the three arrays that define the train / validation / test partition;
# their values are later matched against the 'filiale_baeckerei' column.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code -- only load these .npy files from a trusted source.
train_indices, val_indices, test_indices = (
    np.load(split_file, allow_pickle=True)
    for split_file in ('train_combinations.npy', 'val_combinations.npy', 'test_combinations.npy')
)

**No Shift**

In [3]:
# Location of the "no shift" forecasting CSV, relative to the notebook
datei_pfad = '../data/no_shift_forecasting.csv'
# Read the whole file into a DataFrame with pandas' default parsing options
df_no = pd.read_csv(filepath_or_buffer=datei_pfad)

In [4]:
# Build the train, validation and test sets by keeping the rows whose
# 'filiale_baeckerei' value is listed in the corresponding index array
train_df_no, val_df_no, test_df_no = (
    df_no.loc[df_no['filiale_baeckerei'].isin(split_ids)]
    for split_ids in (train_indices, val_indices, test_indices)
)


In [5]:
# Report the number of rows in each split as a quick sanity check
print("Length of each Set:")
print("train set:", train_df_no.shape[0])
print("val set:", val_df_no.shape[0])
print("test set:", test_df_no.shape[0])

Length of each Set:
train set: 35375
val set: 7860
test set: 7790


In [6]:
# Select the model inputs and the prediction target
feature_columns = ['revenue', 'revenue-1', 'revenue-2', 'revenue-3', 'revenuePY', 'revenuePY-1', 'revenuePY-2', 'revenuePY-3',
                   'revenuePY+2', 'revenuePY+1', 'carnival', 'easter', 'ascension_day', 'whitsunday', 'christmas', 'new_year']
target_column = 'revenue+2'

# Extract features and target as NumPy arrays for the training split
X_train_no = train_df_no[feature_columns].values
y_train_no = train_df_no[target_column].values

# ... for the validation split
X_val_no = val_df_no[feature_columns].values
y_val_no = val_df_no[target_column].values

# ... and for the test split
X_test_no = test_df_no[feature_columns].values
y_test_no = test_df_no[target_column].values

# Hyperparameter grid. The keys are RandomForestRegressor keyword arguments and
# their order fixes the order of the values inside every combination tuple.
param_grid = {
    'n_estimators': [100, 300, 500, 1000],  # number of trees in the forest
    'max_features': ['sqrt', 'log2', None],
    'min_samples_leaf': [5, 10, 20],
    'max_depth': [None, 10, 20, 30],
    'max_samples': [0.6, 0.8, 1.0]
}

# Cartesian product of all grid values: one tuple per candidate configuration
combinations = list(product(*param_grid.values()))
total_combinations = len(combinations)

# Track the best model so far; "best" means lowest validation MRE
best_model_no = None
best_mre_no = float('inf')
best_params_no = {}

# Manual grid search: fit on the train set, score on the validation set
for i, combination in enumerate(combinations, start=1):
    # Pair each value with its parameter name so the names are spelled only once
    # (in param_grid) instead of being repeated in the loop, the print and the dict
    params_no = dict(zip(param_grid, combination))
    print(f"Testing combination {i}/{total_combinations}: "
          + ", ".join(f"{name}={value}" for name, value in params_no.items()))

    # Fixed random_state keeps every fit reproducible; n_jobs=-1 uses all cores
    model_no = RandomForestRegressor(**params_no, random_state=42, n_jobs=-1)
    model_no.fit(X_train_no, y_train_no)

    # Mean Relative Error (MRE) on the validation set.
    # A zero in y_val_no makes this inf/NaN, which never compares as "better" below.
    y_val_pred_no = model_no.predict(X_val_no)
    relative_errors_val_no = np.abs((y_val_no - y_val_pred_no) / y_val_no)
    val_mre_no = np.mean(relative_errors_val_no)

    print(f"Validation MRE: {val_mre_no}")

    # Keep this model if it beats the best validation MRE seen so far
    if val_mre_no < best_mre_no:
        best_mre_no = val_mre_no
        best_model_no = model_no
        best_params_no = params_no

    print(f"Current best MRE: {best_mre_no}")

# Fail with a clear message instead of an AttributeError on None.predict(...)
if best_model_no is None:
    raise RuntimeError(
        "No hyperparameter combination produced a finite validation MRE; "
        f"check '{target_column}' in the validation set for zeros or NaNs."
    )

# Output the best hyperparameters after validation
print(f"\nBest hyperparameters after validation: {best_params_no}")
print(f"Best validation MRE: {best_mre_no}")

# Evaluate the selected model once on the held-out test set
y_test_pred_no = best_model_no.predict(X_test_no)

# Mean Relative Error on the test set
relative_errors_test_no = np.abs((y_test_no - y_test_pred_no) / y_test_no)
mean_relative_error_test_no = np.mean(relative_errors_test_no)

print(f"\nMean Relative Error on the test set: {mean_relative_error_test_no}")

Testing combination 1/432: n_estimators=100, max_features=sqrt, min_samples_leaf=5, max_depth=None, max_samples=0.6
Validation MRE: 0.06817715415107864
Current best MRE: 0.06817715415107864
Testing combination 2/432: n_estimators=100, max_features=sqrt, min_samples_leaf=5, max_depth=None, max_samples=0.8
Validation MRE: 0.06734801381324591
Current best MRE: 0.06734801381324591
Testing combination 3/432: n_estimators=100, max_features=sqrt, min_samples_leaf=5, max_depth=None, max_samples=1.0


KeyboardInterrupt: 

In [None]:
# The search cell never computes a training-set error, so derive it here from the
# selected model; without this the last print below raises a NameError.
y_train_pred_no = best_model_no.predict(X_train_no)
mean_relative_error_train_no = np.mean(np.abs((y_train_no - y_train_pred_no) / y_train_no))

# Report the selected hyperparameters and the MRE on every split
print(f"Best Parameter No: {best_params_no}")
print(f"\nBest validation MRE No: {best_mre_no}")
print(f"\nMean Relative Error on the test set No: {mean_relative_error_test_no}")
print(f"\nMean Relative Error on the train set No: {mean_relative_error_train_no}")

In [None]:
# Importance scores of the selected forest, one per input column
importances = best_model_no.feature_importances_

# feature_columns is the plain list of column names the model was trained on,
# in the same order as the importance scores
feature_names = feature_columns

# Pair each feature name with its importance and list the most important first
# (pandas is already imported in the first cell, so no re-import here)
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the ranking
print(feature_importance_df)

**Public Shift**

In [None]:
# Location of the "public shift" forecasting CSV, relative to the notebook
datei_pfad = '../data/public_shift_forecasting.csv'
# Read the whole file into a DataFrame with pandas' default parsing options
df_public = pd.read_csv(filepath_or_buffer=datei_pfad)

In [None]:
# Build the train, validation and test sets by keeping the rows whose
# 'filiale_baeckerei' value is listed in the corresponding index array
train_df_public, val_df_public, test_df_public = (
    df_public.loc[df_public['filiale_baeckerei'].isin(split_ids)]
    for split_ids in (train_indices, val_indices, test_indices)
)


In [None]:
# Select the model inputs and the prediction target
feature_columns = ['revenue', 'revenue-1', 'revenue-2', 'revenue-3', 'revenuePY', 'revenuePY-1', 'revenuePY-2', 'revenuePY-3',
                   'revenuePY+2', 'revenuePY+1', 'carnival', 'easter', 'ascension_day', 'whitsunday', 'christmas', 'new_year']
target_column = 'revenue+2'

# Extract features and target as NumPy arrays for the training split
X_train_public = train_df_public[feature_columns].values
y_train_public = train_df_public[target_column].values

# ... for the validation split
X_val_public = val_df_public[feature_columns].values
y_val_public = val_df_public[target_column].values

# ... and for the test split
X_test_public = test_df_public[feature_columns].values
y_test_public = test_df_public[target_column].values

# Hyperparameter grid. The keys are RandomForestRegressor keyword arguments and
# their order fixes the order of the values inside every combination tuple.
param_grid = {
    'n_estimators': [100, 300, 500, 1000],  # number of trees in the forest
    'max_features': ['sqrt', 'log2', None],
    'min_samples_leaf': [5, 10, 20],
    'max_depth': [None, 10, 20, 30],
    'max_samples': [0.6, 0.8, 1.0]
}

# Cartesian product of all grid values: one tuple per candidate configuration
combinations = list(product(*param_grid.values()))
total_combinations = len(combinations)

# Track the best model so far; "best" means lowest validation MRE
best_model_public = None
best_mre_public = float('inf')
best_params_public = {}

# Manual grid search: fit on the train set, score on the validation set
for i, combination in enumerate(combinations, start=1):
    # Pair each value with its parameter name so the names are spelled only once
    # (in param_grid) instead of being repeated in the loop, the print and the dict
    params_public = dict(zip(param_grid, combination))
    print(f"Testing combination {i}/{total_combinations}: "
          + ", ".join(f"{name}={value}" for name, value in params_public.items()))

    # Fixed random_state keeps every fit reproducible; n_jobs=-1 uses all cores
    model_public = RandomForestRegressor(**params_public, random_state=42, n_jobs=-1)
    model_public.fit(X_train_public, y_train_public)

    # Mean Relative Error (MRE) on the validation set.
    # A zero in y_val_public makes this inf/NaN, which never compares as "better" below.
    y_val_pred_public = model_public.predict(X_val_public)
    relative_errors_val_public = np.abs((y_val_public - y_val_pred_public) / y_val_public)
    val_mre_public = np.mean(relative_errors_val_public)

    print(f"Validation MRE: {val_mre_public}")

    # Keep this model if it beats the best validation MRE seen so far
    if val_mre_public < best_mre_public:
        best_mre_public = val_mre_public
        best_model_public = model_public
        best_params_public = params_public

    print(f"Current best MRE: {best_mre_public}")

# Fail with a clear message instead of an AttributeError on None.predict(...)
if best_model_public is None:
    raise RuntimeError(
        "No hyperparameter combination produced a finite validation MRE; "
        f"check '{target_column}' in the validation set for zeros or NaNs."
    )

# Output the best hyperparameters after validation
print(f"\nBest hyperparameters after validation: {best_params_public}")
print(f"Best validation MRE: {best_mre_public}")

# Evaluate the selected model once on the held-out test set
y_test_pred_public = best_model_public.predict(X_test_public)

# Mean Relative Error on the test set
relative_errors_test_public = np.abs((y_test_public - y_test_pred_public) / y_test_public)
mean_relative_error_test_public = np.mean(relative_errors_test_public)

print(f"\nMean Relative Error on the test set: {mean_relative_error_test_public}")

In [None]:
# The search cell never computes a training-set error, so derive it here from the
# selected model; without this the last print below raises a NameError.
y_train_pred_public = best_model_public.predict(X_train_public)
mean_relative_error_train_public = np.mean(
    np.abs((y_train_public - y_train_pred_public) / y_train_public)
)

# Report the selected hyperparameters and the MRE on every split
print(f"Best Parameter Public: {best_params_public}")
print(f"\nBest validation MRE Public: {best_mre_public}")
print(f"\nMean Relative Error on the test set Public: {mean_relative_error_test_public}")
print(f"\nMean Relative Error on the train set Public: {mean_relative_error_train_public}")

In [None]:
# Importance scores of the selected forest, one per input column
importances = best_model_public.feature_importances_

# feature_columns is the plain list of column names the model was trained on,
# in the same order as the importance scores
feature_names = feature_columns

# Pair each feature name with its importance and list the most important first
# (pandas is already imported in the first cell, so no re-import here)
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the ranking
print(feature_importance_df)

**School Shift**

In [None]:
# Location of the "school shift" forecasting CSV, relative to the notebook
datei_pfad = '../data/school_shift_forecasting.csv'
# Read the whole file into a DataFrame with pandas' default parsing options
df_school = pd.read_csv(filepath_or_buffer=datei_pfad)

In [None]:
# Build the train, validation and test sets by keeping the rows whose
# 'filiale_baeckerei' value is listed in the corresponding index array
train_df_school, val_df_school, test_df_school = (
    df_school.loc[df_school['filiale_baeckerei'].isin(split_ids)]
    for split_ids in (train_indices, val_indices, test_indices)
)


In [None]:
# Select the model inputs and the prediction target
feature_columns = ['revenue', 'revenue-1', 'revenue-2', 'revenue-3', 'revenuePY', 'revenuePY-1', 'revenuePY-2', 'revenuePY-3',
                   'revenuePY+2', 'revenuePY+1', 'carnival', 'easter', 'ascension_day', 'whitsunday', 'christmas', 'new_year']
target_column = 'revenue+2'

# Extract features and target as NumPy arrays for the training split
X_train_school = train_df_school[feature_columns].values
y_train_school = train_df_school[target_column].values

# ... for the validation split
X_val_school = val_df_school[feature_columns].values
y_val_school = val_df_school[target_column].values

# ... and for the test split
X_test_school = test_df_school[feature_columns].values
y_test_school = test_df_school[target_column].values

# Hyperparameter grid. The keys are RandomForestRegressor keyword arguments and
# their order fixes the order of the values inside every combination tuple.
param_grid = {
    'n_estimators': [100, 300, 500, 1000],  # number of trees in the forest
    'max_features': ['sqrt', 'log2', None],
    'min_samples_leaf': [5, 10, 20],
    'max_depth': [None, 10, 20, 30],
    'max_samples': [0.6, 0.8, 1.0]
}

# Cartesian product of all grid values: one tuple per candidate configuration
combinations = list(product(*param_grid.values()))
total_combinations = len(combinations)

# Track the best model so far; "best" means lowest validation MRE
best_model_school = None
best_mre_school = float('inf')
best_params_school = {}

# Manual grid search: fit on the train set, score on the validation set
for i, combination in enumerate(combinations, start=1):
    # Pair each value with its parameter name so the names are spelled only once
    # (in param_grid) instead of being repeated in the loop, the print and the dict
    params_school = dict(zip(param_grid, combination))
    print(f"Testing combination {i}/{total_combinations}: "
          + ", ".join(f"{name}={value}" for name, value in params_school.items()))

    # Fixed random_state keeps every fit reproducible; n_jobs=-1 uses all cores
    model_school = RandomForestRegressor(**params_school, random_state=42, n_jobs=-1)
    model_school.fit(X_train_school, y_train_school)

    # Mean Relative Error (MRE) on the validation set.
    # A zero in y_val_school makes this inf/NaN, which never compares as "better" below.
    y_val_pred_school = model_school.predict(X_val_school)
    relative_errors_val_school = np.abs((y_val_school - y_val_pred_school) / y_val_school)
    val_mre_school = np.mean(relative_errors_val_school)

    print(f"Validation MRE: {val_mre_school}")

    # Keep this model if it beats the best validation MRE seen so far
    if val_mre_school < best_mre_school:
        best_mre_school = val_mre_school
        best_model_school = model_school
        best_params_school = params_school

    print(f"Current best MRE: {best_mre_school}")

# Fail with a clear message instead of an AttributeError on None.predict(...)
if best_model_school is None:
    raise RuntimeError(
        "No hyperparameter combination produced a finite validation MRE; "
        f"check '{target_column}' in the validation set for zeros or NaNs."
    )

# Output the best hyperparameters after validation
print(f"\nBest hyperparameters after validation: {best_params_school}")
print(f"Best validation MRE: {best_mre_school}")

# Evaluate the selected model once on the held-out test set
y_test_pred_school = best_model_school.predict(X_test_school)

# Mean Relative Error on the test set
relative_errors_test_school = np.abs((y_test_school - y_test_pred_school) / y_test_school)
mean_relative_error_test_school = np.mean(relative_errors_test_school)

print(f"\nMean Relative Error on the test set: {mean_relative_error_test_school}")

In [None]:
# The search cell never computes a training-set error, so derive it here from the
# selected model; without this the last print below raises a NameError.
y_train_pred_school = best_model_school.predict(X_train_school)
mean_relative_error_train_school = np.mean(
    np.abs((y_train_school - y_train_pred_school) / y_train_school)
)

# Report the selected hyperparameters and the MRE on every split
print(f"Best Parameter School: {best_params_school}")
print(f"\nBest validation MRE School: {best_mre_school}")
print(f"\nMean Relative Error on the test set School: {mean_relative_error_test_school}")
print(f"\nMean Relative Error on the train set School: {mean_relative_error_train_school}")

In [None]:
# Importance scores of the selected forest, one per input column
importances = best_model_school.feature_importances_

# feature_columns is the plain list of column names the model was trained on,
# in the same order as the importance scores
feature_names = feature_columns

# Pair each feature name with its importance and list the most important first
# (pandas is already imported in the first cell, so no re-import here)
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the ranking
print(feature_importance_df)