In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [2]:
file_path = "store_data_2023.csv"  # Replace with your file path
data = pd.read_csv(file_path)

## Step 1: Preprocess the data

In [7]:
# List all columns in the dataset
print("Original dataset columns:", data.columns.tolist())


Original dataset columns: ['date', 'store_no', 'region', 'city', 'postal', 'street', 'longitude', 'latitude', 'store_area', 'location_type', 'turnover', 'population', 'competitor_count', 'footfall', 'avg_temperature', 'min_temperature', 'max_temperature', 'precipitation_mm', 'snow_depth_mm', 'wind_direction_degrees', 'wind_speed_kmh', 'peak_wind_gust_kmh', 'air_pressure_hpa']


In [17]:

# Identify control and predictor features
control_features = ['date', 'store_area', 'avg_temperature', 'precipitation_mm', 'wind_speed_kmh']
predictor_features = ['latitude', 'longitude', 'air_pressure_hpa', 'store_no']

In [18]:

# Combine control and predictor features
features_to_use = control_features + predictor_features

In [19]:

# Handle categorical encoding
categorical_features = [col for col in ['region', 'location_type'] if col in data.columns]
print("Categorical features to encode:", categorical_features)

Categorical features to encode: ['region', 'location_type']


In [20]:

# One-hot encode categorical features
data_encoded = pd.get_dummies(data, columns=categorical_features, drop_first=True)

In [21]:

# Log the columns in the processed dataset
print("Encoded dataset columns:", data_encoded.columns.tolist())

# Ensure the target variable is included
if 'turnover' not in data_encoded.columns:
    raise ValueError("The target feature 'turnover' is not in the dataset.")

Encoded dataset columns: ['date', 'store_no', 'city', 'postal', 'street', 'longitude', 'latitude', 'store_area', 'turnover', 'population', 'competitor_count', 'footfall', 'avg_temperature', 'min_temperature', 'max_temperature', 'precipitation_mm', 'snow_depth_mm', 'wind_direction_degrees', 'wind_speed_kmh', 'peak_wind_gust_kmh', 'air_pressure_hpa', 'region_Bayern', 'region_Berlin', 'region_Brandenburg', 'region_Bremen', 'region_Hamburg', 'region_Hessen', 'region_Mecklenburg-vorpommern', 'region_Niedersachsen', 'region_Nordrhein-westfalen', 'region_Rheinland-pfalz', 'region_Saarland', 'region_Sachsen', 'region_Sachsen-anhalt', 'region_Schleswig-holstein', 'region_Thüringen', 'location_type_Einkaufszentrum', 'location_type_Fachmarktzentrum', 'location_type_Fußgängerzone', 'location_type_Hauptgeschaeftsstrasse', 'location_type_Shopping Mall']


In [22]:
# Extract necessary columns, excluding region and location_type after encoding
features_to_use = [col for col in features_to_use if col in data_encoded.columns]
print("Features used for training:", features_to_use)

Features used for training: ['date', 'store_area', 'avg_temperature', 'precipitation_mm', 'wind_speed_kmh', 'latitude', 'longitude', 'air_pressure_hpa', 'store_no']


In [23]:

# Prepare the final dataset for training
data_model = data_encoded[features_to_use + ['turnover']].dropna()


In [24]:

# Extract features and target
X = data_model.drop(columns=['turnover'])
y = data_model['turnover']

In [27]:
# Ensure the 'date' column is converted to a numerical representation
if 'date' in X.columns:
    X['year'] = pd.to_datetime(X['date']).dt.year
    X['month'] = pd.to_datetime(X['date']).dt.month
    X['day'] = pd.to_datetime(X['date']).dt.day
    X = X.drop(columns=['date'])  # Drop the original 'date' column after transformation

# Scale numeric features (excluding non-numeric columns like 'date')
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [28]:

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [31]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)


In [32]:

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1
}

## Step 2: Train LightGBM Model

In [37]:
from lightgbm import early_stopping, log_evaluation

print("Training LightGBM model...")
# Train with callbacks
lgb_model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,  # Set a high maximum boosting round
    valid_sets=[lgb_test],  # Specify validation data
    callbacks=[
        early_stopping(stopping_rounds=50),  # Stop early if no improvement in 50 rounds
        log_evaluation(period=10)  # Log evaluation every 10 rounds
    ]
)


Training LightGBM model...
Training until validation scores don't improve for 50 rounds
[10]	valid_0's rmse: 49.8642
[20]	valid_0's rmse: 47.3447
[30]	valid_0's rmse: 45.8902
[40]	valid_0's rmse: 44.7165
[50]	valid_0's rmse: 43.8143
[60]	valid_0's rmse: 43.0513
[70]	valid_0's rmse: 42.3792
[80]	valid_0's rmse: 41.795
[90]	valid_0's rmse: 41.2816
[100]	valid_0's rmse: 40.9054
[110]	valid_0's rmse: 40.5416
[120]	valid_0's rmse: 40.2058
[130]	valid_0's rmse: 39.9669
[140]	valid_0's rmse: 39.7195
[150]	valid_0's rmse: 39.5245
[160]	valid_0's rmse: 39.3305
[170]	valid_0's rmse: 39.1501
[180]	valid_0's rmse: 38.9978
[190]	valid_0's rmse: 38.874
[200]	valid_0's rmse: 38.7554
[210]	valid_0's rmse: 38.6351
[220]	valid_0's rmse: 38.547
[230]	valid_0's rmse: 38.4464
[240]	valid_0's rmse: 38.3426
[250]	valid_0's rmse: 38.2359
[260]	valid_0's rmse: 38.141
[270]	valid_0's rmse: 38.0703
[280]	valid_0's rmse: 37.9786
[290]	valid_0's rmse: 37.9259
[300]	valid_0's rmse: 37.865
[310]	valid_0's rmse: 37.8

## Step 3: Train LSTM Model

In [38]:
# Prepare data for LSTM (convert to sequences)
def create_sequences(features, target, seq_length=10):
    X_seq, y_seq = [], []
    for i in range(len(features) - seq_length):
        X_seq.append(features[i:i + seq_length])
        y_seq.append(target[i + seq_length])
    return np.array(X_seq), np.array(y_seq)

seq_length = 10
X_seq, y_seq = create_sequences(X_scaled, y.values, seq_length)

In [39]:
# Split sequences into training and testing sets
X_train_seq, X_test_seq, y_train_seq, y_test_seq = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)


In [40]:
# Build LSTM model
lstm_model = Sequential([
    LSTM(64, input_shape=(seq_length, X_train_seq.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(32, return_sequences=False),
    Dropout(0.2),
    Dense(1)
])

lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])


  super().__init__(**kwargs)


In [42]:

print("Training LSTM model...")
history = lstm_model.fit(X_train_seq, y_train_seq, validation_data=(X_test_seq, y_test_seq), 
                         epochs=10, batch_size=32, verbose=1)

Training LSTM model...
Epoch 1/10
[1m2235/2235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 10ms/step - loss: 2163.6760 - mae: 34.6944 - val_loss: 1910.2665 - val_mae: 33.2803
Epoch 2/10
[1m2235/2235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 11ms/step - loss: 2082.2468 - mae: 34.1423 - val_loss: 1876.8640 - val_mae: 32.8744
Epoch 3/10
[1m2235/2235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 11ms/step - loss: 2085.3640 - mae: 33.8249 - val_loss: 1868.4172 - val_mae: 32.8016
Epoch 4/10
[1m2235/2235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 11ms/step - loss: 2014.5327 - mae: 33.5916 - val_loss: 1851.2665 - val_mae: 32.7417
Epoch 5/10
[1m2235/2235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 10ms/step - loss: 1923.2200 - mae: 33.1505 - val_loss: 1841.4216 - val_mae: 32.6193
Epoch 6/10
[1m2235/2235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 11ms/step - loss: 1928.1167 - mae: 33.0279 - val_loss: 1839.9766 - val_mae: 3

In [43]:
# Step 4: Evaluate both models
lgb_preds = lgb_model.predict(X_test)
lstm_preds = lstm_model.predict(X_test_seq)

lgb_rmse = np.sqrt(mean_squared_error(y_test, lgb_preds))
lstm_rmse = np.sqrt(mean_squared_error(y_test_seq, lstm_preds))

print(f"LightGBM RMSE: {lgb_rmse}")
print(f"LSTM RMSE: {lstm_rmse}")

[1m559/559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step
LightGBM RMSE: 36.42481195124668
LSTM RMSE: 42.026432517119694


In [47]:
import joblib
# Save the models for deployment
lgb_model_path = "lightgbm_turnover_model.pkl"
joblib.dump(lgb_model, lgb_model_path)

lstm_model_path = "lstm_turnover_model.h5"
lstm_model.save(lstm_model_path)



In [48]:
lstm_model = tf.keras.models.load_model("lstm_turnover_model.h5", custom_objects={"mse": tf.keras.losses.MeanSquaredError()})




In [49]:
# Import necessary library
from sklearn.metrics import accuracy_score

# Define a tolerance for regression accuracy
tolerance = 0.1  # Example: predictions within 10% of the true value are considered accurate

# Calculate accuracy for LightGBM
lgb_accuracy = np.mean(np.abs(lgb_preds - y_test) <= tolerance * np.abs(y_test)) * 100

# Calculate accuracy for LSTM
lstm_accuracy = np.mean(np.abs(lstm_preds - y_test_seq) <= tolerance * np.abs(y_test_seq)) * 100

# Print RMSE and Accuracy for both models
print(f"LightGBM RMSE: {lgb_rmse}")
print(f"LightGBM Accuracy: {lgb_accuracy:.2f}%")
print(f"LSTM RMSE: {lstm_rmse}")
print(f"LSTM Accuracy: {lstm_accuracy:.2f}%")

LightGBM RMSE: 36.42481195124668
LightGBM Accuracy: 32.71%
LSTM RMSE: 42.026432517119694
LSTM Accuracy: 19.21%
