## Load dataset

In [1]:
import pandas as pd

# Load the dataset
file_path = 'store_data.csv'
df = pd.read_csv(file_path)

# Display the first few rows to inspect the dataset
df.head()


Unnamed: 0,date,store_no,region,city,postal,street,longitude,latitude,store_area,location_type,...,footfall,avg_temperature,min_temperature,max_temperature,precipitation_mm,snow_depth_mm,wind_direction_degrees,wind_speed_kmh,peak_wind_gust_kmh,air_pressure_hpa
0,2021-02-12,2335,Bayern,Lauf,91207,Marktplatz 15,11.281178,49.511979,449.0,Fußgängerzone,...,,-10.9,-18.0,-2.8,0.0,130.0,79.0,14.2,35.3,1033.1
1,2021-03-01,197,Rheinland-pfalz,Bitburg,54634,Trierer Str.19,6.523312,49.970275,263.0,Hauptgeschaeftsstrasse,...,,6.6,0.5,14.0,0.0,,73.0,13.1,,1032.6
2,2021-03-01,441,Rheinland-pfalz,Annweiler,76855,Hauptstr. 35,7.96342,49.202622,350.0,Hauptgeschaeftsstrasse,...,,7.4,2.4,12.8,0.0,0.0,45.0,12.5,,1032.5
3,2021-03-01,632,Saarland,Lebach,66822,Am Markt 6,6.906569,49.411052,363.0,Fußgängerzone,...,3.0,5.8,0.6,12.1,0.0,0.0,74.0,13.9,41.4,1032.0
4,2021-03-01,1218,Rheinland-pfalz,Traben-Trarbach,56841,Poststr. 1 A,7.116635,49.950358,354.0,Hauptgeschaeftsstrasse,...,3.0,5.5,0.3,12.4,0.0,0.0,51.0,11.2,40.3,1033.0


## Handling Missing Values

In [2]:
# Fill missing values in 'footfall' column using the median
df['footfall'] = df['footfall'].fillna(df['footfall'].median())

# Confirm that missing values are filled
print(f"Missing values in 'footfall': {df['footfall'].isnull().sum()}")


Missing values in 'footfall': 0


## Handling Outliers

In [3]:
# Handle outliers by capping extreme values at the 99th percentile
for column in ['footfall', 'turnover']:
    df[column] = df[column].clip(upper=df[column].quantile(0.99))

# Confirm changes
print(df[['footfall', 'turnover']].describe())


            footfall       turnover
count  414645.000000  414645.000000
mean       19.512721     134.545291
std         6.448615      53.635114
min         2.000000      -4.101720
25%        18.000000      97.487100
50%        19.000000     129.456120
75%        20.000000     165.893940
max        43.000000     302.593027


## Add Features

In [4]:
# Add interaction feature: store_area * footfall
df['store_area_footfall'] = df['store_area'] * df['footfall']

# Add month as a new feature
df['month'] = pd.to_datetime(df['date']).dt.month

# Group low-frequency cities into an 'Other' category
top_cities = df['city'].value_counts().nlargest(50).index
df['city'] = df['city'].apply(lambda x: x if x in top_cities else 'Other')


## Define Features and targets

In [5]:
# Updated feature set
selected_features = [
    'store_area', 'footfall', 'avg_temperature', 'city', 
    'precipitation_mm', 'air_pressure_hpa', 'wind_speed_kmh',
    'store_area_footfall', 'month'
]
X = df[selected_features]
y = df['turnover']


## Preprocessing for LightGBM

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor

# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), [
            'store_area', 'footfall', 'avg_temperature', 'precipitation_mm',
            'air_pressure_hpa', 'wind_speed_kmh', 'store_area_footfall', 'month'
        ]),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), ['city'])
    ]
)

# Define the LightGBM model
lightgbm_model = LGBMRegressor(random_state=42, n_estimators=500, learning_rate=0.05)

# Create the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', lightgbm_model)
])


## Train and Evaluate LightGBM

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the pipeline
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"LightGBM Model Performance:\nMAE: {mae:.2f}\nRMSE: {rmse:.2f}\nR² (Accuracy): {r2:.2f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.044528 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1674
[LightGBM] [Info] Number of data points in the train set: 331716, number of used features: 59
[LightGBM] [Info] Start training from score 134.588988
LightGBM Model Performance:
MAE: 30.28
RMSE: 39.56
R² (Accuracy): 0.45


## Prepare data for LSTM

In [9]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Select numerical features for LSTM
numerical_features = ['store_area', 'footfall', 'avg_temperature', 'precipitation_mm',
                      'air_pressure_hpa', 'wind_speed_kmh', 'store_area_footfall', 'month']

# One-hot encode the city feature
city_encoded = pd.get_dummies(df['city'], drop_first=True)

# Combine numerical and encoded categorical features
X_lstm = pd.concat([df[numerical_features], city_encoded], axis=1).values
y_lstm = df['turnover'].values

# Normalize features for LSTM
scaler = MinMaxScaler()
X_lstm_scaled = scaler.fit_transform(X_lstm)

# Reshape for LSTM (samples, timesteps, features)
X_lstm_scaled = X_lstm_scaled.reshape(X_lstm_scaled.shape[0], 1, X_lstm_scaled.shape[1])


## Build and Train LSTM

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout

# Define LSTM model
lstm_model = Sequential()
lstm_model.add(LSTM(units=64, activation='relu', input_shape=(1, X_lstm_scaled.shape[2])))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(units=32, activation='relu'))
lstm_model.add(Dense(units=1))  # Output layer for regression

# Compile the model
lstm_model.compile(optimizer='adam', loss='mean_squared_error')

# Split data for training and testing
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
    X_lstm_scaled, y_lstm, test_size=0.2, random_state=42
)

# Train the model
history = lstm_model.fit(X_train_lstm, y_train_lstm, epochs=50, batch_size=32, validation_split=0.2, verbose=1)


  super().__init__(**kwargs)


Epoch 1/50
[1m8293/8293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - loss: 12893.7480 - val_loss: 18830.7207
Epoch 2/50
[1m8293/8293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 2ms/step - loss: 18486.0117 - val_loss: 16825.3359
Epoch 3/50
[1m8293/8293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 16536.3906 - val_loss: 14957.3828
Epoch 4/50
[1m8293/8293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 14710.3535 - val_loss: 13224.4502
Epoch 5/50
[1m8293/8293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 12990.8799 - val_loss: 11628.5684
Epoch 6/50
[1m8293/8293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 11366.0908 - val_loss: 10167.6055
Epoch 7/50
[1m8293/8293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 9953.0469 - val_loss: 8842.5898
Epoch 8/50
[1m8293/8293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/s

## Evaluate LSTM

In [None]:
# Make predictions
y_pred_lstm = lstm_model.predict(X_test_lstm)

# Evaluate LSTM model
mae = mean_absolute_error(y_test_lstm, y_pred_lstm)
rmse = np.sqrt(mean_squared_error(y_test_lstm, y_pred_lstm))
r2 = r2_score(y_test_lstm, y_pred_lstm)

print(f"LSTM Model Performance:\nMAE: {mae:.2f}\nRMSE: {rmse:.2f}\nR² (Accuracy): {r2:.2f}")


[1m2592/2592[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step
LSTM Model Performance:
MAE: 115.46
RMSE: 127.17
R² (Accuracy): -4.65
