<a href="https://colab.research.google.com/github/ayadavdev/rohlik-orders-forecasting/blob/main/Rohlik_Orders_Forecasting_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split

# read csv file
data = pd.read_csv('rohlik-data - train-v4.csv')

# find the nulls in the data
print("nulls:", data.isnull().sum())

# fill the nulls with the mean of the column
# in this dataset only 'precipitation' and 'snow' features had null values
data['precipitation'] = data['precipitation'].fillna(data['precipitation'].mean())
data['snow'] = data['snow'].fillna(data['snow'].mean())

# remove unwanted features and target variable
# X = data.drop(columns=['warehouse','date', 'orders', 'holiday_name', 'id'], axis=1)

X = data.drop(columns=[
    'warehouse', 'date', 'holiday_name', 'id', 'orders',
    'snow',	'frankfurt_shutdown',	'user_activity_1',	'user_activity_2',	'mini_shutdown',	'precipitation',	'blackout',	'shutdown',	'mov_change'
], axis=1)

# set target column
y = data['orders']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reset indices after splitting to ensure sequential integer indices
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

# Print the first row of X_train and y_train
print(X_train.iloc[0])
print(y_train.iloc[0])

# Define the model
model = tf.keras.Sequential([
  tf.keras.layers.Dense(64, activation='relu'),
  tf.keras.layers.Dense(32, activation='relu'),
  tf.keras.layers.Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer='adam', loss=tf.keras.losses.MAPE)  # Use mean squared error for regression

# Train the model
model.fit(X_train, y_train, epochs=250, batch_size=32)  # Adjust epochs and batch size as needed

# Evaluate the model
loss = model.evaluate(X_test, y_test)
print("MAPE:", loss)



nulls: warehouse                    0
date                         0
orders                       0
holiday_name              7122
holiday                      0
shutdown                     0
mini_shutdown                0
shops_closed                 0
winter_school_holidays       0
school_holidays              0
blackout                     0
mov_change                   0
frankfurt_shutdown           0
precipitation              270
snow                       270
user_activity_1              0
user_activity_2              0
id                           0
warehouse_id                 0
holiday_name_id              0
weekday                      0
month                        0
dtype: int64
holiday                    0
shops_closed               0
winter_school_holidays     0
school_holidays            0
warehouse_id              51
holiday_name_id            0
weekday                    1
month                      2
Name: 0, dtype: int64
7531
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epo

In [8]:
original_predict_data = pd.read_csv('rohlik-data - test-v1.csv')
predict_data = pd.read_csv('rohlik-data - test-v1.csv')

# find the nulls in the data
print("nulls:", predict_data.isnull().sum())
"""
# fill the nulls with the mean of the column
data['precipitation'] = data['precipitation'].fillna(data['precipitation'].mean())
data['snow'] = data['snow'].fillna(data['snow'].mean())
"""
# remove unwanted features and target variable
predict_data = predict_data.drop(columns=['warehouse','date', 'holiday_name', 'id'], axis=1)


nulls: warehouse                   0
date                        0
holiday_name              380
holiday                     0
shops_closed                0
winter_school_holidays      0
school_holidays             0
id                          0
warehouse_id                0
holiday_name_id             0
weekday                     0
month                       0
dtype: int64


In [9]:
# check if predition data has same columns as training data
train_cols = set(X_train.columns)
predict_cols = set(predict_data.columns)

if train_cols == predict_cols:
    print("Training and prediction data have the same columns.")
else:
    print("Training and prediction data have different columns.")
    print("Missing in training data:", predict_cols - train_cols)
    print("Missing in prediction data:", train_cols - predict_cols)

Training and prediction data have the same columns.


In [5]:
pip install eli5

Collecting eli5
  Downloading eli5-0.13.0.tar.gz (216 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.2/216.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: eli5
  Building wheel for eli5 (setup.py) ... [?25l[?25hdone
  Created wheel for eli5: filename=eli5-0.13.0-py2.py3-none-any.whl size=107720 sha256=2ffcb4bf4b1b8c561991bd1d377a7e029dd5cc1b9c3828cd669a99337a230f1f
  Stored in directory: /root/.cache/pip/wheels/b8/58/ef/2cf4c306898c2338d51540e0922c8e0d6028e07007085c0004
Successfully built eli5
Installing collected packages: eli5
Successfully installed eli5-0.13.0


In [10]:
# Print feature importances (sort by importance)

from tensorflow.keras import Model

class KerasWrapper(Model):
  def __init__(self, keras_model):
    super().__init__()
    self.keras_model = keras_model

  def call(self, inputs):
    return self.keras_model(inputs)

  def score(self, X, y): # Add a score method to your KerasWrapper
      from sklearn.metrics import mean_absolute_percentage_error
      y_pred = self.keras_model.predict(X)
      return mean_absolute_percentage_error(y, y_pred)  # Example using mean squared error, adjust as needed

from eli5.sklearn import PermutationImportance
from tensorflow import keras

# Create a wrapper for your Keras model
keras_model = KerasWrapper(model)

# Permutation importance explainer
perm = PermutationImportance(keras_model, random_state=1)

# Fit the explainer on your validation data (X_val, y_val)
perm.fit(X_train, y_train)

# Get feature importances
importances = perm.feature_importances_

# Print feature importances (sort by importance)
print(sorted(zip(X_train.columns, importances), key=lambda x: x[1], reverse=True))

[('holiday', -0.0005417564249520723), ('school_holidays', -0.0006423192586140547), ('shops_closed', -0.000749642525414812), ('holiday_name_id', -0.0020702881960952156), ('winter_school_holidays', -0.002567091018228479), ('weekday', -0.005016943866726481), ('month', -0.007689722397170762), ('warehouse_id', -0.35010501232538144)]


In [12]:
predictions = model.predict(predict_data)

# Round up predictions using tf.math.ceil
predictions = tf.math.ceil(predictions).numpy().astype(int)

# Convert predictions to a DataFrame with the column named 'prediction'
predictions_df = pd.DataFrame(predictions, columns=['orders'])

# Add the 'prediction' column to predict_data
final_result_data = pd.concat([original_predict_data, predictions_df], axis=1)

# View the updated predict_data
print(final_result_data.head())

solution_data = final_result_data[['id', 'orders']]
solution_data.to_csv('submission.csv', index=False)

  warehouse        date holiday_name  holiday  shops_closed  \
0  Prague_1  2024-03-16          NaN        0             0   
1  Prague_1  2024-03-17          NaN        0             0   
2  Prague_1  2024-03-18          NaN        0             0   
3  Prague_1  2024-03-19          NaN        0             0   
4  Prague_1  2024-03-20          NaN        0             0   

   winter_school_holidays  school_holidays                   id  warehouse_id  \
0                       0                0  Prague_1_2024-03-16            51   
1                       0                0  Prague_1_2024-03-17            51   
2                       0                0  Prague_1_2024-03-18            51   
3                       0                0  Prague_1_2024-03-19            51   
4                       0                0  Prague_1_2024-03-20            51   

   holiday_name_id  weekday  month  orders  
0                0        6      3    4855  
1                0        7      3    4892  