In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.preprocessing import OneHotEncoder

# Loading the data
data = pd.read_csv('train_0irEZ2H.csv')

In [2]:
# Preprocessing the data
data['week'] = pd.to_datetime(data['week'], format='%d/%m/%y')
data['week_of_year'] = data['week'].dt.isocalendar().week
data['year'] = data['week'].dt.year
data['store_id'] = data['store_id'].astype('category')
data['sku_id'] = data['sku_id'].astype('category')

In [3]:
# Feature Engineering
data['discount'] = data['base_price'] - data['total_price']
data['discount_pct'] = data['discount'] / data['base_price']

# Creating lag features (sales over past 4 weeks)
data['units_sold_lag1'] = data.groupby(['store_id', 'sku_id'])['units_sold'].shift(1)
data['units_sold_lag2'] = data.groupby(['store_id', 'sku_id'])['units_sold'].shift(2)
data['units_sold_lag3'] = data.groupby(['store_id', 'sku_id'])['units_sold'].shift(3)
data['units_sold_lag4'] = data.groupby(['store_id', 'sku_id'])['units_sold'].shift(4)

# Moving averages
data['ma_units_sold_3'] = (
    data.groupby(['store_id', 'sku_id'])['units_sold']
    .rolling(window=3)
    .mean()
    .shift(1)
    .reset_index(level=[0, 1], drop=True)
)

data = data.dropna()

  data['units_sold_lag1'] = data.groupby(['store_id', 'sku_id'])['units_sold'].shift(1)
  data['units_sold_lag2'] = data.groupby(['store_id', 'sku_id'])['units_sold'].shift(2)
  data['units_sold_lag3'] = data.groupby(['store_id', 'sku_id'])['units_sold'].shift(3)
  data['units_sold_lag4'] = data.groupby(['store_id', 'sku_id'])['units_sold'].shift(4)
  data.groupby(['store_id', 'sku_id'])['units_sold']


In [4]:
# Selecting features and target
features = ['week_of_year', 'year', 'store_id', 'sku_id', 'total_price', 'base_price', 
            'is_featured_sku', 'is_display_sku', 'discount', 'discount_pct',
            'units_sold_lag1', 'units_sold_lag2', 'units_sold_lag3', 'units_sold_lag4', 'ma_units_sold_3']
target = 'units_sold'

X = data[features]
y = data[target]

X = pd.get_dummies(X, columns=['store_id', 'sku_id'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Model Training - Using a Random Forest Regressor

param_dist = {
    'n_estimators': randint(50, 150),
    'max_depth': [None, 10, 20],
    'min_samples_split': randint(2, 10)
}


# Initializing the model
model = RandomForestRegressor(random_state=42)

# Performing RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, 
                                    n_iter=20, cv=3, scoring='neg_mean_absolute_error', 
                                    n_jobs=-1, verbose=2, random_state=42)

random_search.fit(X_train, y_train)

print(f'Best Parameters: {random_search.best_params_}')
print(f'Best Score (Negative MAE): {random_search.best_score_}')

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END max_depth=20, min_samples_split=5, n_estimators=142; total time= 7.7min
[CV] END max_depth=20, min_samples_split=4, n_estimators=121; total time= 6.1min
[CV] END max_depth=20, min_samples_split=4, n_estimators=124; total time= 6.4min
[CV] END max_depth=None, min_samples_split=5, n_estimators=73; total time= 4.6min
[CV] END max_depth=20, min_samples_split=7, n_estimators=102; total time= 4.8min
[CV] END max_depth=10, min_samples_split=3, n_estimators=113; total time= 2.9min
[CV] END max_depth=None, min_samples_split=2, n_estimators=125; total time= 7.6min
[CV] END max_depth=None, min_samples_split=4, n_estimators=108; total time= 6.2min
[CV] END max_depth=20, min_samples_split=7, n_estimators=111; total time= 5.2min
[CV] END max_depth=20, min_samples_split=7, n_estimators=111; total time= 4.4min
[CV] END max_depth=20, min_samples_split=5, n_estimators=113; total time= 4.5min
[CV] END max_depth=None, min_samples_split=

In [6]:
# Training the best model
best_model = random_search.best_estimator_

best_model.fit(X_train, y_train)

[CV] END max_depth=20, min_samples_split=4, n_estimators=121; total time= 6.6min
[CV] END max_depth=20, min_samples_split=4, n_estimators=121; total time= 6.2min
[CV] END max_depth=None, min_samples_split=6, n_estimators=132; total time= 8.4min
[CV] END max_depth=None, min_samples_split=5, n_estimators=73; total time= 4.4min
[CV] END max_depth=20, min_samples_split=7, n_estimators=102; total time= 4.7min
[CV] END max_depth=10, min_samples_split=3, n_estimators=113; total time= 2.9min
[CV] END max_depth=10, min_samples_split=7, n_estimators=138; total time= 3.3min
[CV] END max_depth=10, min_samples_split=7, n_estimators=138; total time= 3.4min
[CV] END max_depth=None, min_samples_split=4, n_estimators=108; total time= 6.2min
[CV] END max_depth=20, min_samples_split=3, n_estimators=141; total time= 6.6min
[CV] END max_depth=20, min_samples_split=7, n_estimators=100; total time= 4.0min
[CV] END max_depth=20, min_samples_split=5, n_estimators=113; total time= 4.4min
[CV] END max_depth=20, 

In [12]:
# Evaluation
y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print(f'Final Model MAE: {mae}')

Final Model MAE: 12.908430976430974


In [13]:
# Comparing predictions with actual values
comparison_df = pd.DataFrame({
    'predicted_units_sold': y_pred,
    'actual_units_sold': y_test.reset_index(drop=True)  # Reset index to align with predictions
})

print(comparison_df.head())

   predicted_units_sold  actual_units_sold
0                36.712                 33
1                62.152                 52
2                11.032                 16
3                28.712                 43
4                33.152                 26
