In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
training_data = pd.read_csv('../Data/RossmannStoreSales/train.csv', low_memory=False)
testing_data = pd.read_csv('../Data/RossmannStoreSales/test.csv')
store_data = pd.read_csv('../Data/RossmannStoreSales/store.csv')

In [3]:
training_data.isna().sum(), testing_data.isna().sum()

(Store            0
 DayOfWeek        0
 Date             0
 Sales            0
 Customers        0
 Open             0
 Promo            0
 StateHoliday     0
 SchoolHoliday    0
 dtype: int64,
 Id                0
 Store             0
 DayOfWeek         0
 Date              0
 Open             11
 Promo             0
 StateHoliday      0
 SchoolHoliday     0
 dtype: int64)

In [4]:
training_data.duplicated().sum(), testing_data.duplicated().sum()

(0, 0)

In [5]:
training_data.info(), testing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   Store          1017209 non-null  int64 
 1   DayOfWeek      1017209 non-null  int64 
 2   Date           1017209 non-null  object
 3   Sales          1017209 non-null  int64 
 4   Customers      1017209 non-null  int64 
 5   Open           1017209 non-null  int64 
 6   Promo          1017209 non-null  int64 
 7   StateHoliday   1017209 non-null  object
 8   SchoolHoliday  1017209 non-null  int64 
dtypes: int64(7), object(2)
memory usage: 69.8+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41088 entries, 0 to 41087
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             41088 non-null  int64  
 1   Store          41088 non-null  int64  
 2   DayOfWeek      41088 non-null  int64  
 3   Date     

(None, None)

In [6]:
training_data.describe().T, testing_data.describe().T

(                   count         mean          std  min     25%     50%  \
 Store          1017209.0   558.429727   321.908651  1.0   280.0   558.0   
 DayOfWeek      1017209.0     3.998341     1.997391  1.0     2.0     4.0   
 Sales          1017209.0  5773.818972  3849.926175  0.0  3727.0  5744.0   
 Customers      1017209.0   633.145946   464.411734  0.0   405.0   609.0   
 Open           1017209.0     0.830107     0.375539  0.0     1.0     1.0   
 Promo          1017209.0     0.381515     0.485759  0.0     0.0     0.0   
 SchoolHoliday  1017209.0     0.178647     0.383056  0.0     0.0     0.0   
 
                   75%      max  
 Store           838.0   1115.0  
 DayOfWeek         6.0      7.0  
 Sales          7856.0  41551.0  
 Customers       837.0   7388.0  
 Open              1.0      1.0  
 Promo             1.0      1.0  
 SchoolHoliday     0.0      1.0  ,
                  count          mean           std  min       25%      50%  \
 Id             41088.0  20544.500000 

In [7]:
training_data.drop(['Customers'], axis=1, inplace=True)

In [8]:
training_data[['Open', 'Promo', 'StateHoliday', 'SchoolHoliday']] = training_data[['Open', 'Promo', 'StateHoliday', 'SchoolHoliday']].astype(object)

testing_data[['Open', 'Promo', 'StateHoliday', 'SchoolHoliday']] = testing_data[['Open', 'Promo', 'StateHoliday', 'SchoolHoliday']].astype(object)

In [9]:
training_data.info(), testing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 8 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   Store          1017209 non-null  int64 
 1   DayOfWeek      1017209 non-null  int64 
 2   Date           1017209 non-null  object
 3   Sales          1017209 non-null  int64 
 4   Open           1017209 non-null  object
 5   Promo          1017209 non-null  object
 6   StateHoliday   1017209 non-null  object
 7   SchoolHoliday  1017209 non-null  object
dtypes: int64(3), object(5)
memory usage: 62.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41088 entries, 0 to 41087
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             41088 non-null  int64 
 1   Store          41088 non-null  int64 
 2   DayOfWeek      41088 non-null  int64 
 3   Date           41088 non-null  object
 4   Open           4

(None, None)

In [10]:
training_data['Date'] = pd.to_datetime(training_data['Date'])

testing_data['Date'] = pd.to_datetime(testing_data['Date'])

In [11]:
training_data = training_data.merge(store_data, how='left', on='Store')

testing_data = testing_data.merge(store_data, how='left', on='Store')

In [12]:
training_data['Promo2'] = training_data['Promo2'].astype(object)

testing_data['Promo2'] = testing_data['Promo2'].astype(object)

In [13]:
training_data.isna().sum(), testing_data.isna().sum()

(Store                             0
 DayOfWeek                         0
 Date                              0
 Sales                             0
 Open                              0
 Promo                             0
 StateHoliday                      0
 SchoolHoliday                     0
 StoreType                         0
 Assortment                        0
 CompetitionDistance            2642
 CompetitionOpenSinceMonth    323348
 CompetitionOpenSinceYear     323348
 Promo2                            0
 Promo2SinceWeek              508031
 Promo2SinceYear              508031
 PromoInterval                508031
 dtype: int64,
 Id                               0
 Store                            0
 DayOfWeek                        0
 Date                             0
 Open                            11
 Promo                            0
 StateHoliday                     0
 SchoolHoliday                    0
 StoreType                        0
 Assortment                     

In [14]:
training_data.duplicated().sum(), testing_data.duplicated().sum()

(0, 0)

In [15]:
training_data.shape, testing_data.shape

((1017209, 17), (41088, 17))

In [16]:
training_data.info(), testing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 17 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   Store                      1017209 non-null  int64         
 1   DayOfWeek                  1017209 non-null  int64         
 2   Date                       1017209 non-null  datetime64[ns]
 3   Sales                      1017209 non-null  int64         
 4   Open                       1017209 non-null  object        
 5   Promo                      1017209 non-null  object        
 6   StateHoliday               1017209 non-null  object        
 7   SchoolHoliday              1017209 non-null  object        
 8   StoreType                  1017209 non-null  object        
 9   Assortment                 1017209 non-null  object        
 10  CompetitionDistance        1014567 non-null  float64       
 11  CompetitionOpenSinceMonth  693861 non

(None, None)

In [17]:
training_data = training_data[training_data['Open'] == 1] # Sales were 0 if `Open` was 0, therefore to reduce the extra computation for each Open=0 with predicting Sales as 0 and other data features, we directly just ignore/redact the tuples with Open=0.

testing_data = testing_data[testing_data['Open'] == 1] # Sales were 0 if `Open` was 0, therefore to reduce the extra computation for each Open=0 with predicting Sales as 0 and other data features, we directly just ignore/redact the tuples with Open=0.

In [18]:
training_data['CompetitionDistance'].dropna(inplace=True) # removing this had no effect no the `CompetitionOpenSinceYear` and `CompetitionOpenSinceMonth`, it has also helped in taking care of some tuples where `CompetitionOpenSinceYear` or `CompetitionOpenSinceMonth` or both were NaN.

testing_data['CompetitionDistance'].dropna(inplace=True) # removing this had no effect no the `CompetitionOpenSinceYear` and `CompetitionOpenSinceMonth`, it has also helped in taking care of some tuples where `CompetitionOpenSinceYear` or `CompetitionOpenSinceMonth` or both were NaN.

#
##
###
####
#####
###### Experimental:
####### drop all NaNs from the CompetitionOpenSinceYear and CompetitionOpenSinceMonth which are 266433 totals records each resp.
training_data['CompetitionOpenSinceMonth'].dropna(inplace=True)
training_data.dropna(subset=['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'], inplace=True)
########
testing_data['CompetitionOpenSinceMonth'].dropna(inplace=True)
testing_data.dropna(subset=['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'], inplace=True)
###### May change on the basis of performance
#####
####
###
##
#

In [19]:
training_data

Unnamed: 0,Store,DayOfWeek,Date,Sales,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,5263,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,5,2015-07-31,6064,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,5,2015-07-31,8314,1,1,0,1,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,5,2015-07-31,13995,1,1,0,1,c,c,620.0,9.0,2009.0,0,,,
4,5,5,2015-07-31,4822,1,1,0,1,a,a,29910.0,4.0,2015.0,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1016588,494,2,2013-01-01,3113,1,0,a,1,b,a,1260.0,6.0,2011.0,0,,,
1016770,676,2,2013-01-01,3821,1,0,a,1,b,b,1410.0,9.0,2008.0,0,,,
1016776,682,2,2013-01-01,3375,1,0,a,1,b,a,150.0,9.0,2006.0,0,,,
1016827,733,2,2013-01-01,10765,1,0,a,1,b,b,860.0,10.0,1999.0,0,,,


In [20]:
testing_data['Open'] = testing_data['Open'].astype(int)
testing_data

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,1,4,2015-09-17,1,1,0,0,c,a,1270.0,9.0,2008.0,0,,,
1,2,3,4,2015-09-17,1,1,0,0,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
2,3,7,4,2015-09-17,1,1,0,0,a,c,24000.0,4.0,2013.0,0,,,
3,4,8,4,2015-09-17,1,1,0,0,a,a,7520.0,10.0,2014.0,0,,,
4,5,9,4,2015-09-17,1,1,0,0,a,c,2030.0,8.0,2000.0,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41080,41081,1106,6,2015-08-01,1,0,0,0,a,c,5330.0,9.0,2011.0,1,31.0,2013.0,"Jan,Apr,Jul,Oct"
41081,41082,1107,6,2015-08-01,1,0,0,0,a,a,1400.0,6.0,2012.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
41082,41083,1109,6,2015-08-01,1,0,0,0,c,a,3490.0,4.0,2011.0,1,22.0,2012.0,"Jan,Apr,Jul,Oct"
41083,41084,1111,6,2015-08-01,1,0,0,0,a,a,1900.0,6.0,2014.0,1,31.0,2013.0,"Jan,Apr,Jul,Oct"


In [21]:
training_data.isna().sum(), testing_data.isna().sum()

(Store                             0
 DayOfWeek                         0
 Date                              0
 Sales                             0
 Open                              0
 Promo                             0
 StateHoliday                      0
 SchoolHoliday                     0
 StoreType                         0
 Assortment                        0
 CompetitionDistance               0
 CompetitionOpenSinceMonth         0
 CompetitionOpenSinceYear          0
 Promo2                            0
 Promo2SinceWeek              307961
 Promo2SinceYear              307961
 PromoInterval                307961
 dtype: int64,
 Id                              0
 Store                           0
 DayOfWeek                       0
 Date                            0
 Open                            0
 Promo                           0
 StateHoliday                    0
 SchoolHoliday                   0
 StoreType                       0
 Assortment                      0
 Compe

In [22]:
training_data['Promo2SinceWeek'].fillna(0, inplace=True)
training_data['Promo2SinceYear'].fillna(0, inplace=True)

testing_data['Promo2SinceWeek'].fillna(0, inplace=True)
testing_data['Promo2SinceYear'].fillna(0, inplace=True)

In [23]:
X = training_data.drop(['Sales'], axis=1)

training_data.sort_values(by='Date', inplace=True)
training_data.reset_index(drop=True, inplace=True)

y = training_data['Sales']

testing_data.drop(['Id'], axis=1, inplace=True)
testing_data.sort_values(by='Date', inplace=True)
testing_data.reset_index(drop=True, inplace=True)

split_idx = int(len(training_data) * 0.8)

X_train = X.iloc[:split_idx]
y_train = y.iloc[:split_idx]

X_val = X.iloc[split_idx:]
y_val = y.iloc[split_idx:]

print(f'X_train: {X_train.shape}, y_train: {y_train.shape}')
print(f'Earliest date in X_train: {X_train['Date'].min()}')
print(f'Latest date in X_train: {X_train['Date'].max()}', end='\n\n')

print(f'X_val: {X_val.shape}, y_val: {y_val.shape}')
print(f'Earliest date in X_val: {X_val['Date'].min()}')
print(f'Latest date in X_val: {X_val['Date'].max()}', end='\n\n')

X_test = testing_data.copy()

print(f'X_test: {X_test.shape}')
print(f'Earliest date in X_test: {X_test['Date'].min()}')
print(f'Latest date in X_test: {X_test['Date'].max()}')

X_train: (460618, 16), y_train: (460618,)
Earliest date in X_train: 2013-07-03 00:00:00
Latest date in X_train: 2015-07-31 00:00:00

X_val: (115155, 16), y_val: (115155,)
Earliest date in X_val: 2013-01-01 00:00:00
Latest date in X_val: 2013-07-03 00:00:00

X_test: (22090, 16)
Earliest date in X_test: 2015-08-01 00:00:00
Latest date in X_test: 2015-09-17 00:00:00


In [24]:
class Promo2FeatureExtractor(BaseEstimator, TransformerMixin):
	def __init__(self, promo2_week_col='Promo2SinceWeek', promo2_year_col='Promo2SinceYear', main_date_col='Date'):
		self.promo2_week_col = promo2_week_col
		self.promo2_year_col = promo2_year_col
		self.main_date_col = main_date_col
		self.earliest_date = None

	def fit(self, X, y=None):
		if not pd.api.types.is_datetime64_any_dtype(X[self.main_date_col]):
			X[self.main_date_col] = pd.to_datetime(X[self.main_date_col])
		self.earliest_date = X[self.main_date_col].min()
		return self

	def transform(self, X):
		X_transformed = X.copy()
		X_transformed['Promo2Since_WasMissing'] = ( X_transformed[self.promo2_week_col].isna() | X_transformed[self.promo2_year_col].isna() ).astype(int)

		X_transformed[f'{self.promo2_year_col}_filled'] = X_transformed[self.promo2_year_col].fillna(self.earliest_date.year).astype(int)
		# Use 1 for week to ensure valid date conversion, the duration calculation will correct for this
		X_transformed[f'{self.promo2_week_col}_filled'] = X_transformed[self.promo2_week_col].fillna(1).astype(int)

		X_transformed['Promo2SinceDate_str'] = \
			X_transformed[f'{self.promo2_year_col}_filled'].astype(str) + '-W' + \
			X_transformed[f'{self.promo2_week_col}_filled'].astype(str).str.zfill(2) + '-1'

		X_transformed['Promo2SinceDate'] = pd.to_datetime(X_transformed['Promo2SinceDate_str'], format='%Y-W%W-%w', errors='coerce')

		X_transformed['Promo2SinceDate'] = X_transformed['Promo2SinceDate'].fillna(self.earliest_date)

		if not pd.api.types.is_datetime64_any_dtype(X_transformed[self.main_date_col]):
			X_transformed[self.main_date_col] = pd.to_datetime(X_transformed[self.main_date_col])

		X_transformed['Promo2DurationDays'] = (X_transformed[self.main_date_col] - X_transformed['Promo2SinceDate']).dt.days
		X_transformed['Promo2DurationDays'] = X_transformed['Promo2DurationDays'].apply(lambda x: max(0, x))

		X_transformed['Promo2DurationMonths'] = np.floor(X_transformed['Promo2DurationDays'] / 30.437).astype(int)
		X_transformed['Promo2DurationYears'] = np.floor(X_transformed['Promo2DurationDays'] / 365.25).astype(int)

		return X_transformed[[
			'Promo2DurationDays',
			'Promo2DurationMonths',
			'Promo2DurationYears',
			'Promo2Since_WasMissing'
		]]

class DateFeatureExtractor(BaseEstimator, TransformerMixin):
	def __init__(self, date_column='Date', date_format='%Y-%m-%d'):
		self.date_column = date_column
		self.date_format = date_format

	def fit(self, X, y=None):
		return self

	def transform(self, X):
		self.X_transformed = pd.DataFrame(X, columns=[self.date_column])
		self.X_transformed[self.date_column] = pd.to_datetime(self.X_transformed[self.date_column], format=self.date_format)

		extracted_features = pd.DataFrame({
			'Month': self.X_transformed[self.date_column].dt.month,
			'Year': self.X_transformed[self.date_column].dt.year,
			'Day': self.X_transformed[self.date_column].dt.day,
			'Week': self.X_transformed[self.date_column].dt.isocalendar().week.astype(int),
			'IsWeekEnd': (self.X_transformed[self.date_column].dt.dayofweek >= 5).astype(int),
			'IsMonthStart': self.X_transformed[self.date_column].dt.is_month_start.astype(int),
			'IsMonthEnd': self.X_transformed[self.date_column].dt.is_month_end.astype(int),
			'DayName': self.X_transformed[self.date_column].dt.day_name(),
		}, index=self.X_transformed.index)

		return extracted_features

class CompetitionOpenFeatureExtractor(BaseEstimator, TransformerMixin):
	def __init__(self, competition_month_col='CompetitionOpenSinceMonth',
				competition_year_col='CompetitionOpenSinceYear',
				main_date_col='Date'):
		self.competition_month_col = competition_month_col
		self.competition_year_col = competition_year_col
		self.main_date_col = main_date_col
		self.earliest_date = None
		

	def fit(self, X, y=None):
		if not pd.api.types.is_datetime64_any_dtype(X[self.main_date_col]):
			X[self.main_date_col] = pd.to_datetime(X[self.main_date_col])
		self.earliest_date = X[self.main_date_col].min()
		return self

	def transform(self, X):
		X_transformed = X.copy()

		X_transformed[f'{self.competition_year_col}_filled'] = X_transformed[self.competition_year_col].fillna(self.earliest_date.year).astype(int)
		X_transformed[f'{self.competition_month_col}_filled'] = X_transformed[self.competition_month_col].fillna(self.earliest_date.month).astype(int)

		X_transformed['CompetitionOpenSinceDate_str'] = \
			X_transformed[f'{self.competition_year_col}_filled'].astype(str) + '-' + \
			X_transformed[f'{self.competition_month_col}_filled'].astype(str).str.zfill(2) + '-01'
		X_transformed['CompetitionOpenSinceDate'] = pd.to_datetime(X_transformed['CompetitionOpenSinceDate_str'], errors='coerce')

		X_transformed['CompetitionOpenSince_WasMissing'] = X_transformed['CompetitionOpenSinceDate'].isna().astype(int)
		X_transformed['CompetitionOpenSinceDate'] = X_transformed['CompetitionOpenSinceDate'].fillna(self.earliest_date)

		if not pd.api.types.is_datetime64_any_dtype(X_transformed[self.main_date_col]):
			X_transformed[self.main_date_col] = pd.to_datetime(X_transformed[self.main_date_col])

		X_transformed['CompetitionOpenDurationDays'] = (X_transformed[self.main_date_col] - X_transformed['CompetitionOpenSinceDate']).dt.days
		X_transformed['CompetitionOpenDurationDays'] = X_transformed['CompetitionOpenDurationDays'].apply(lambda x: max(0, x))

		X_transformed['CompetitionOpenDurationMonths'] = np.floor(X_transformed['CompetitionOpenDurationDays'] / 30.437).astype(int)
		X_transformed['CompetitionOpenDurationYears'] = np.floor(X_transformed['CompetitionOpenDurationDays'] / 365.25).astype(int)

		return X_transformed[[
			'CompetitionOpenDurationDays',
			'CompetitionOpenDurationMonths',
			'CompetitionOpenDurationYears',
			'CompetitionOpenSince_WasMissing'
		]]

class ToDataFrameTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return pd.DataFrame(X, columns=self.columns)

In [25]:
date_features_col = ['Date']
competition_open_cols = ['CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Date']
promo2_since_cols = ['Promo2SinceWeek', 'Promo2SinceYear', 'Date']

original_numerical_features = ['CompetitionDistance', 'DayOfWeek', 'Store']
original_categorical_features = ['StoreType', 'Assortment', 'Open', 'Promo', 'SchoolHoliday', 'Promo2', 'StateHoliday', 'PromoInterval']

original_feats_prep = ColumnTransformer(
    transformers=[
        ('date_feats', Pipeline([('date_extractor', DateFeatureExtractor(date_column='Date'))]), date_features_col),
        ('comp_open_feats', Pipeline([('competition_extractor', CompetitionOpenFeatureExtractor(
            competition_month_col='CompetitionOpenSinceMonth',
            competition_year_col='CompetitionOpenSinceYear',
            main_date_col='Date'
        ))]), competition_open_cols),
        ('promo2_since_feats', Pipeline([('promo2_extractor', Promo2FeatureExtractor(
            promo2_week_col='Promo2SinceWeek',
            promo2_year_col='Promo2SinceYear',
            main_date_col='Date'
        ))]), promo2_since_cols),
        ('orig_num', 'passthrough', original_numerical_features),
        ('orig_cat', 'passthrough', original_categorical_features),
    ],
    remainder='drop', # Drop 'Id' or any other columns not specified
    verbose_feature_names_out=False # Makes output names cleaner for debugging
)


final_numerical_features = [
    'Day', 'Month', 'Year', 'Week', 'IsWeekEnd', 'IsMonthStart', 'IsMonthEnd',
    'CompetitionOpenDurationDays', 'CompetitionOpenDurationMonths', 'CompetitionOpenDurationYears',
    'Promo2DurationDays', 'Promo2DurationMonths', 'Promo2DurationYears',
    'CompetitionDistance', 'DayOfWeek', 'Store'
]
final_categorical_features = [
    'DayName',
    'CompetitionOpenSince_WasMissing', 'Promo2Since_WasMissing',
    'StoreType', 'Assortment', 'Open', 'Promo', 'SchoolHoliday', 'Promo2', 'StateHoliday', 'PromoInterval'
]

preprocessor_for_original_prep = ColumnTransformer(
    transformers=[
        ('scaled_num', StandardScaler(), final_numerical_features),
        ('encoded_cat', OneHotEncoder(handle_unknown='ignore'), final_categorical_features)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

all_features = [
    # DateFeatureExtractor output
    'Month', 'Year', 'Day', 'Week', 'IsWeekEnd', 'IsMonthStart', 'IsMonthEnd', 'DayName',
    # CompetitionOpenFeatureExtractor output
    'CompetitionOpenDurationDays', 'CompetitionOpenDurationMonths', 'CompetitionOpenDurationYears', 'CompetitionOpenSince_WasMissing',
    # Promo2FeatureExtractor output
    'Promo2DurationDays', 'Promo2DurationMonths', 'Promo2DurationYears', 'Promo2Since_WasMissing',
    # original_numerical_features
    'CompetitionDistance', 'DayOfWeek', 'Store',
    # original_categorical_features
    'StoreType', 'Assortment', 'Open', 'Promo', 'SchoolHoliday', 'Promo2', 'StateHoliday', 'PromoInterval'
]

preprocessor = Pipeline(steps=[
    ('feature_engineering', original_feats_prep),
    ('to_df', ToDataFrameTransformer(columns=all_features)),
    ('final_processing', preprocessor_for_original_prep)
])

In [26]:
preprocessor.fit(X_train)

X_train = preprocessor.transform(X_train)
X_val = preprocessor.transform(X_val)
X_test = preprocessor.transform(X_test)

print(f"X_train: {X_train.shape}")
print(f"X_val: {X_val.shape}")
print(f"X_test: {X_test.shape}")

X_train: (460618, 47)
X_val: (115155, 47)
X_test: (22090, 47)


In [27]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
0,1.734384,0.172744,1.307407,0.335141,-0.459686,-0.150288,5.268593,0.261023,0.254488,0.183517,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.734384,0.172744,1.307407,0.335141,-0.459686,-0.150288,5.268593,0.403091,0.396353,0.354853,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.734384,0.172744,1.307407,0.335141,-0.459686,-0.150288,5.268593,0.559133,0.552404,0.526190,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.734384,0.172744,1.307407,0.335141,-0.459686,-0.150288,5.268593,0.091007,0.084251,0.012180,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.734384,0.172744,1.307407,0.335141,-0.459686,-0.150288,5.268593,-0.858287,-0.866242,-0.844502,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
460613,-1.489312,0.172744,-1.415836,0.061589,-0.459686,-0.150288,-0.189804,0.830227,0.836133,0.868863,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
460614,-1.489312,0.172744,-1.415836,0.061589,-0.459686,-0.150288,-0.189804,0.531651,0.538217,0.526190,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
460615,-1.489312,0.172744,-1.415836,0.061589,-0.459686,-0.150288,-0.189804,-0.914648,-0.908801,-0.844502,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
460616,-1.489312,0.172744,-1.415836,0.061589,-0.459686,-0.150288,-0.189804,-0.914648,-0.908801,-0.844502,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [28]:
pd.DataFrame(X_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
0,-1.719575,0.469203,1.307407,0.335141,2.175399,6.653873,-0.189804,0.673253,0.680082,0.697526,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-1.719575,0.469203,1.307407,0.335141,2.175399,6.653873,-0.189804,-0.447454,-0.454834,-0.501829,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-1.719575,0.469203,1.307407,0.335141,2.175399,6.653873,-0.189804,0.758028,0.751014,0.697526,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-1.719575,0.469203,1.307407,0.335141,2.175399,6.653873,-0.189804,-0.574616,-0.582512,-0.673166,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-1.719575,0.469203,1.307407,0.335141,2.175399,6.653873,-0.189804,0.389583,0.382166,0.354853,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22085,0.122536,0.765663,1.307407,0.813857,-0.459686,-0.150288,-0.189804,-0.481457,-0.483207,-0.501829,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
22086,0.122536,0.765663,1.307407,0.813857,-0.459686,-0.150288,-0.189804,-0.425561,-0.426461,-0.501829,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22087,0.122536,0.765663,1.307407,0.813857,-0.459686,-0.150288,-0.189804,0.326235,0.325421,0.354853,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
22088,0.122536,0.765663,1.307407,0.813857,-0.459686,-0.150288,-0.189804,-0.183813,-0.185292,-0.159156,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [29]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [30]:
regression_models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(random_state=42),
    "Lasso Regression": Lasso(random_state=42),
    "ElasticNet Regression": ElasticNet(random_state=42),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(random_state=42),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42),
    "Support Vector Regressor": SVR(),
    "K-Neighbors Regressor": KNeighborsRegressor()
}

In [None]:
def rmspe(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)

    non_zero_mask = (y_true != 0)

    if not np.any(non_zero_mask):
        if np.all(y_pred == 0):
            return 0.0
        else:
            return np.nan

    y_true_filtered = y_true[non_zero_mask]
    y_pred_filtered = y_pred[non_zero_mask]

    squared_percentage_error = ((y_true_filtered - y_pred_filtered) / y_true_filtered) ** 2

    rmspe_value = np.sqrt(np.mean(squared_percentage_error))

    return np.round(rmspe_value * 100, 5)

In [None]:
model_processor = Pipeline(
	steps=[('regressor', None)]
)

result = {}

for model_name, model in regression_models.items():
	model_processor.set_params(regressor=model)
	model_processor.fit(X_train, y_train)

	y_train_pred = model_processor.predict(X_train)
	y_val_pred = model_processor.predict(X_val)

	training_rmspe = rmspe(y_train, y_train_pred)
	validation_rmspe = rmspe(y_val, y_val_pred)

	result[model_name] = {
		'training_rmspe': training_rmspe,
		'validation_rmspe': validation_rmspe
	}

	print(f'{model_name}:\n   RMSPE on Training Score: {training_rmspe}\n   RMSPE on Validation Score: {validation_rmspe}\n')

print(result)

Linear Regression:
   RMSPE on Training Score: 63.73843
   RMSPE on Validation Score: 66.30149

Ridge Regression:
   RMSPE on Training Score: 63.78484
   RMSPE on Validation Score: 66.30945

Lasso Regression:
   RMSPE on Training Score: 63.78208
   RMSPE on Validation Score: 66.38389

ElasticNet Regression:
   RMSPE on Training Score: 63.88903
   RMSPE on Validation Score: 62.93109

Decision Tree Regressor:
   RMSPE on Training Score: 0.0
   RMSPE on Validation Score: 84.82887

