## Setup

In [1]:
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px

from os.path import exists
import pickle

## Import Data

In [2]:
holidays = pd.read_csv("../datasets/holidays_events.csv")
oil = pd.read_csv("../datasets/oil.csv")
stores = pd.read_csv("../datasets/stores.csv")
transactions = pd.read_csv("../datasets/transactions.csv") 

train = pd.read_csv("../datasets/train.csv")
test = pd.read_csv("../datasets/test.csv")


holidays["date"] = pd.to_datetime(holidays["date"])
oil["date"] = pd.to_datetime(oil["date"])
transactions["date"] = pd.to_datetime(transactions["date"])

train["date"] = pd.to_datetime(train["date"])
test["date"] = pd.to_datetime(test["date"])

## Data Transformations and Pipeline Building 

In [3]:
train

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [4]:
train

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [5]:
from sklearn.model_selection import train_test_split

train.drop(train[(train["date"] >= "01.04.2016") & (train["date"] < "01.05.2016")].index, axis="rows", inplace=True)

X_train, X_test, y_train, y_test = train_test_split(train.drop(["sales"], axis="columns"), train.loc[:, "sales"], test_size=0.1)

### Transactions Based Transformers

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class WeekdayTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X.loc[:, "date"] = pd.to_datetime(X["date"])
        X.loc[:, "is_Monday"] = X["date"].apply(lambda x: 1 if x.weekday() == 0 else 0)
        X.loc[:, "is_Thuesday"] = X["date"].apply(lambda x: 1 if x.weekday() == 1 else 0)
        X.loc[:, "is_Wednessday"] = X["date"].apply(lambda x: 1 if x.weekday() == 2 else 0)
        X.loc[:, "is_Thursday"] = X["date"].apply(lambda x: 1 if x.weekday() == 3 else 0)
        X.loc[:, "is_Friday"] = X["date"].apply(lambda x: 1 if x.weekday() == 4 else 0)
        X.loc[:, "is_Saturday"] = X["date"].apply(lambda x: 1 if x.weekday() == 5 else 0)
        X.loc[:, "is_Sunday"] = X["date"].apply(lambda x: 1 if x.weekday() == 6 else 0)
        return X


In [7]:
weekday_transformer = WeekdayTransformer()
weekday_transformer.fit_transform(X_train)

Unnamed: 0,id,date,store_nbr,family,onpromotion,is_Monday,is_Thuesday,is_Wednessday,is_Thursday,is_Friday,is_Saturday,is_Sunday
875956,875956,2014-05-08,37,BOOKS,0,0,0,0,1,0,0,0
1335399,1335399,2015-01-22,28,LINGERIE,0,0,0,0,1,0,0,0
2635770,2635770,2017-01-23,14,PLAYERS AND ELECTRONICS,0,1,0,0,0,0,0,0
1312882,1312882,2015-01-09,46,EGGS,28,0,0,0,0,1,0,0
2793017,2793017,2017-04-21,26,PREPARED FOODS,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2130953,2130953,2016-04-13,5,FROZEN FOODS,2,0,0,1,0,0,0,0
327681,327681,2013-07-03,52,MEATS,0,0,0,1,0,0,0,0
610848,610848,2013-12-09,48,HOME CARE,0,1,0,0,0,0,0,0
879997,879997,2014-05-10,5,LADIESWEAR,0,0,0,0,0,0,1,0


### Oil Based Transformers

In [8]:
class OilTransformer(BaseEstimator, TransformerMixin):

    def __init__ (self, oil_df):
        self.oil = oil_df

    def fit (self, X, y=None):
        oil_interpolated = pd.DataFrame()
        oil_interpolated["date"] = pd.date_range(start="2013-01-01", end="2017-08-31")
        oil_interpolated = oil_interpolated.merge(self.oil, on="date", how="left")
        oil_interpolated["dcoilwtico_interpolated"] = oil_interpolated["dcoilwtico"].interpolate()
        oil_interpolated["dcoilwtico_interpolated"] = oil_interpolated["dcoilwtico_interpolated"].bfill()

        self.oil = oil_interpolated

        return self

    def transform(self, X, y=None):
        X = X.merge(self.oil.loc[:, ["date", "dcoilwtico_interpolated"]], on="date")
        return X
        

In [9]:
oil_transformer = OilTransformer(oil)
oil_transformer.fit_transform(X_train)

Unnamed: 0,id,date,store_nbr,family,onpromotion,is_Monday,is_Thuesday,is_Wednessday,is_Thursday,is_Friday,is_Saturday,is_Sunday,dcoilwtico_interpolated
0,875956,2014-05-08,37,BOOKS,0,0,0,0,1,0,0,0,100.52
1,1335399,2015-01-22,28,LINGERIE,0,0,0,0,1,0,0,0,45.93
2,2635770,2017-01-23,14,PLAYERS AND ELECTRONICS,0,1,0,0,0,0,0,0,52.77
3,1312882,2015-01-09,46,EGGS,28,0,0,0,0,1,0,0,48.35
4,2793017,2017-04-21,26,PREPARED FOODS,0,0,0,0,0,1,0,0,49.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2699190,2130953,2016-04-13,5,FROZEN FOODS,2,0,0,1,0,0,0,0,41.70
2699191,327681,2013-07-03,52,MEATS,0,0,0,1,0,0,0,0,101.92
2699192,610848,2013-12-09,48,HOME CARE,0,1,0,0,0,0,0,0,97.10
2699193,879997,2014-05-10,5,LADIESWEAR,0,0,0,0,0,0,1,0,100.51


### Stores Based Transformers

In [10]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

class StoresTransformer(BaseEstimator, TransformerMixin):

    def __init__ (self, stores_df):
        self.stores = stores_df

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.merge(stores, on="store_nbr")

        type_encoder = OrdinalEncoder()
        city_state_encoder = OneHotEncoder(sparse_output=False)

        X["type"] = type_encoder.fit_transform(X[["type"]])
        city_state_encoding_result = pd.DataFrame(city_state_encoder.fit_transform(X[["city", "state"]]), columns=city_state_encoder.get_feature_names_out())
        city_state_encoding_result = city_state_encoding_result.astype("bool")
        X = pd.concat([X, city_state_encoding_result], axis="columns")
        X = X.drop(["state", "city"], axis="columns")
        return X


In [11]:
stores_transformer = StoresTransformer(stores)
stores_transformer.fit_transform(X_train)

Unnamed: 0,id,date,store_nbr,family,onpromotion,is_Monday,is_Thuesday,is_Wednessday,is_Thursday,is_Friday,...,state_Guayas,state_Imbabura,state_Loja,state_Los Rios,state_Manabi,state_Pastaza,state_Pichincha,state_Santa Elena,state_Santo Domingo de los Tsachilas,state_Tungurahua
0,875956,2014-05-08,37,BOOKS,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,False,False
1,1335399,2015-01-22,28,LINGERIE,0,0,0,0,1,0,...,True,False,False,False,False,False,False,False,False,False
2,2635770,2017-01-23,14,PLAYERS AND ELECTRONICS,0,1,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
3,1312882,2015-01-09,46,EGGS,28,0,0,0,0,1,...,False,False,False,False,False,False,True,False,False,False
4,2793017,2017-04-21,26,PREPARED FOODS,0,0,0,0,0,1,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2699190,2130953,2016-04-13,5,FROZEN FOODS,2,0,0,1,0,0,...,False,False,False,False,False,False,False,False,True,False
2699191,327681,2013-07-03,52,MEATS,0,0,0,1,0,0,...,False,False,False,False,True,False,False,False,False,False
2699192,610848,2013-12-09,48,HOME CARE,0,1,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False
2699193,879997,2014-05-10,5,LADIESWEAR,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,True,False


### Holidays Based Transformer

In [12]:
from numpy import ndarray


class HolidaysTransformer(BaseEstimator, TransformerMixin):

    def __init__ (self, holidays):
        self.holidays = holidays

    @staticmethod
    def holidays_transformations(holidays:pd.DataFrame, df:pd.DataFrame):
        holidays.drop(holidays[holidays["transferred"] == True].index, inplace=True)
        free_days = holidays[holidays["type"] != "Work Day"]  


        local_holidays = free_days[free_days["locale"] == "Local"]
        regional_holidays = free_days[free_days["locale"] == "Regional"]
        national_holidays = free_days[free_days["locale"] == "National"]

    
        df["is_Holiday"] = 0

        for idx in range(0, local_holidays.shape[0]):
            df.loc[(df["date"] == local_holidays.iloc[idx]["date"]) & 
                (df["city_" + local_holidays.iloc[idx]["locale_name"]] == local_holidays.iloc[idx]["locale_name"]), "is_Holiday"] = 1

        for idx in range(0, regional_holidays.shape[0]):
            df.loc[(df["date"] == regional_holidays.iloc[idx]["date"]) & 
                (df["state_" + regional_holidays.iloc[idx]["locale_name"]] == regional_holidays.iloc[idx]["locale_name"]), "is_Holiday"] = 1

        for idx in range(0, national_holidays.shape[0]):
            df.loc[df["date"] ==national_holidays.iloc[idx]["date"], "is_Holiday"]= 1

        return df
    

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = self.holidays_transformations(self.holidays, X)
        return X


### Custom Predictor

In [13]:
from sklearn.base import RegressorMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils.validation import check_is_fitted

class SplittedPredictor(RegressorMixin, BaseEstimator):

    def __init__ (self, n_estimators=100, criterion="squared_error", max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.families = None
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.models = {}
        self.store_without_items = None


    def __sklearn_is_fitted__(self):
        return hasattr(self, "_is_fitted") and self._is_fitted
        

    def split_dataframe(self, X:pd.DataFrame, y:pd.DataFrame=None) -> dict | tuple[dict] :        

        if y is None:
            splitted_dataframes = {}
            for family in self.families:
                splitted_dataframes[family] = X[X["family"] == family]
            return splitted_dataframes


        X_dropped = X.copy()
        
        for family, store_nbr in self.store_without_items:
            X_dropped.drop(X_dropped[(X_dropped["family"] == family) & (X_dropped["store_nbr"] == store_nbr)].index, axis="rows", inplace=True)

        splitted_dataframes_train = {}
        splitted_dataframes_target = {}

        for family in self.families :
            splitted_dataframes_train[family] = X_dropped[X_dropped["family"] == family]
            splitted_dataframes_target[family] = y.loc[X_dropped[X_dropped["family"] == family].index]

        return splitted_dataframes_train, splitted_dataframes_target

    def find_stores_without_items(self, X:pd.DataFrame, y:pd.DataFrame) -> list:
        target = pd.DataFrame({"target" : y})
        X = X.sort_index()
        temp = pd.concat([X, target], axis=1)
        aggregated_train = temp.groupby(["family", "store_nbr"])["target"].sum().reset_index()
        stores_without_items = aggregated_train[aggregated_train["target"] == 0].apply(lambda x : [x["family"], x["store_nbr"]], axis="columns").to_list()

        
        return stores_without_items
    

    def set_params(self, **params):
        if not params:
            return self

        for key, value in params.items():
            if hasattr(self, key):
                setattr(self, key, value)
            else:
                self.kwargs[key] = value


        if "families" in params:
            self.families = params["families"]
        
        if "store_without_items" in params:
            self.store_without_items = params["store_without_items"]

        self.models = {}
        self.store_without_items = None
        self.families = None
                
        return self


    def fit(self, X, y=None):

        X = X.sort_values(by="id")
        y = y.sort_index()
        self.families = list(X["family"].unique())
        for family in self.families:
            self.models[family] = RandomForestRegressor(n_estimators=self.n_estimators,
                                                        criterion=self.criterion, 
                                                        max_depth=self.max_depth,
                                                        min_samples_split=self.min_samples_split,
                                                        min_samples_leaf=self.min_samples_leaf)

        self.store_without_items = self.find_stores_without_items(X, y)
        splitted_dataframes_train, splitted_dataframes_target = self.split_dataframe(X, y)
        
        for family in self.families:
            if len(splitted_dataframes_train[family]) > 0:
                self.models[family].fit(splitted_dataframes_train[family].drop(["family"], axis="columns"), splitted_dataframes_target[family].to_numpy())
                print(f"Fit for {family} complited")



        self._is_fitted = True
        print("Fit complited")
        return self
            
    def predict(self, X):
        check_is_fitted(self)
        

        splitted_X = self.split_dataframe(X)
        splitted_predictions = {}

        result = None
        
        for family in self.families:
            if hasattr(self.models[family], 'estimators_') and len(self.models[family].estimators_) > 0:
                splitted_predictions[family] = self.models[family].predict(splitted_X[family].drop(["family"], axis="columns"))                
            else:
                splitted_predictions[family] = np.zeros(splitted_X[family].shape[0])    
            if result is None:
                result = pd.concat([splitted_X[family], pd.DataFrame( {"target" : splitted_predictions[family] }, index=splitted_X[family].index)], axis=1)
            else : result = pd.concat([result, pd.concat([splitted_X[family], pd.DataFrame( {"target" : splitted_predictions[family] }, index=splitted_X[family].index)] , axis=1)],  axis=0)

        for family, store_nbr in self.store_without_items:
            result.loc[((result["family"] == family) & (result["store_nbr"] == store_nbr)), "target"] = 0

        
        return result.sort_index().loc[:, "target"]

In [14]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
                ("weekdays", WeekdayTransformer()), 
                ("oil", OilTransformer(oil)),
                ("stores", StoresTransformer(stores)),
                ("holidays", HolidaysTransformer(holidays))
])

x = pipeline.fit_transform(pd.concat([X_train, y_train], axis=1))
x

Unnamed: 0,id,date,store_nbr,family,onpromotion,is_Monday,is_Thuesday,is_Wednessday,is_Thursday,is_Friday,...,state_Imbabura,state_Loja,state_Los Rios,state_Manabi,state_Pastaza,state_Pichincha,state_Santa Elena,state_Santo Domingo de los Tsachilas,state_Tungurahua,is_Holiday
0,875956,2014-05-08,37,BOOKS,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,False,0
1,1335399,2015-01-22,28,LINGERIE,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,False,0
2,2635770,2017-01-23,14,PLAYERS AND ELECTRONICS,0,1,0,0,0,0,...,False,False,False,False,False,False,False,False,False,0
3,1312882,2015-01-09,46,EGGS,28,0,0,0,0,1,...,False,False,False,False,False,True,False,False,False,0
4,2793017,2017-04-21,26,PREPARED FOODS,0,0,0,0,0,1,...,False,False,False,False,False,False,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2699190,2130953,2016-04-13,5,FROZEN FOODS,2,0,0,1,0,0,...,False,False,False,False,False,False,False,True,False,0
2699191,327681,2013-07-03,52,MEATS,0,0,0,1,0,0,...,False,False,False,True,False,False,False,False,False,0
2699192,610848,2013-12-09,48,HOME CARE,0,1,0,0,0,0,...,False,False,False,False,False,True,False,False,False,0
2699193,879997,2014-05-10,5,LADIESWEAR,0,0,0,0,0,0,...,False,False,False,False,False,False,False,True,False,1


## Hyperparametr tuning

In [15]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.metrics import r2_score

distributions = {
                 "n_estimators" : sp_randint(50, 1000), 
                 "criterion" : ["squared_error", "absolute_error", "friedman_mse", "poisson"], 
                 "max_depth" : sp_randint(10, 1000),
                 "min_samples_split" : sp_randint(2, 5),
                 "min_samples_leaf" : sp_randint(1, 5),
                 }

sc = SplittedPredictor()

ramdom_search = RandomizedSearchCV(sc, param_distributions=distributions, n_iter=5, cv=3, n_jobs=4)
ramdom_search

In [16]:
X_train, X_test, y_train, y_test = train_test_split(train.drop(["sales"], axis="columns"), train.loc[:, "sales"], test_size=0.2)
x_test = pipeline.fit_transform(pd.concat([X_test, y_test], axis=1))
tempp = OrdinalEncoder()
x_test["family"] = tempp.fit_transform(x_test[["family"]]).astype(int)
x_test = x_test.set_index("id").sort_index()

In [17]:
ramdom_search.fit(x_test.drop(["sales", "date"], axis="columns"), x_test["sales"])   

In [None]:
ramdom_search.best_score_

0.7856423282905228

After Random search (on another pc) best params:

- 'criterion': 'squared_error',
- 'max_depth': 294,
- 'min_samples_leaf': 3,
- 'min_samples_split': 3,
- 'n_estimators': 93

Best score : 0.8062215737383248

## Final Model

In [19]:
from sklearn.preprocessing import OrdinalEncoder

X_train_pr = x.drop(["sales", "date"] , axis="columns")
y_train_pr = x.loc[:, "sales"]

predator = SplittedPredictor(criterion="squared_error", max_depth=300, n_estimators=100, min_samples_leaf=3, min_samples_split=3)
predator.fit(X_train_pr, y_train_pr)

Fit for AUTOMOTIVE complited
Fit for BABY CARE complited
Fit for BEAUTY complited
Fit for BOOKS complited
Fit for CELEBRATION complited
Fit for CLEANING complited
Fit for DAIRY complited
Fit for DELI complited
Fit for EGGS complited
Fit for FROZEN FOODS complited
Fit for GROCERY I complited
Fit for GROCERY II complited
Fit for HARDWARE complited
Fit for HOME AND KITCHEN I complited
Fit for HOME AND KITCHEN II complited
Fit for HOME APPLIANCES complited
Fit for HOME CARE complited
Fit for LADIESWEAR complited
Fit for LIQUOR,WINE,BEER complited
Fit for MAGAZINES complited
Fit for MEATS complited
Fit for PET SUPPLIES complited
Fit for PLAYERS AND ELECTRONICS complited
Fit for POULTRY complited
Fit for PREPARED FOODS complited
Fit for PRODUCE complited
Fit for SCHOOL AND OFFICE SUPPLIES complited
Fit for SEAFOOD complited
Fit for BREAD/BAKERY complited
Fit for LAWN AND GARDEN complited
Fit for LINGERIE complited
Fit for PERSONAL CARE complited
Fit for BEVERAGES complited
Fit complited


In [20]:
test_df = pd.read_csv("../datasets/test.csv")
test_df["date"] = pd.to_datetime(test_df["date"])
predictions_test = predator.predict(pipeline.fit_transform(test_df).drop("date", axis="columns"))
predictions_test

0           4.133982
1           0.000000
2           6.271463
3        2257.680761
4           0.054762
            ...     
28507     359.215600
28508      96.606957
28509    1158.619960
28510     148.565144
28511      15.125420
Name: target, Length: 28512, dtype: float64

In [25]:
predictions_test.index = test_df["id"]
predictions_test

id
3000888       4.133982
3000889       0.000000
3000890       6.271463
3000891    2257.680761
3000892       0.054762
              ...     
3029395     359.215600
3029396      96.606957
3029397    1158.619960
3029398     148.565144
3029399      15.125420
Name: target, Length: 28512, dtype: float64

In [26]:
predictions_test.to_csv("submit.csv") #score 0.49732