## Setup

In [1]:
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px

## Import Data

In [2]:
holidays = pd.read_csv("../datasets/holidays_events.csv")
oil = pd.read_csv("../datasets/oil.csv")
stores = pd.read_csv("../datasets/stores.csv")
transactions = pd.read_csv("../datasets/transactions.csv") 

train = pd.read_csv("../datasets/train.csv")
test = pd.read_csv("../datasets/test.csv")


holidays["date"] = pd.to_datetime(holidays["date"])
oil["date"] = pd.to_datetime(oil["date"])
transactions["date"] = pd.to_datetime(transactions["date"])

train["date"] = pd.to_datetime(train["date"])
test["date"] = pd.to_datetime(test["date"])

## Data Transformations and Pipeline Building 

In [3]:
train

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [4]:
from sklearn.model_selection import train_test_split

train.drop(train[(train["date"] >= "01.04.2016") & (train["date"] < "01.05.2016")].index, axis="rows", inplace=True)

X_train, X_test, y_train, y_test = train_test_split(train.drop(["sales"], axis="columns"), train.loc[:, "sales"], test_size=0.1)

### Transactions Based Transformers

In [68]:
from sklearn.base import BaseEstimator, TransformerMixin

class WeekdayTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X.loc[:, "date"] = pd.to_datetime(X["date"])
        X.loc[:, "is_Monday"] = X["date"].apply(lambda x: 1 if x.weekday() == 0 else 0)
        X.loc[:, "is_Thuesday"] = X["date"].apply(lambda x: 1 if x.weekday() == 1 else 0)
        X.loc[:, "is_Wednessday"] = X["date"].apply(lambda x: 1 if x.weekday() == 2 else 0)
        X.loc[:, "is_Thursday"] = X["date"].apply(lambda x: 1 if x.weekday() == 3 else 0)
        X.loc[:, "is_Friday"] = X["date"].apply(lambda x: 1 if x.weekday() == 4 else 0)
        X.loc[:, "is_Saturday"] = X["date"].apply(lambda x: 1 if x.weekday() == 5 else 0)
        X.loc[:, "is_Sunday"] = X["date"].apply(lambda x: 1 if x.weekday() == 6 else 0)
        return X


In [6]:
weekday_transformer = WeekdayTransformer()
weekday_transformer.fit_transform(X_train)

Unnamed: 0,id,date,store_nbr,family,onpromotion,is_Monday,is_Thuesday,is_Wednessday,is_Thursday,is_Friday,is_Saturday,is_Sunday
1969829,1969829,2016-01-14,29,PET SUPPLIES,0,0,0,0,1,0,0,0
16250,16250,2013-01-10,15,HARDWARE,0,0,0,0,1,0,0,0
2246621,2246621,2016-06-17,45,HARDWARE,0,0,0,0,0,1,0,0
2836120,2836120,2017-05-15,36,BABY CARE,0,1,0,0,0,0,0,0
311549,311549,2013-06-24,5,PREPARED FOODS,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1633966,1633966,2015-07-08,6,BOOKS,0,0,0,1,0,0,0,0
1327237,1327237,2015-01-17,49,EGGS,1,0,0,0,0,0,1,0
1416328,1416328,2015-03-08,49,BABY CARE,0,0,0,0,0,0,0,1
2940120,2940120,2017-07-12,53,HOME CARE,13,0,0,1,0,0,0,0


### Oil Based Transformers

In [7]:
class OilTransformer(BaseEstimator, TransformerMixin):

    def __init__ (self, oil_df):
        self.oil = oil_df

    def fit (self, X, y=None):
        oil_interpolated = pd.DataFrame()
        oil_interpolated["date"] = pd.date_range(start="2013-01-01", end="2017-08-15")
        oil_interpolated = oil_interpolated.merge(self.oil, on="date", how="left")
        oil_interpolated["dcoilwtico_interpolated"] = oil_interpolated["dcoilwtico"].interpolate()
        oil_interpolated["dcoilwtico_interpolated"].bfill()

        self.oil = oil_interpolated

        return self

    def transform(self, X, y=None):
        X = X.merge(self.oil.loc[:, ["date", "dcoilwtico_interpolated"]], on="date")
        return X
        

In [8]:
oil_transformer = OilTransformer(oil)
oil_transformer.fit_transform(X_train)

Unnamed: 0,id,date,store_nbr,family,onpromotion,is_Monday,is_Thuesday,is_Wednessday,is_Thursday,is_Friday,is_Saturday,is_Sunday,dcoilwtico_interpolated
0,1969829,2016-01-14,29,PET SUPPLIES,0,0,0,0,1,0,0,0,31.220000
1,16250,2013-01-10,15,HARDWARE,0,0,0,0,1,0,0,0,93.810000
2,2246621,2016-06-17,45,HARDWARE,0,0,0,0,0,1,0,0,48.000000
3,2836120,2017-05-15,36,BABY CARE,0,1,0,0,0,0,0,0,48.860000
4,311549,2013-06-24,5,PREPARED FOODS,0,1,0,0,0,0,0,0,95.070000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2699190,1633966,2015-07-08,6,BOOKS,0,0,0,1,0,0,0,0,51.610000
2699191,1327237,2015-01-17,49,EGGS,1,0,0,0,0,0,1,0,48.065000
2699192,1416328,2015-03-08,49,BABY CARE,0,0,0,0,0,0,0,1,49.836667
2699193,2940120,2017-07-12,53,HOME CARE,13,0,0,1,0,0,0,0,45.480000


### Stores Based Transformers

In [9]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

class StoresTransformer(BaseEstimator, TransformerMixin):

    def __init__ (self, stores_df):
        self.stores = stores_df

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.merge(stores, on="store_nbr")

        type_encoder = OrdinalEncoder()
        city_state_encoder = OneHotEncoder(sparse_output=False)

        X["type"] = type_encoder.fit_transform(X[["type"]])
        city_state_encoding_result = pd.DataFrame(city_state_encoder.fit_transform(X[["city", "state"]]), columns=city_state_encoder.get_feature_names_out())
        city_state_encoding_result = city_state_encoding_result.astype("bool")
        X = pd.concat([X, city_state_encoding_result], axis="columns")
        X = X.drop(["state", "city"], axis="columns")
        return X


In [10]:
stores_transformer = StoresTransformer(stores)
stores_transformer.fit_transform(X_train)

Unnamed: 0,id,date,store_nbr,family,onpromotion,is_Monday,is_Thuesday,is_Wednessday,is_Thursday,is_Friday,...,state_Guayas,state_Imbabura,state_Loja,state_Los Rios,state_Manabi,state_Pastaza,state_Pichincha,state_Santa Elena,state_Santo Domingo de los Tsachilas,state_Tungurahua
0,1969829,2016-01-14,29,PET SUPPLIES,0,0,0,0,1,0,...,True,False,False,False,False,False,False,False,False,False
1,16250,2013-01-10,15,HARDWARE,0,0,0,0,1,0,...,False,True,False,False,False,False,False,False,False,False
2,2246621,2016-06-17,45,HARDWARE,0,0,0,0,0,1,...,False,False,False,False,False,False,True,False,False,False
3,2836120,2017-05-15,36,BABY CARE,0,1,0,0,0,0,...,True,False,False,False,False,False,False,False,False,False
4,311549,2013-06-24,5,PREPARED FOODS,0,1,0,0,0,0,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2699190,1633966,2015-07-08,6,BOOKS,0,0,0,1,0,0,...,False,False,False,False,False,False,True,False,False,False
2699191,1327237,2015-01-17,49,EGGS,1,0,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False
2699192,1416328,2015-03-08,49,BABY CARE,0,0,0,0,0,0,...,False,False,False,False,False,False,True,False,False,False
2699193,2940120,2017-07-12,53,HOME CARE,13,0,0,1,0,0,...,False,False,False,False,True,False,False,False,False,False


### Holidays Based Transformer

In [11]:
from numpy import ndarray


class HolidaysTransformer(BaseEstimator, TransformerMixin):

    def __init__ (self, holidays):
        self.holidays = holidays

    @staticmethod
    def holidays_transformations(holidays:pd.DataFrame, df:pd.DataFrame):
        holidays.drop(holidays[holidays["transferred"] == True].index, inplace=True)
        work_days = holidays[holidays["type"] == "Work Day"]
        free_days = holidays[holidays["type"] != "Work Day"]  


        local_holidays = free_days[free_days["locale"] == "Local"]
        regional_holidays = free_days[free_days["locale"] == "Regional"]
        national_holidays = free_days[free_days["locale"] == "National"]

    
        df["is_Holiday"] = 0

        for idx in range(0, local_holidays.shape[0]):
            df.loc[(df["date"] == local_holidays.iloc[idx]["date"]) & 
                (df["city_" + local_holidays.iloc[idx]["locale_name"]] == local_holidays.iloc[idx]["locale_name"]), "is_Holiday"] = 1

        for idx in range(0, regional_holidays.shape[0]):
            df.loc[(df["date"] == regional_holidays.iloc[idx]["date"]) & 
                (df["state_" + regional_holidays.iloc[idx]["locale_name"]] == regional_holidays.iloc[idx]["locale_name"]), "is_Holiday"] = 1

        for idx in range(0, national_holidays.shape[0]):
            df.loc[df["date"] ==national_holidays.iloc[idx]["date"], "is_Holiday"]= 1

        return df
    

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = self.holidays_transformations(self.holidays, X)
        return X


### Outliers Transformer

In [12]:
X_train.drop(X_train[(X_train["date"] >= "01.04.2016") & (X_train["date"] < "01.05.2016")].index, axis="rows")

Unnamed: 0,id,date,store_nbr,family,onpromotion,is_Monday,is_Thuesday,is_Wednessday,is_Thursday,is_Friday,is_Saturday,is_Sunday
1969829,1969829,2016-01-14,29,PET SUPPLIES,0,0,0,0,1,0,0,0
16250,16250,2013-01-10,15,HARDWARE,0,0,0,0,1,0,0,0
2246621,2246621,2016-06-17,45,HARDWARE,0,0,0,0,0,1,0,0
2836120,2836120,2017-05-15,36,BABY CARE,0,1,0,0,0,0,0,0
311549,311549,2013-06-24,5,PREPARED FOODS,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1633966,1633966,2015-07-08,6,BOOKS,0,0,0,1,0,0,0,0
1327237,1327237,2015-01-17,49,EGGS,1,0,0,0,0,0,1,0
1416328,1416328,2015-03-08,49,BABY CARE,0,0,0,0,0,0,0,1
2940120,2940120,2017-07-12,53,HOME CARE,13,0,0,1,0,0,0,0


In [13]:
class OutliersTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.drop(X[(X["date"] >= "01.04.2016") & (X["date"] < "01.05.2016")].index, axis="rows")
        
        return X

### Custom Predictor

In [14]:
temp = train.groupby(["family", "store_nbr"])["sales"].sum().reset_index()
for x, y in temp[temp["sales"] == 0].apply(lambda x : [x["family"], x["store_nbr"]], axis="columns").to_list():
    print (x, y)

BABY CARE 1
BABY CARE 13
BABY CARE 23
BABY CARE 44
BABY CARE 45
BABY CARE 46
BABY CARE 47
BABY CARE 48
BABY CARE 49
BABY CARE 50
BABY CARE 51
BABY CARE 52
BOOKS 9
BOOKS 10
BOOKS 11
BOOKS 12
BOOKS 13
BOOKS 14
BOOKS 15
BOOKS 16
BOOKS 17
BOOKS 18
BOOKS 19
BOOKS 20
BOOKS 21
BOOKS 22
BOOKS 28
BOOKS 29
BOOKS 30
BOOKS 31
BOOKS 32
BOOKS 33
BOOKS 34
BOOKS 35
BOOKS 36
BOOKS 39
BOOKS 40
BOOKS 43
BOOKS 52
BOOKS 54
LADIESWEAR 16
LADIESWEAR 25
LADIESWEAR 28
LADIESWEAR 29
LADIESWEAR 32
LADIESWEAR 33
LADIESWEAR 35
LADIESWEAR 40
LADIESWEAR 43
LADIESWEAR 54
LAWN AND GARDEN 14
LAWN AND GARDEN 30
LAWN AND GARDEN 54


In [74]:
from sklearn.base import RegressorMixin
from sklearn.ensemble import RandomForestRegressor

class SplittedPredictor(RegressorMixin, BaseEstimator):

    def __init__ (self, dfx):
        self.dfx = dfx
        self.families = list(dfx["family"].unique())
        self.models = {}
        for family in self.families:
            self.models[family] = RandomForestRegressor()

        self.store_without_items = None

    def __split_dataframe(self, X:pd.DataFrame, y:pd.DataFrame=None) -> dict | tuple[dict] :        

        if y is None:
            splitted_dataframes = {}
            for family in self.families:
                splitted_dataframes[family] = X[X["family"] == family]

            return splitted_dataframes
    
        target = pd.DataFrame({"target" : y})
        temp = pd.concat([X, target], axis=1)
       
        for family, store_nbr in self.store_without_items:
            temp.drop(temp[(temp["family"] == family) & (temp["store_nbr"] == store_nbr)].index, axis="rows", inplace=True)
       
        splitted_dataframes_train = {}
        splitted_dataframes_target = {}
       
        for family in self.families :
            splitted_dataframes_train[family] = temp[temp["family"] == family].drop(["target"], axis="columns")
            splitted_dataframes_target[family] = temp[temp["family"] == family].loc[:, "target"]
       
        return splitted_dataframes_train, splitted_dataframes_target

    def __find_stores_without_items(self, X:pd.DataFrame, y:pd.DataFrame) -> list:
        target = pd.DataFrame({"target" : y})
        temp = pd.concat([X, target], axis=1)
        aggregated_train = temp.groupby(["family", "store_nbr"])["target"].sum().reset_index()
        stores_without_items = aggregated_train[aggregated_train["target"] == 0].apply(lambda x : [x["family"], x["store_nbr"]], axis="columns").to_list()
        
        return stores_without_items

        

    def fit(self, X, y=None):
        self.store_without_items = self.__find_stores_without_items(X, y)
        splitted_dataframes_train, splitted_dataframes_target = self.__split_dataframe(X, y)
        
        for family in self.families:
            self.models[family].fit(splitted_dataframes_train[family].drop(["family", "date"], axis="columns"), splitted_dataframes_target[family].to_numpy())
    
        return self
            
    def predict(self, X):

        splitted_X = self.__split_dataframe(X)
        splitted_predictions = {}

        result = None
        
        for family in self.families:
            if (type(splitted_X[family]) == pd.DataFrame):
                splitted_predictions[family] = self.models[family].predict(splitted_X[family].drop(["family", "date"], axis="columns"))
                if result is None:
                    result = pd.concat([splitted_X[family], pd.DataFrame( {"target" : splitted_predictions[family] }, index=splitted_X[family].index)], axis=1)
                else : result = pd.concat([result, pd.concat([splitted_X[family], pd.DataFrame( {"target" : splitted_predictions[family] }, index=splitted_X[family].index)] , axis=1)],  axis=0)

        for family, store_nbr in self.store_without_items:
            result.loc[((result["family"] == family) & (result["store_nbr"] == store_nbr)), "target"] = 0

        
        return result

In [69]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
                ("weekdays", WeekdayTransformer()), 
                ("oil", OilTransformer(oil)),
                ("stores", StoresTransformer(stores)),
                ("holidays", HolidaysTransformer(holidays))
])

x = pipeline.fit_transform(X_train)
x

Unnamed: 0,id,date,store_nbr,family,onpromotion,is_Monday,is_Thuesday,is_Wednessday,is_Thursday,is_Friday,...,state_Imbabura,state_Loja,state_Los Rios,state_Manabi,state_Pastaza,state_Pichincha,state_Santa Elena,state_Santo Domingo de los Tsachilas,state_Tungurahua,is_Holiday
0,1969829,2016-01-14,29,PET SUPPLIES,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,False,0
1,16250,2013-01-10,15,HARDWARE,0,0,0,0,1,0,...,True,False,False,False,False,False,False,False,False,0
2,2246621,2016-06-17,45,HARDWARE,0,0,0,0,0,1,...,False,False,False,False,False,True,False,False,False,0
3,2836120,2017-05-15,36,BABY CARE,0,1,0,0,0,0,...,False,False,False,False,False,False,False,False,False,0
4,311549,2013-06-24,5,PREPARED FOODS,0,1,0,0,0,0,...,False,False,False,False,False,False,False,True,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2699190,1633966,2015-07-08,6,BOOKS,0,0,0,1,0,0,...,False,False,False,False,False,True,False,False,False,0
2699191,1327237,2015-01-17,49,EGGS,1,0,0,0,0,0,...,False,False,False,False,False,True,False,False,False,0
2699192,1416328,2015-03-08,49,BABY CARE,0,0,0,0,0,0,...,False,False,False,False,False,True,False,False,False,0
2699193,2940120,2017-07-12,53,HOME CARE,13,0,0,1,0,0,...,False,False,False,True,False,False,False,False,False,0


In [None]:
predator = SplittedPredictor(X_train)
predator.fit(X_train, y_train)