## Setup

In [4]:
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px

## Import Data

In [2]:
holidays = pd.read_csv("../datasets/holidays_events.csv")
oil = pd.read_csv("../datasets/oil.csv")
stores = pd.read_csv("../datasets/stores.csv")
transactions = pd.read_csv("../datasets/transactions.csv") 

train = pd.read_csv("../datasets/train.csv")
test = pd.read_csv("../datasets/test.csv")


holidays["date"] = pd.to_datetime(holidays["date"])
oil["date"] = pd.to_datetime(oil["date"])
transactions["date"] = pd.to_datetime(transactions["date"])

train["date"] = pd.to_datetime(train["date"])
test["date"] = pd.to_datetime(test["date"])

## Data Transformations and Pipeline Building 

In [4]:
train

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train.drop(["sales"], axis="columns"), train.loc[:, "sales"], test_size=0.1)

### Transactions Based Transformers

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class WeekdayTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X["date"] = pd.to_datetime(X["date"])
        X["is_Monday"] = X["date"].apply(lambda x: 1 if x.weekday() == 0 else 0)
        X["is_Thuesday"] = X["date"].apply(lambda x: 1 if x.weekday() == 1 else 0)
        X["is_Wednessday"] = X["date"].apply(lambda x: 1 if x.weekday() == 2 else 0)
        X["is_Thursday"] = X["date"].apply(lambda x: 1 if x.weekday() == 3 else 0)
        X["is_Friday"] = X["date"].apply(lambda x: 1 if x.weekday() == 4 else 0)
        X["is_Saturday"] = X["date"].apply(lambda x: 1 if x.weekday() == 5 else 0)
        X["is_Sunday"] = X["date"].apply(lambda x: 1 if x.weekday() == 6 else 0)
        return X


In [7]:
weekday_transformer = WeekdayTransformer()
weekday_transformer.fit_transform(X_train)

Unnamed: 0,id,date,store_nbr,family,onpromotion,is_Monday,is_Thuesday,is_Wednessday,is_Thursday,is_Friday,is_Saturday,is_Sunday
2156162,2156162,2016-04-27,8,DAIRY,41,0,0,1,0,0,0,0
1162800,1162800,2014-10-16,35,GROCERY I,4,0,0,0,1,0,0,0
1821526,1821526,2015-10-22,18,PERSONAL CARE,0,0,0,0,1,0,0,0
1055455,1055455,2014-08-17,23,HOME AND KITCHEN II,0,0,0,0,0,0,0,1
2869316,2869316,2017-06-03,17,SEAFOOD,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1101318,1101318,2014-09-12,10,DELI,0,0,0,0,0,1,0,0
1918974,1918974,2015-12-15,51,MEATS,0,0,1,0,0,0,0,0
260923,260923,2013-05-27,3,PERSONAL CARE,0,1,0,0,0,0,0,0
1779061,1779061,2015-09-28,26,SCHOOL AND OFFICE SUPPLIES,0,1,0,0,0,0,0,0


### Oil Based Transformers

In [8]:
class OilTransformer(BaseEstimator, TransformerMixin):

    def __init__ (self, oil_df):
        self.oil = oil_df

    def fit (self, X, y=None):
        oil_interpolated = pd.DataFrame()
        oil_interpolated["date"] = pd.date_range(start="2013-01-01", end="2017-08-15")
        oil_interpolated = oil_interpolated.merge(self.oil, on="date", how="left")
        oil_interpolated["dcoilwtico_interpolated"] = oil_interpolated["dcoilwtico"].interpolate()
        oil_interpolated["dcoilwtico_interpolated"].bfill()

        self.oil = oil_interpolated

        return self

    def transform(self, X, y=None):
        X = X.merge(self.oil.loc[:, ["date", "dcoilwtico_interpolated"]], on="date")
        return X
        

In [9]:
oil_transformer = OilTransformer(oil)
oil_transformer.fit_transform(X_train)

Unnamed: 0,id,date,store_nbr,family,onpromotion,is_Monday,is_Thuesday,is_Wednessday,is_Thursday,is_Friday,is_Saturday,is_Sunday,dcoilwtico_interpolated
0,2156162,2016-04-27,8,DAIRY,41,0,0,1,0,0,0,0,45.290000
1,1162800,2014-10-16,35,GROCERY I,4,0,0,0,1,0,0,0,82.330000
2,1821526,2015-10-22,18,PERSONAL CARE,0,0,0,0,1,0,0,0,44.900000
3,1055455,2014-08-17,23,HOME AND KITCHEN II,0,0,0,0,0,0,0,1,96.726667
4,2869316,2017-06-03,17,SEAFOOD,1,0,0,0,0,0,1,0,47.586667
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2700794,1101318,2014-09-12,10,DELI,0,0,0,0,0,1,0,0,92.180000
2700795,1918974,2015-12-15,51,MEATS,0,0,1,0,0,0,0,0,37.320000
2700796,260923,2013-05-27,3,PERSONAL CARE,0,1,0,0,0,0,0,0,94.447500
2700797,1779061,2015-09-28,26,SCHOOL AND OFFICE SUPPLIES,0,1,0,0,0,0,0,0,44.400000


### Stores Based Transformers

In [10]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

class StoresTransformer(BaseEstimator, TransformerMixin):

    def __init__ (self, stores_df):
        self.stores = stores_df

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.merge(stores, on="store_nbr")

        type_encoder = OrdinalEncoder()
        city_state_encoder = OneHotEncoder(sparse_output=False)

        X["type"] = type_encoder.fit_transform(X[["type"]])
        city_state_encoding_result = pd.DataFrame(city_state_encoder.fit_transform(X[["city", "state"]]), columns=city_state_encoder.get_feature_names_out())
        X = pd.concat([X, city_state_encoding_result], axis="columns")
        X = X.drop(["state", "city"], axis="columns")
        return X


In [11]:
stores_transformer = StoresTransformer(stores)
stores_transformer.fit_transform(X_train)

Unnamed: 0,id,date,store_nbr,family,onpromotion,is_Monday,is_Thuesday,is_Wednessday,is_Thursday,is_Friday,...,state_Guayas,state_Imbabura,state_Loja,state_Los Rios,state_Manabi,state_Pastaza,state_Pichincha,state_Santa Elena,state_Santo Domingo de los Tsachilas,state_Tungurahua
0,2156162,2016-04-27,8,DAIRY,41,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1162800,2014-10-16,35,GROCERY I,4,0,0,0,1,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1821526,2015-10-22,18,PERSONAL CARE,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1055455,2014-08-17,23,HOME AND KITCHEN II,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2869316,2017-06-03,17,SEAFOOD,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2700794,1101318,2014-09-12,10,DELI,0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2700795,1918974,2015-12-15,51,MEATS,0,0,1,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2700796,260923,2013-05-27,3,PERSONAL CARE,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2700797,1779061,2015-09-28,26,SCHOOL AND OFFICE SUPPLIES,0,1,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Holidays Based Transformer

In [12]:
from numpy import ndarray


class HolidaysTransformer(BaseEstimator, TransformerMixin):

    def __init__ (self, holidays):
        self.holidays = holidays

    @staticmethod
    def holidays_transformations(holidays:pd.DataFrame, df:pd.DataFrame):
        holidays.drop(holidays[holidays["transferred"] == True].index, inplace=True)
        work_days = holidays[holidays["type"] == "Work Day"]
        free_days = holidays[holidays["type"] != "Work Day"]  


        local_holidays = free_days[free_days["locale"] == "Local"]
        regional_holidays = free_days[free_days["locale"] == "Regional"]
        national_holidays = free_days[free_days["locale"] == "National"]

    
        df["is_Holiday"] = 0

        for idx in range(0, local_holidays.shape[0]):
            df.loc[(df["date"] == local_holidays.iloc[idx]["date"]) & 
                (df["city_" + local_holidays.iloc[idx]["locale_name"]] == local_holidays.iloc[idx]["locale_name"]), "is_Holiday"] = 1

        for idx in range(0, regional_holidays.shape[0]):
            df.loc[(df["date"] == regional_holidays.iloc[idx]["date"]) & 
                (df["state_" + regional_holidays.iloc[idx]["locale_name"]] == regional_holidays.iloc[idx]["locale_name"]), "is_Holiday"] = 1

        for idx in range(0, national_holidays.shape[0]):
            df.loc[df["date"] ==national_holidays.iloc[idx]["date"], "is_Holiday"]= 1

        return df
    

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = self.holidays_transformations(self.holidays, X)
        return X


### Outliers Transformer

In [13]:
X_train.drop(X_train[(X_train["date"] >= "01.04.2016") & (X_train["date"] < "01.05.2016")].index, axis="rows")
        

Unnamed: 0,id,date,store_nbr,family,onpromotion,is_Monday,is_Thuesday,is_Wednessday,is_Thursday,is_Friday,is_Saturday,is_Sunday
2156162,2156162,2016-04-27,8,DAIRY,41,0,0,1,0,0,0,0
1162800,1162800,2014-10-16,35,GROCERY I,4,0,0,0,1,0,0,0
1821526,1821526,2015-10-22,18,PERSONAL CARE,0,0,0,0,1,0,0,0
1055455,1055455,2014-08-17,23,HOME AND KITCHEN II,0,0,0,0,0,0,0,1
2869316,2869316,2017-06-03,17,SEAFOOD,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1101318,1101318,2014-09-12,10,DELI,0,0,0,0,0,1,0,0
1918974,1918974,2015-12-15,51,MEATS,0,0,1,0,0,0,0,0
260923,260923,2013-05-27,3,PERSONAL CARE,0,1,0,0,0,0,0,0
1779061,1779061,2015-09-28,26,SCHOOL AND OFFICE SUPPLIES,0,1,0,0,0,0,0,0


In [14]:
class OutliersTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.drop(X[(X["date"] >= "01.04.2016") & (X["date"] < "01.05.2016")].index, axis="rows")
        
        return X

### Custom Predictor

In [15]:
from sklearn.base import RegressorMixin
from sklearn.ensemble import RandomForestRegressor

class SplittedPredictorVSkobochkachPredator(BaseEstimator, RegressorMixin):

    def __init__ (self, df):
        self.families = df["family"].unique()
        self.models = {}
        for family in self.families:
            self.models[family] = RandomForestRegressor()

    def __split_dataframe(self, X, withtarget=False):
        splitted_dataframes_train = {}
        splitted_dataframes_target = {}

        if withtarget:
            for family in self.families:
            splitted_dataframes[family] = X[X["family"] == family].drop(["target"], axis="columns")
            splitted

        
        for family in self.families:
            splitted_dataframes[family] = X[X["family"] == family]

        

        return splitted_dataframes
        

    def fit(self, X, y=None):

        target = pd.DataFrame(y, columns=["target"])
        temp = pd.concat([X, y], axis=1)

        splitted_dfs = self.__split_dataframe(X)

        for family in self.families:

            

    def predict(self, X):
        pass

IndentationError: expected an indented block after 'for' statement on line 17 (1602596371.py, line 18)

In [None]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
                ("outliers", OutliersTransformer()), 
                ("weekdays", WeekdayTransformer()), 
                ("oil", OilTransformer(oil)),
                ("stores", StoresTransformer(stores)),
                ("holidays", HolidaysTransformer(holidays))
])

x = pipeline.fit_transform(X_train)
x

{'HOME APPLIANCES':               id       date  store_nbr           family  onpromotion  \
 0        1587977 2015-06-13         15  HOME APPLIANCES            0   
 6         955268 2014-06-22         12  HOME APPLIANCES            0   
 17        606491 2013-12-07         26  HOME APPLIANCES            0   
 39       2726180 2017-03-14         50  HOME APPLIANCES            0   
 45       1631009 2015-07-07         22  HOME APPLIANCES            0   
 ...          ...        ...        ...              ...          ...   
 2698969  2552336 2016-12-06         23  HOME APPLIANCES            0   
 2699012    57008 2013-02-01          9  HOME APPLIANCES            0   
 2699028   752516 2014-02-28         23  HOME APPLIANCES            0   
 2699043  2020706 2016-02-11          7  HOME APPLIANCES            0   
 2699155  2358098 2016-08-19         23  HOME APPLIANCES            0   
 
          is_Monday  is_Thuesday  is_Wednessday  is_Thursday  is_Friday  ...  \
 0                0    