<a href="https://www.kaggle.com/code/averma111/store-sales-numpyrmsle-catboost-lgbm-xgboost?scriptVersionId=126218495" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import  CatBoostRegressor, Pool
from sklearn.ensemble import RandomForestRegressor

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv


## Creating dataframes for store sales time series


In [3]:
ROOT_PATH = '/kaggle/input/store-sales-time-series-forecasting'
stores = pd.read_csv(ROOT_PATH+'/stores.csv')
train = pd.read_csv(ROOT_PATH+'/train.csv',parse_dates=['date'])
transactions = pd.read_csv(ROOT_PATH+'/transactions.csv')
oil = pd.read_csv(ROOT_PATH+'/oil.csv', parse_dates=['date'])
holidays_events = pd.read_csv(ROOT_PATH+'/holidays_events.csv', parse_dates=['date'])

## Creating the reduced memory function


In [4]:
## https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65
def reduce_memory_usage(df):
    
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype.name
        if ((col_type != 'datetime64[ns]') & (col_type != 'category')):
            if (col_type != 'object'):
                c_min = df[col].min()
                c_max = df[col].max()

                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)

                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        pass
            else:
                df[col] = df[col].astype('category')
    mem_usg = df.memory_usage().sum() / 1024 ** 2 
    print("Memory usage became: ",mem_usg," MB")
    
    return df

## Creating the numpy RMSLE function better than sklear rmsle

In [5]:
def NumPyRMSLE(y_true: list, y_pred: list) -> float:
    n = len(y_true)
    rmsle = np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
    return rmsle

In [6]:
train = reduce_memory_usage(train)

Memory usage of dataframe is 137.37 MB
Memory usage became:  57.238792419433594  MB


## Defining the summary function


In [7]:
def summary(text, df):
    print(f'{text} shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['dtypes'])
    summ['null'] = df.isnull().sum()
    summ['unique'] = df.nunique()
    summ['min'] = df.min()
    summ['median'] = df.median()
    summ['max'] = df.max()
    summ['mean'] = df.mean()
    summ['std'] = df.std()
    return summ

In [8]:
summary('stores',stores)

stores shape: (54, 5)


Unnamed: 0,dtypes,null,unique,min,median,max,mean,std
store_nbr,int64,0,54,1,27.5,54,27.5,15.732133
city,object,0,22,Ambato,,Santo Domingo,,
state,object,0,16,Azuay,,Tungurahua,,
type,object,0,5,A,,E,,
cluster,int64,0,17,1,8.5,17,8.481481,4.693395


In [9]:
summary('train',train)

train shape: (3000888, 6)


Unnamed: 0,dtypes,null,unique,min,median,max,mean,std
id,int32,0,3000888,0,1500443.5,3000887,1500444.0,866281.891642
date,datetime64[ns],0,1684,2013-01-01 00:00:00,,2017-08-15 00:00:00,,487 days 10:42:40.198518736
store_nbr,int8,0,54,1,27.5,54,27.5,15.585787
family,category,0,33,,,,,
sales,float32,0,379610,0.0,11.0,124717.0,357.7758,1101.997681
onpromotion,int16,0,362,0,0.0,741,2.60277,12.218882


In [10]:
summary('transactions',transactions)

transactions shape: (83488, 3)


Unnamed: 0,dtypes,null,unique,min,median,max,mean,std
date,object,0,1682,2013-01-01,,2017-08-15,,
store_nbr,int64,0,54,1,27.0,54,26.939237,15.608204
transactions,int64,0,4993,5,1393.0,8359,1694.602158,963.286644


## Merging the datasets

In [11]:
train = train.merge(stores, on='store_nbr')
train = train.merge(oil, on='date', how='left')
holidays_events = holidays_events.rename(columns={'type': 'holiday_type'})
train = train.merge(holidays_events, on='date', how='left')

## Replacing the null/na values in dataset


In [12]:
train['dcoilwtico'].fillna(method='ffill', inplace=True)
train['transferred'].fillna(False, inplace=True)
train['onpromotion'].fillna(False, inplace=True)

train['day'] = train['date'].dt.day
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year
train['day_of_week'] = train['date'].dt.dayofweek
train['is_weekend'] = train['day_of_week'].isin([5, 6])

In [13]:
train['dcoilwtico'].fillna(method='ffill', inplace=True)
train['transferred'].fillna(False, inplace=True)
train['onpromotion'].fillna(False, inplace=True)

train['day'] = train['date'].dt.day
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year
train['day_of_week'] = train['date'].dt.dayofweek
train['is_weekend'] = train['day_of_week'].isin([5, 6])

## Encoding the categorical columns 

In [14]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train['city'] = le.fit_transform(train['city'])
train['state'] = le.fit_transform(train['state'])
train['type'] = le.fit_transform(train['type'])
train['family'] = le.fit_transform(train['family'])
train['holiday_type'] = train['holiday_type'].fillna('None')
train['holiday_type'] = le.fit_transform(train['holiday_type'])

## Featuring engineering

In [15]:
features = ['store_nbr', 'family', 'city', 'state', 'type', 'cluster', 'onpromotion', 'day', 'month', 'year', 'day_of_week', 'is_weekend', 'dcoilwtico', 'transferred', 'holiday_type']
target = 'sales'

X = train[features]
y = train[target]

In [16]:
X = reduce_memory_usage(X)

Memory usage of dataframe is 294.20 MB
Memory usage became:  81.55989074707031  MB


# Splitting the dataset for test and train

In [17]:
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2, random_state=52)

## Defining the evaluation function for RMSLE validation

In [22]:
def evaluation(model_str, y_pred, y_pred_train):
    results = {
        'model': model_str,
        'rmsle score': NumPyRMSLE(y_train,y_pred_train) # faster than sklearn rmsle
    }
    return results

## Capturing the RMSLE values for various models

In [None]:
model=CatBoostRegressor(iterations=300, depth=6, learning_rate=0.1, 
loss_function='RMSE',
         rsm = 0.95, 
         border_count = 64, 
         l2_leaf_reg= 3.5, 
         one_hot_max_size=30, 
         use_best_model = True,
         verbose=False,
         random_seed = 502)
model.fit(X_train, y_train,
     eval_set=(X_val, y_val),   
     verbose = False,
     plot=True)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [None]:
y_pred = model.predict(X_val)
y_pred_train = model.predict(X_train)
result = evaluation(model, y_pred, y_pred_train)
result_list.append(result)
df_eval = pd.DataFrame(result_list)
df_eval

## Reading the test data and prepare for prediction

In [None]:
test = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv', parse_dates=['date'])

test['date'] = pd.to_datetime(test['date'])

test = test.merge(stores, on='store_nbr')
test = test.merge(oil, on='date', how='left')
test = test.merge(holidays_events, on='date', how='left')

test['dcoilwtico'].fillna(method='ffill', inplace=True)
test['transferred'].fillna(False, inplace=True)
test['onpromotion'].fillna(False, inplace=True)

test['day'] = test['date'].dt.day
test['month'] = test['date'].dt.month
test['year'] = test['date'].dt.year
test['day_of_week'] = test['date'].dt.dayofweek
test['is_weekend'] = test['day_of_week'].isin([5, 6])

test['city'] = le.fit_transform(test['city'])
test['state'] = le.fit_transform(test['state'])
test['type'] = le.fit_transform(test['type'])
test['family'] = le.fit_transform(test['family'])
test['holiday_type'] = test['holiday_type'].fillna('None')
test['holiday_type'] = le.fit_transform(test['holiday_type'])

In [None]:
X_test = test[features].copy()

In [None]:
parameters = {'depth'         : [6,8,10],
                  'learning_rate' : [0.01, 0.05, 0.1],
                  'iterations'    : [30, 50, 100]
                 }

## Executing the best model with lowest RMSLE 

In [None]:
model = CatBoostRegressor()
grid = GridSearchCV(estimator=model, param_grid = parameters, cv = 2, n_jobs=-1)
grid.fit(X, y)
sales = grid.predict(X_test)
df_test = pd.DataFrame(data={'id': test['id'],'sales': sales})
df_test.to_csv('submission.csv', index=False)
df_test