<a href="https://www.kaggle.com/code/averma111/store-sales-numpyrmsle-catboost-lgbm-xgboost?scriptVersionId=126200262" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [60]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import  CatBoostRegressor, Pool
from sklearn.ensemble import RandomForestRegressor

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv


## Creating dataframes for store sales time series


In [46]:
ROOT_PATH = '/kaggle/input/store-sales-time-series-forecasting'
stores = pd.read_csv(ROOT_PATH+'/stores.csv')
train = pd.read_csv(ROOT_PATH+'/train.csv',parse_dates=['date'])
transactions = pd.read_csv(ROOT_PATH+'/transactions.csv')
oil = pd.read_csv(ROOT_PATH+'/oil.csv', parse_dates=['date'])
holidays_events = pd.read_csv(ROOT_PATH+'/holidays_events.csv', parse_dates=['date'])

## Creating the reduced memory function


In [47]:
## https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65
def reduce_memory_usage(df):
    
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype.name
        if ((col_type != 'datetime64[ns]') & (col_type != 'category')):
            if (col_type != 'object'):
                c_min = df[col].min()
                c_max = df[col].max()

                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)

                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        pass
            else:
                df[col] = df[col].astype('category')
    mem_usg = df.memory_usage().sum() / 1024 ** 2 
    print("Memory usage became: ",mem_usg," MB")
    
    return df

## Creating the numpy RMSLE function better than sklear rmsle

In [48]:
def NumPyRMSLE(y_true: list, y_pred: list) -> float:
    n = len(y_true)
    rmsle = np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
    return rmsle

In [49]:
train = reduce_memory_usage(train)

Memory usage of dataframe is 137.37 MB
Memory usage became:  57.238792419433594  MB


## Defining the summary function


In [50]:
def summary(text, df):
    print(f'{text} shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['dtypes'])
    summ['null'] = df.isnull().sum()
    summ['unique'] = df.nunique()
    summ['min'] = df.min()
    summ['median'] = df.median()
    summ['max'] = df.max()
    summ['mean'] = df.mean()
    summ['std'] = df.std()
    return summ

In [51]:
summary('stores',stores)

stores shape: (54, 5)


Unnamed: 0,dtypes,null,unique,min,median,max,mean,std
store_nbr,int64,0,54,1,27.5,54,27.5,15.732133
city,object,0,22,Ambato,,Santo Domingo,,
state,object,0,16,Azuay,,Tungurahua,,
type,object,0,5,A,,E,,
cluster,int64,0,17,1,8.5,17,8.481481,4.693395


In [52]:
summary('train',train)

train shape: (3000888, 6)


Unnamed: 0,dtypes,null,unique,min,median,max,mean,std
id,int32,0,3000888,0,1500443.5,3000887,1500444.0,866281.891642
date,datetime64[ns],0,1684,2013-01-01 00:00:00,,2017-08-15 00:00:00,,487 days 10:42:40.198518736
store_nbr,int8,0,54,1,27.5,54,27.5,15.585787
family,category,0,33,,,,,
sales,float32,0,379610,0.0,11.0,124717.0,357.7758,1101.997681
onpromotion,int16,0,362,0,0.0,741,2.60277,12.218882


In [53]:
summary('transactions',transactions)

transactions shape: (83488, 3)


Unnamed: 0,dtypes,null,unique,min,median,max,mean,std
date,object,0,1682,2013-01-01,,2017-08-15,,
store_nbr,int64,0,54,1,27.0,54,26.939237,15.608204
transactions,int64,0,4993,5,1393.0,8359,1694.602158,963.286644


## Merging the datasets

In [54]:
train = train.merge(stores, on='store_nbr')
train = train.merge(oil, on='date', how='left')
holidays_events = holidays_events.rename(columns={'type': 'holiday_type'})
train = train.merge(holidays_events, on='date', how='left')

## Replacing the null/na values in dataset


In [55]:
train['dcoilwtico'].fillna(method='ffill', inplace=True)
train['transferred'].fillna(False, inplace=True)
train['onpromotion'].fillna(False, inplace=True)

train['day'] = train['date'].dt.day
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year
train['day_of_week'] = train['date'].dt.dayofweek
train['is_weekend'] = train['day_of_week'].isin([5, 6])

In [56]:
train['dcoilwtico'].fillna(method='ffill', inplace=True)
train['transferred'].fillna(False, inplace=True)
train['onpromotion'].fillna(False, inplace=True)

train['day'] = train['date'].dt.day
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year
train['day_of_week'] = train['date'].dt.dayofweek
train['is_weekend'] = train['day_of_week'].isin([5, 6])

## Encoding the categorical columns 

In [57]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train['city'] = le.fit_transform(train['city'])
train['state'] = le.fit_transform(train['state'])
train['type'] = le.fit_transform(train['type'])
train['family'] = le.fit_transform(train['family'])
train['holiday_type'] = train['holiday_type'].fillna('None')
train['holiday_type'] = le.fit_transform(train['holiday_type'])

## Featuring engineering

In [58]:
features = ['store_nbr', 'family', 'city', 'state', 'type', 'cluster', 'onpromotion', 'day', 'month', 'year', 'day_of_week', 'is_weekend', 'dcoilwtico', 'transferred', 'holiday_type']
target = 'sales'

X = train[features]
y = train[target]

In [None]:
X = reduce_memory_usage(X)

# Splitting the dataset for test and train

In [61]:
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2, random_state=52)

## Defining the model

In [81]:
from sklearn.linear_model import LinearRegression
model_dict = {
    'Light GBM': LGBMRegressor(),
   'XG Boost' : XGBRegressor(),
   'Cat Boost' : CatBoostRegressor()
   
}

## Defining the evaluation function for RMSLE validation

In [82]:
def evaluation(model_str, y_pred, y_pred_train):
    results = {
        'model': model_str,
        'rmsle score': NumPyRMSLE(y_train,y_pred_train) # faster than sklearn rmsle
    }
    return results

## Capturing the RMSLE values for various models

In [83]:
result_list = []
for model in model_dict:
    model_dict[model].fit(X_train, y_train)
    y_pred = model_dict[model].predict(X_val)
    y_pred_train = model_dict[model].predict(X_train)
    result = evaluation(model, y_pred, y_pred_train)
    result_list.append(result)
df_eval = pd.DataFrame(result_list)
df_eval


Learning rate set to 0.140438
0:	learn: 1026.0952379	total: 247ms	remaining: 4m 6s
1:	learn: 959.4300682	total: 491ms	remaining: 4m 5s
2:	learn: 908.6146873	total: 737ms	remaining: 4m 5s
3:	learn: 872.9239539	total: 948ms	remaining: 3m 55s
4:	learn: 848.6765278	total: 1.19s	remaining: 3m 56s
5:	learn: 810.4649502	total: 1.45s	remaining: 3m 59s
6:	learn: 781.3457396	total: 1.71s	remaining: 4m 1s
7:	learn: 762.5757397	total: 1.93s	remaining: 3m 59s
8:	learn: 749.5299766	total: 2.17s	remaining: 3m 58s
9:	learn: 727.6409959	total: 2.42s	remaining: 4m
10:	learn: 710.8908876	total: 2.7s	remaining: 4m 2s
11:	learn: 698.1424046	total: 2.95s	remaining: 4m 2s
12:	learn: 688.2428772	total: 3.17s	remaining: 4m
13:	learn: 680.9747990	total: 3.39s	remaining: 3m 58s
14:	learn: 672.4509302	total: 3.67s	remaining: 4m 1s
15:	learn: 663.6277555	total: 3.92s	remaining: 4m
16:	learn: 658.1787992	total: 4.15s	remaining: 3m 59s
17:	learn: 651.1658303	total: 4.36s	remaining: 3m 57s
18:	learn: 638.0313122	tota

Unnamed: 0,model,rmsle score
0,Light GBM,2.229203
1,XG Boost,2.031667
2,Cat Boost,1.974046


## Reading the test data and prepare for prediction

In [84]:
test = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv', parse_dates=['date'])

test['date'] = pd.to_datetime(test['date'])

test = test.merge(stores, on='store_nbr')
test = test.merge(oil, on='date', how='left')
test = test.merge(holidays_events, on='date', how='left')

test['dcoilwtico'].fillna(method='ffill', inplace=True)
test['transferred'].fillna(False, inplace=True)
test['onpromotion'].fillna(False, inplace=True)

test['day'] = test['date'].dt.day
test['month'] = test['date'].dt.month
test['year'] = test['date'].dt.year
test['day_of_week'] = test['date'].dt.dayofweek
test['is_weekend'] = test['day_of_week'].isin([5, 6])

test['city'] = le.fit_transform(test['city'])
test['state'] = le.fit_transform(test['state'])
test['type'] = le.fit_transform(test['type'])
test['family'] = le.fit_transform(test['family'])
test['holiday_type'] = test['holiday_type'].fillna('None')
test['holiday_type'] = le.fit_transform(test['holiday_type'])

In [85]:
X_test = test[features].copy()

## Executing the best model with lowest RMSLE 

In [94]:
model = CatBoostRegressor()
model.fit(X, y)
sales = model.predict(X_test)
df_test = pd.DataFrame(data={'id': test['id'],'sales': sales})
df_test.to_csv('submission.csv', index=False)
df_test

CatBoostError: catboost/private/libs/options/json_helper.h:173: Can't parse parameter "learning_rate" with value: [0.01,0.02,0.03,0.04]