# Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

# Path

In [None]:
# Root directory of the competition data set; the trailing slash lets file
# names be appended to it directly.
path = '/kaggle/input/store-sales-time-series-forecasting/'
# List the files available in the data set directory.
os.listdir(path)

# Load Data

In [None]:
# Read every CSV of the data set into its own DataFrame.
# train.csv and test.csv use their first column as the row index.
train_data = pd.read_csv(f'{path}train.csv', index_col=0)
test_data = pd.read_csv(f'{path}test.csv', index_col=0)
samp_subm = pd.read_csv(f'{path}sample_submission.csv')

# Auxiliary tables: oil prices, holidays/events, store metadata, transactions.
data_oil = pd.read_csv(f'{path}oil.csv')
data_holi = pd.read_csv(f'{path}holidays_events.csv')
data_store = pd.read_csv(f'{path}stores.csv')
data_trans = pd.read_csv(f'{path}transactions.csv')

# Overview

In [None]:
# Report the row counts of both data sets and the column count of the
# training data.
n_train_rows = len(train_data)
n_test_rows = len(test_data)
n_train_columns = train_data.shape[1]

print('Number of train samples: ', n_train_rows)
print('Number of test samples: ', n_test_rows)
print('Number of features: ', n_train_columns)

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# Preview the first rows of the test data.
test_data.head()

# Exploratory Data Analysis

## Feature family
The feature family has 33 categorical values which we have to encode later. The values are evenly distributed.

In [None]:
# Frequency table of the 'family' column, limited to its first three entries.
family_counts = train_data['family'].value_counts()
family_counts.iloc[:3]

## Oil Data
Daily oil price. Includes values during both the train and test data timeframes. (Ecuador is an oil-dependent country and its economic health is highly vulnerable to shocks in oil prices.)

In [None]:
# Preview the first rows of the oil price table.
data_oil.head()

## Store Data
* Store metadata, including city, state, type, and cluster.
* cluster is a grouping of similar stores.

In [None]:
# Preview the first rows of the store metadata table.
data_store.head()

In [None]:
# Frequency table of the 'city' column, limited to its first three entries.
city_counts = data_store['city'].value_counts()
city_counts.iloc[:3]

## Transaction Data

In [None]:
# Preview the first rows of the transactions table.
data_trans.head()

In [None]:
# Preview the first rows of the holidays/events table.
data_holi.head()

# Feature Engineering

In [None]:
# Columns used as model inputs; this list is extended later with
# date-derived columns.
features = [
    'store_nbr',
    'family',
    'onpromotion',
]

# Column the models are trained to predict.
target = 'sales'

## Create Feature Weekday
From the `date` feature we can derive the features weekday, month and year.

In [None]:
def extract_weekday(s):
    """Return the day-of-week number of *s*.

    *s* must expose a ``dayofweek`` attribute, e.g. a ``pandas.Timestamp``
    (pandas convention: Monday is 0, Sunday is 6).
    """
    day_index = s.dayofweek
    return day_index

def extract_month(s):
    """Return the ``month`` attribute of *s* (e.g. a ``pandas.Timestamp``)."""
    month_number = s.month
    return month_number

def extract_year(s):
    """Return the ``year`` attribute of *s* (e.g. a ``pandas.Timestamp``)."""
    year_number = s.year
    return year_number

In [None]:
# Parse the 'date' column of both data sets and derive the calendar features
# weekday, year and month from it.
#
# The vectorised ``.dt`` accessor yields the same values as applying the
# extract_* helpers row by row, but avoids one Python-level function call per
# row, which is much slower on large frames. Handling both frames in one loop
# keeps the train and test processing identical.
for frame in (train_data, test_data):
    frame['date'] = pd.to_datetime(frame['date'])
    frame['weekday'] = frame['date'].dt.dayofweek
    frame['year'] = frame['date'].dt.year
    frame['month'] = frame['date'].dt.month

In [None]:
# Register the date-derived columns as model inputs.
# The membership check keeps this cell idempotent: re-running it must not add
# the same column name to the feature list twice.
for date_feature in ('weekday', 'year', 'month'):
    if date_feature not in features:
        features.append(date_feature)

## Encode Categorical Labels

In [None]:
# Label encoder for the categorical 'family' column; its classes are learned
# from the training data only.
enc = preprocessing.LabelEncoder()
enc.fit(train_data['family'])

In [None]:
# Replace the 'family' values with their integer codes in both data sets,
# using the same fitted encoder for each.
for frame in (train_data, test_data):
    frame['family'] = enc.transform(frame['family'])

# Define Train, Val And Test Data

In [None]:
# Separate the model inputs from the target column; the test data only
# provides inputs.
y_train = train_data[target]
X_train = train_data.loc[:, features]
X_test = test_data.loc[:, features]

In [None]:
# Hold out 33% of the training rows for validation; the fixed seed makes the
# split reproducible.
# NOTE(review): the rows are split at random rather than by date - confirm
# this is acceptable for a time-series forecasting task.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.33,
    random_state=2021,
)

# Simple Model
First we start with a simple model based on the features available in both the train and test data.

XGB Regression:

In [None]:
# XGBoost regressor with 200 trees, trained with the squared-log-error
# objective.
model = XGBRegressor(n_estimators=200, objective='reg:squaredlogerror')
model.fit(X_train, y_train)

# Replace negative predictions with zero before scoring, then report the
# root of the mean squared log error on the validation split.
raw_val_pred = model.predict(X_val)
y_val_pred = np.where(raw_val_pred < 0, 0, raw_val_pred)
val_rmsle = np.sqrt(mean_squared_log_error(y_val, y_val_pred))
print('Root Mean Squared Logaritmic Error:', val_rmsle)

Linear Regression:

In [None]:
# Ordinary least squares baseline.
# The ``normalize`` argument is no longer passed: it was deprecated in
# scikit-learn 1.0 and removed in 1.2, where supplying it raises a TypeError.
# For an unpenalised linear regression, rescaling the inputs does not change
# the fitted predictions, so dropping it keeps the results equivalent.
reg = LinearRegression().fit(X_train, y_train)

# Replace negative predictions with zero before scoring, then report the
# root of the mean squared log error on the validation split.
y_val_pred = reg.predict(X_val)
y_val_pred = np.where(y_val_pred < 0, 0, y_val_pred)
print('Root Mean Squared Logaritmic Error:', np.sqrt(mean_squared_log_error(y_val, y_val_pred)))

# Predict Test Data

In [None]:
# Predict the test data with both models and blend them:
# 80% XGBoost, 20% linear regression.
y_test_XGB = model.predict(X_test)
# The linear-regression share must come from ``reg``; predicting with
# ``model`` a second time would make the blend identical to pure XGBoost.
y_test_REG = reg.predict(X_test)
# NOTE(review): assumes the sample submission rows are in the same order as
# the test data - confirm.
samp_subm[target] = 0.8 * y_test_XGB + 0.2 * y_test_REG

In [None]:
# Replace negative blended predictions with zero.
blended_pred = samp_subm[target]
samp_subm[target] = np.where(blended_pred < 0, 0, blended_pred)

# Export

In [None]:
# Write the submission table to 'submission.csv' without the DataFrame index.
samp_subm.to_csv('submission.csv', index=False)