In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime
import time

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, \
                                    median_absolute_error, max_error, r2_score, explained_variance_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor

In [10]:
import warnings
warnings.filterwarnings("ignore")

In [11]:
# Read the six CSV files from the local competition folder, in the same order as
# the names on the left-hand side; each one becomes its own DataFrame.
train_data, test_data, holidays, oil, stores, transactions = (
    pd.read_csv(f"./store-sales-time-series-forecasting/{csv_name}.csv")
    for csv_name in ("train", "test", "holidays_events", "oil", "stores", "transactions")
)

### EDA - Data Preprocessing

In [12]:
# Report (rows, columns) for every loaded table, one line per table.
# Labels are left-padded to 26 characters so the colons line up.
print("\n".join(
    f"{table_name + ' data shape':<26}: {table.shape}"
    for table_name, table in (
        ("train", train_data),
        ("test", test_data),
        ("oil", oil),
        ("holidays", holidays),
        ("stores", stores),
        ("transactions", transactions),
    )
))

train data shape          : (3000888, 6)
test data shape           : (28512, 5)
oil data shape            : (1218, 2)
holidays data shape       : (350, 6)
stores data shape         : (54, 5)
transactions data shape   : (83488, 3)


In [13]:
# Remove the row identifier from both frames.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps this cell
# idempotent: running it a second time is a no-op rather than a KeyError on the
# already-missing 'id' column.
# NOTE(review): test_data loses its 'id' here as well — keep a copy first if the ids
# are needed later (e.g. to label predictions); confirm against the rest of the notebook.
train_data = train_data.drop(columns=["id"], errors="ignore")
test_data = test_data.drop(columns=["id"], errors="ignore")

In [15]:
# preview the first five rows of train_data
train_data.head(n=5)

Unnamed: 0,date,store_nbr,family,sales,onpromotion
0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,2013-01-01,1,BABY CARE,0.0,0
2,2013-01-01,1,BEAUTY,0.0,0
3,2013-01-01,1,BEVERAGES,0.0,0
4,2013-01-01,1,BOOKS,0.0,0


In [16]:
# preview the first five rows of test_data
test_data.head(n=5)

Unnamed: 0,date,store_nbr,family,onpromotion
0,2017-08-16,1,AUTOMOTIVE,0
1,2017-08-16,1,BABY CARE,0
2,2017-08-16,1,BEAUTY,2
3,2017-08-16,1,BEVERAGES,20
4,2017-08-16,1,BOOKS,0


In [17]:
# preview the first five rows of the oil table (date, dcoilwtico)
oil.head(n=5)

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2


In [18]:
# preview the first five rows of the holidays / events table
holidays.head(n=5)

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False


In [19]:
# preview the first five rows of the stores table
stores.head(n=5)

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [20]:
# preview the first five rows of the transactions table
transactions.head(n=5)

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [21]:
# merging train data with other data
# All joins are left joins, so every row of train_data is kept and unmatched keys
# simply produce NaN in the joined columns.
#
# The holidays table is keyed by date only, yet it also carries locale columns, so a
# single date may appear on several rows. Joining it as-is would repeat every sales
# row of such a date once per holiday row. To keep one output row per input row, only
# the first holiday row of each date is joined.
# NOTE(review): "first row per date" is a simplification — revisit if holidays should
# be matched to a store's city/state instead.
#
# Both holidays and stores have a column named 'type'; after the joins they appear as
# 'type_x' (holidays) and 'type_y' (stores) through pandas' default suffixes.
train_data = (
    train_data
    .merge(oil, on="date", how="left")
    .merge(holidays.drop_duplicates(subset="date", keep="first"), on="date", how="left")
    .merge(stores, on="store_nbr", how="left")
    .merge(transactions, on=["date", "store_nbr"], how="left")
)

In [22]:
# merging test data with other data
# All joins are left joins, so every row of test_data is kept and unmatched keys
# simply produce NaN in the joined columns.
#
# The holidays table is keyed by date only, yet it also carries locale columns, so a
# single date may appear on several rows. Joining it as-is would repeat test rows for
# such a date, and the frame would no longer hold exactly one row per requested
# prediction. Only the first holiday row of each date is therefore joined.
# NOTE(review): "first row per date" is a simplification — revisit if holidays should
# be matched to a store's city/state instead.
#
# Both holidays and stores have a column named 'type'; after the joins they appear as
# 'type_x' (holidays) and 'type_y' (stores) through pandas' default suffixes.
test_data = (
    test_data
    .merge(oil, on="date", how="left")
    .merge(holidays.drop_duplicates(subset="date", keep="first"), on="date", how="left")
    .merge(stores, on="store_nbr", how="left")
    .merge(transactions, on=["date", "store_nbr"], how="left")
)

In [24]:
# getting the target feature from the dataset
# The target is the single column that exists in train_data but not in test_data.
# Columns are compared in train_data's own order rather than through an unordered set
# difference, so the outcome is deterministic, and anything other than exactly one
# train-only column fails with an explicit message instead of an arbitrary pick
# (several candidates) or a bare IndexError (none).
target_candidates = [col for col in train_data.columns if col not in test_data.columns]
if len(target_candidates) != 1:
    raise ValueError(
        f"expected exactly one column present only in train_data, found {target_candidates}"
    )
target_feat = target_candidates[0]

In [None]:
# getting all the data types in the dataset into a pandas dataframe
# One row per column of train_data, with that column's dtype in a single 'dtype' column.
# NOTE(review): the original cell held only a stray `x`; train_data is assumed to be
# "the dataset" meant here — confirm.
train_data.dtypes.to_frame(name="dtype")