### Setup

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import seaborn
%matplotlib inline

# project paths
project_root_dir = os.path.normpath(os.getcwd() + os.sep + os.pardir)

data_path = os.path.join(project_root_dir, "data")
os.makedirs(data_path, exist_ok=True)

# function for reading data
def read_data(filename, date_cols=None, file_path=data_path):
    csv_path = os.path.join(file_path, filename)
    return pd.read_csv(csv_path, parse_dates=date_cols)

# function for saving data as csv file
def save_dataframe(df, filename, file_path=data_path):
    path = os.path.join(file_path, filename)
    df.to_csv(path, index=False)

### Read Data 

In [2]:
train = read_data("TRAIN.CSV", date_cols=["Date"])
test = read_data("TEST_FINAL.csv", date_cols=["Date"])
submission = read_data("SAMPLE.csv")

## Preparing Data For ML

### Feature Engineering

In [3]:
train.head()

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,T1000001,1,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,T1000002,253,S4,L2,R1,2018-01-01,1,Yes,60,51789.12
2,T1000003,252,S3,L2,R1,2018-01-01,1,Yes,42,36868.2
3,T1000004,251,S2,L3,R1,2018-01-01,1,Yes,23,19715.16
4,T1000005,250,S2,L3,R4,2018-01-01,1,Yes,62,45614.52


In [4]:
# create day, month and year from Date column

train['Day'] = train['Date'].dt.day
test['Day'] = train['Date'].dt.day

train['Month'] = train['Date'].dt.month
test['Month'] = train['Date'].dt.month

train['Year'] = train['Date'].dt.year
test['Year'] = train['Date'].dt.year

# Drop Date column
train.drop("Date", axis=1, inplace=True)
test.drop("Date", axis=1, inplace=True)

In [5]:
train.head()

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Holiday,Discount,#Order,Sales,Day,Month,Year
0,T1000001,1,S1,L3,R1,1,Yes,9,7011.84,1,1,2018
1,T1000002,253,S4,L2,R1,1,Yes,60,51789.12,1,1,2018
2,T1000003,252,S3,L2,R1,1,Yes,42,36868.2,1,1,2018
3,T1000004,251,S2,L3,R1,1,Yes,23,19715.16,1,1,2018
4,T1000005,250,S2,L3,R4,1,Yes,62,45614.52,1,1,2018


In [6]:
# drop extra column from the training set
train.drop("#Order",axis=1, inplace=True)
# set the ID column as index as we are not allowed to use this column
train.set_index('ID', inplace=True)
test.set_index('ID', inplace=True)

In [7]:
train['Holiday'].value_counts()

0    163520
1     24820
Name: Holiday, dtype: int64

In [8]:
# convert int cols that should be categorical
train['Holiday'] = train['Holiday'].map({0:"No", 1:"Yes"})
test['Holiday'] = test['Holiday'].map({0:"No", 1:"Yes"})

train['Day'] = train['Day'].astype(str)
test['Day'] = test['Day'].astype(str)

train['Month'] = train['Month'].astype(str)
test['Month'] = test['Month'].astype(str)

train['Year'] = train['Year'].astype(str)
test['Year'] = test['Year'].astype(str)

In [9]:
train.head()

Unnamed: 0_level_0,Store_id,Store_Type,Location_Type,Region_Code,Holiday,Discount,Sales,Day,Month,Year
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
T1000001,1,S1,L3,R1,Yes,Yes,7011.84,1,1,2018
T1000002,253,S4,L2,R1,Yes,Yes,51789.12,1,1,2018
T1000003,252,S3,L2,R1,Yes,Yes,36868.2,1,1,2018
T1000004,251,S2,L3,R1,Yes,Yes,19715.16,1,1,2018
T1000005,250,S2,L3,R4,Yes,Yes,45614.52,1,1,2018


In [10]:
test.head()

Unnamed: 0_level_0,Store_id,Store_Type,Location_Type,Region_Code,Holiday,Discount,Day,Month,Year
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
T1188341,171,S4,L2,R3,No,No,1,1,2018
T1188342,172,S1,L1,R1,No,No,1,1,2018
T1188343,173,S4,L2,R1,No,No,1,1,2018
T1188344,174,S1,L1,R4,No,No,1,1,2018
T1188345,170,S1,L1,R2,No,No,1,1,2018


## Pipeline For ML

In [11]:
# seperate features and target
X_train = train.drop("Sales",axis=1).copy()
y_train = train['Sales'].copy()
X_test = test.copy()

In [12]:
# create preprocessing pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# select numerical and categorical columns
num_cols = X_train.select_dtypes(exclude="object").columns.tolist()
cat_cols = X_train.select_dtypes(include="object").columns.tolist()

# numerical pipeline
num_pipe = make_pipeline(SimpleImputer(strategy="mean"))

# categorical pipeline
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    OneHotEncoder(handle_unknown="ignore", sparse=False),
)

# full pipeline for data preprocessing
full_pipe = ColumnTransformer(
    [("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols)]
)
full_pipe

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer())]),
                                 ['Store_id']),
                                ('cat',
                                 Pipeline(steps=[('simpleimputer',
                                                  SimpleImputer(fill_value='NA',
                                                                strategy='constant')),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 ['Store_Type', 'Location_Type', 'Region_Code',
                                  'Holiday', 'Discount', 'Day', 'Month',
                                  'Year'])])

# ML Models

## Linear Regression

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

In [14]:
lin_reg = make_pipeline(full_pipe, LinearRegression())
scores = cross_val_score(lin_reg, X_train, y_train, cv=5, scoring="neg_mean_squared_log_error")
print("Scores:", -scores)
print("Average score:", np.mean(-scores))

Scores: [1.28106205e-01 8.74904369e+01 3.12747146e+02 7.42359763e+01
 1.22714349e-01]
Average score: 94.94487592405655


## Support Vector Machines

In [15]:
from sklearn.svm import LinearSVR

svr = make_pipeline(full_pipe, LinearSVR(random_state=42))
scores = cross_val_score(svr, X_train, y_train, cv=5, scoring="neg_mean_squared_log_error")
print("Scores:", -scores)
print("Average score:", np.mean(-scores))

Scores: [0.10788181 0.07460828 0.08580756 0.20434581 0.13214094]
Average score: 0.12095687881569946


## Decision Tree

In [16]:
from sklearn.tree import DecisionTreeRegressor

tree = make_pipeline(full_pipe, DecisionTreeRegressor(random_state=42))
scores = cross_val_score(tree, X_train, y_train, cv=5, scoring="neg_mean_squared_log_error")
print("Scores:", -scores)
print("Average score:", np.mean(-scores))

Scores: [0.16132716 0.10829706 0.15281566 0.24520876 0.18230933]
Average score: 0.16999159340465222


## Random Forest

In [17]:
from sklearn.ensemble import RandomForestRegressor

rf = make_pipeline(full_pipe, RandomForestRegressor(random_state=42))
scores = cross_val_score(rf, X_train, y_train, cv=5, scoring="neg_mean_squared_log_error")
print("Scores:", -scores)
print("Average score:", np.mean(-scores))

Scores: [0.09297917 0.0778627  0.11546673 0.20336199 0.13669223]
Average score: 0.12527256397290004


## XGBoost

In [18]:
# from xgboost import XGBRegressor

# xgb = make_pipeline(full_pipe, XGBRegressor(random_state=42))
# scores = cross_val_score(xgb, X_train, y_train, cv=5, scoring="neg_mean_squared_log_error")
# print("Scores:", -scores)
# print("Average score:", np.mean(-scores))

In [19]:
any(y_train < 0)

False

### Make Submission

In [20]:
submission.head()

Unnamed: 0,ID,Sales
0,T1188341,42275
1,T1188342,42275
2,T1188343,42275
3,T1188344,42275
4,T1188345,42275


In [21]:
from sklearn.svm import LinearSVR

svr = make_pipeline(full_pipe, LinearSVR(random_state=42))
svr.fit(X_train, y_train)
submission['Sales'] = svr.predict(X_test)
save_dataframe(submission, "svr_default.csv")

In [22]:
from sklearn.ensemble import RandomForestRegressor

rf = make_pipeline(full_pipe, RandomForestRegressor(random_state=42))
rf.fit(X_train, y_train)
submission['Sales'] = rf.predict(X_test)
save_dataframe(submission,"rf_default.csv")

In [23]:
from xgboost import XGBRegressor

xgb = make_pipeline(full_pipe, XGBRegressor(random_state=42))
xgb.fit(X_train, y_train)
submission['Sales'] = xgb.predict(X_test)
save_dataframe(submission, "xgb_default.csv")