In [1]:
# Importing the dependencies 

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from datacleaner import autoclean
from sklearn import model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from xgboost import XGBRegressor


In [2]:
# Load 'Merged.csv' (resolved relative to the working directory) into a DataFrame
merged_df = pd.read_csv(filepath_or_buffer='Merged.csv')

In [3]:
# Preview the first five rows of the loaded data
merged_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Date,Dept,IsHoliday,Store,Weekly_Sales,Type,Size,Temperature,Fuel_Price,...,CPI,Unemployment,NewMarkDown1,NewMarkDown2,NewMarkDown3,NewMarkDown4,NewMarkDown5,Year,Month,Day
0,0,2010-02-05,1,False,1,24924.5,A,151315,42.31,2.572,...,211.096358,8.106,0.0,0.0,0.0,0.0,0.0,2010,2,5
1,1,2010-02-12,1,True,1,46039.49,A,151315,38.51,2.548,...,211.24217,8.106,0.0,0.0,0.0,0.0,0.0,2010,2,12
2,2,2010-02-19,1,False,1,41595.55,A,151315,39.93,2.514,...,211.289143,8.106,0.0,0.0,0.0,0.0,0.0,2010,2,19
3,3,2010-02-26,1,False,1,19403.54,A,151315,46.63,2.561,...,211.319643,8.106,0.0,0.0,0.0,0.0,0.0,2010,2,26
4,4,2010-03-05,1,False,1,21827.9,A,151315,46.5,2.625,...,211.350143,8.106,0.0,0.0,0.0,0.0,0.0,2010,3,5


In [4]:
# Keep only the rows that have a 'Weekly_Sales' value. The per-column non-null
# counts are printed before and after so the effect of the filter is visible.
print(merged_df.count())
merged_df = merged_df.dropna(axis=0, subset=['Weekly_Sales'])
print(merged_df.count())

Unnamed: 0      536634
Date            536634
Dept            536634
IsHoliday       536634
Store           536634
Weekly_Sales    421570
Type            536634
Size            536634
Temperature     536634
Fuel_Price      536634
MarkDown1       265596
MarkDown2       197685
MarkDown3       242326
MarkDown4       237143
MarkDown5       266496
CPI             498472
Unemployment    498472
NewMarkDown1    536634
NewMarkDown2    536634
NewMarkDown3    536634
NewMarkDown4    536634
NewMarkDown5    536634
Year            536634
Month           536634
Day             536634
dtype: int64
Unnamed: 0      421570
Date            421570
Dept            421570
IsHoliday       421570
Store           421570
Weekly_Sales    421570
Type            421570
Size            421570
Temperature     421570
Fuel_Price      421570
MarkDown1       150681
MarkDown2       111248
MarkDown3       137091
MarkDown4       134967
MarkDown5       151432
CPI             421570
Unemployment    421570
NewMarkDown1    42157

In [5]:
# 'IsHoliday' only takes two values, so a plain integer label encoding is
# enough; the result is stored in a new 'newHoliday' column.
label_encoder = LabelEncoder()
label_encoder.fit(merged_df['IsHoliday'])
merged_df['newHoliday'] = label_encoder.transform(merged_df['IsHoliday'])

# One-hot encode 'Type', 'Dept' and 'Store' with get_dummies: each of these
# columns is expanded into one indicator column per distinct value.
encoded_df = pd.get_dummies(data=merged_df, columns=["Type", "Dept", "Store"])

In [6]:
# Remove the columns that must not reach the model:
#   - 'Unnamed: 0'  : leftover index column from the CSV
#   - 'Date'        : raw date string (Year/Month/Day columns remain)
#   - 'IsHoliday'   : superseded by the label-encoded 'newHoliday'
#   - 'MarkDown1-5' : raw markdown columns (the NewMarkDown* columns remain)
# The drop is not done in place and ignores columns that are already gone, so
# re-running this cell does not raise a KeyError.
columns_to_drop = [
    'Unnamed: 0', 'Date', 'IsHoliday',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
]
encoded_df = encoded_df.drop(columns=columns_to_drop, errors='ignore')

In [7]:
# Preview the first five rows of the encoded feature table
encoded_df.head(n=5)

Unnamed: 0,Weekly_Sales,Size,Temperature,Fuel_Price,CPI,Unemployment,NewMarkDown1,NewMarkDown2,NewMarkDown3,NewMarkDown4,...,Store_36,Store_37,Store_38,Store_39,Store_40,Store_41,Store_42,Store_43,Store_44,Store_45
0,24924.5,151315,42.31,2.572,211.096358,8.106,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,46039.49,151315,38.51,2.548,211.24217,8.106,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,41595.55,151315,39.93,2.514,211.289143,8.106,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,19403.54,151315,46.63,2.561,211.319643,8.106,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,21827.9,151315,46.5,2.625,211.350143,8.106,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Largest 'Weekly_Sales' value in the data set
encoded_df.loc[:, 'Weekly_Sales'].max()

693099.36

In [9]:

# Predictors: every column except the 'Weekly_Sales' target
features = encoded_df.drop(columns=['Weekly_Sales'])
feature_names = features.columns

# Target as a single-column 2-D array of shape (n_samples, 1)
target = encoded_df['Weekly_Sales'].values.reshape(-1, 1)

In [10]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split repeatable
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=test_size,
    random_state=seed,
)

In [11]:

# Fit the min-max scalers on the training split only, so the test split
# contributes nothing to the learned ranges.
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)

y_scaler = MinMaxScaler()
y_scaler.fit(y_train)

In [12]:
# Apply the fitted scalers to both splits (features first, then targets)
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))
y_train_scaled, y_test_scaled = (y_scaler.transform(split) for split in (y_train, y_test))

In [13]:
# Train an XGBoost regressor with its default hyper-parameters on the scaled
# training data. The fit call is the last expression so the fitted estimator's
# repr is shown as the cell output.
XGBModel = XGBRegressor()
XGBModel.fit(X=X_train_scaled, y=y_train_scaled, verbose=False)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [14]:
# Evaluate the fitted model on the held-out split.
XGBpredictions = XGBModel.predict(X_test_scaled)

# MAE in min-max-scaled target units, i.e. relative to the range of
# Weekly_Sales seen in the training split.
MAE = mean_absolute_error(y_test_scaled , XGBpredictions)
print('XGBoost validation MAE = ',MAE)

# The scaled figure cannot be read as sales, so also map the predictions back
# through y_scaler and report the error in original Weekly_Sales units.
# inverse_transform needs a 2-D array, hence the reshape to a column vector.
XGBpredictions_sales = y_scaler.inverse_transform(XGBpredictions.reshape(-1, 1))
MAE_sales = mean_absolute_error(y_test, XGBpredictions_sales)
print('XGBoost validation MAE (Weekly_Sales units) = ', MAE_sales)


XGBoost validation MAE =  0.011554886737199661


In [15]:
# Flatten the arrays to 1-D so they can be used as DataFrame columns.
# NOTE(review): X_test_scaled is a 2-D feature matrix, so X_test_final holds
# rows * columns values and is not row-aligned with the two arrays below;
# it is not used by the cells visible here - confirm whether it is needed.
X_test_final = np.ravel(X_test_scaled)
y_test_final = np.ravel(y_test_scaled)
XGBPredict = np.ravel(XGBpredictions)

In [16]:
# Dimensions of the scaled test feature matrix as (rows, columns)
np.shape(X_test_scaled)

(139119, 143)

In [17]:
# Number of flattened test targets
y_test_final.shape[0]

139119

In [18]:
# Number of flattened predictions (should match the number of test targets)
XGBPredict.shape[0]

139119

In [19]:
# Inspect the scaled test feature matrix (NumPy abbreviates the repr of large arrays)
X_test_scaled

array([[0.02725349, 0.77524462, 0.63977956, ..., 0.        , 0.        ,
        0.        ],
       [0.48099293, 0.23385519, 0.46042084, ..., 0.        , 0.        ,
        0.        ],
       [0.45836739, 0.59559687, 0.78857715, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.03348904, 0.49735812, 0.02104208, ..., 1.        , 0.        ,
        0.        ],
       [0.66706902, 0.52172211, 0.1508016 , ..., 0.        , 0.        ,
        0.        ],
       [0.90627723, 0.89941292, 0.54208417, ..., 0.        , 0.        ,
        0.        ]])

In [20]:
# Side-by-side table of actual vs. predicted targets for the test split.
# Both columns are in the min-max-scaled units produced by y_scaler.
final_df = pd.DataFrame(
    data={
        "Y Test": y_test_final,
        "Predictions": XGBPredict,
    }
)
final_df.head(n=30)

Unnamed: 0,Y Test,Predictions
0,0.003969,0.014157
1,0.005636,0.01578
2,0.011,0.01578
3,0.005523,0.01567
4,0.073214,0.074541
5,0.033458,0.027968
6,0.005427,0.016657
7,0.003195,0.016657
8,0.002987,0.009709
9,0.016171,0.022752


In [21]:
# Row labels of the results table as a NumPy array
np.asarray(final_df.index)

array([     0,      1,      2, ..., 139116, 139117, 139118])

In [22]:
# Write the actual-vs-predicted table to 'Predictions.csv' (row index included).
# to_csv returns None when it is given a file path, so the result is not bound
# to a variable.
final_df.to_csv('Predictions.csv')