In [42]:
#!pip install chart-studio
import os
import h2o
import pandas as pd
import datetime as dt
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pandas import Series
import csv
import plotly as plotly
%matplotlib inline
import pandas_profiling as pp
import warnings
warnings.filterwarnings('ignore')
import chart_studio.plotly as py
import scipy.stats as st
import seaborn as sns
sns.set(style="white", palette="muted", color_codes=True)
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

In [43]:
# Compact integer dtypes applied to the CSV columns when the training data is read.
data_type = dict(store='int8', item='int8', sales='int16')

In [44]:
# Read the training data, parsing `date` as datetime and applying the dtypes above.
features = pd.read_csv(
    'train.csv',
    parse_dates=['date'],
    dtype=data_type,
)

In [45]:
# Preview the first rows to confirm the columns were read as expected.
features.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [46]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
features.describe()

Unnamed: 0,store,item,sales
count,913000.0,913000.0,913000.0
mean,5.5,25.5,52.250287
std,2.872283,14.430878,28.801144
min,1.0,1.0,0.0
25%,3.0,13.0,30.0
50%,5.5,25.5,47.0
75%,8.0,38.0,70.0
max,10.0,50.0,231.0


We profile all the columns using pandas-profiling.

In [47]:
# Build the pandas-profiling report for the whole frame; as the last expression
# of the cell it is rendered inline.
# NOTE(review): this runs on the full training frame and may be slow - confirm it is wanted on every run.
pp.ProfileReport(features)

Summarize dataset:   0%|          | 0/17 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



# Total GI
Graph of average monthly GI for all stores and items:

In [32]:
# Average sales for each calendar month, pooled over all stores and items.
monthly_df = features.groupby([features.date.dt.year, features.date.dt.month])['sales'].mean()
monthly_df.index = monthly_df.index.set_names(['year', 'month'])
monthly_df = monthly_df.reset_index()

# One "month/yy" label per aggregated row, derived from the data itself so the
# labels always line up with the plotted points. The previous hard-coded
# range(1, 12) skipped December, producing 11 labels per year for 12 points,
# and its loop variable `y` leaked into the notebook namespace.
x_axis = [
    "{}/{}".format(month, year % 100)
    for year, month in zip(monthly_df.year, monthly_df.month)
]

trace = go.Scatter(x= x_axis, y= monthly_df.sales, mode= 'lines+markers', name= 'GI avg per month', line=dict(width=3))
layout = go.Layout(autosize=True, title= 'GI - average per month', showlegend=True)
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

GI is slowly increasing each year and there is a clear **seasonality effect**.


## Total GI by year

In [38]:
# Total sales per calendar year (index = year taken from the `date` column).
year_df = features.groupby(features.date.dt.year)['sales'].sum().to_frame()

# Take the bar labels from the aggregated index instead of a hard-coded list,
# so the labels cannot drift out of sync with the years present in the data.
year_labels = [str(year) for year in year_df.index]

trace = go.Bar(
    y= year_df.sales, x= year_labels,
    marker=dict(color='rgba(179, 143, 0, 0.6)', line=dict(color='rgba(179, 143, 0, 1.0)', width=1)),
    name='Total GI by year', orientation='v'
)

layout = go.Layout(autosize=False, title= 'Total GI by year', showlegend=True, width=600, height=400)
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)


# GI by store
Average GI per month and store:

In [34]:
# Average sales per (year, month, store). Only `sales` is aggregated so that
# unrelated columns (date, item) are not averaged along with it.
monthly_df = features.groupby([features.date.dt.year, features.date.dt.month, 'store'])['sales'].mean()
monthly_df.index = monthly_df.index.set_names(['year', 'month', 'store'])
monthly_df = monthly_df.reset_index()

traces = []
# One line per store that is actually present in the data.
for i in sorted(monthly_df.store.unique()):
    store_sales = monthly_df[monthly_df.store == i]
    # Build the "month/yy" labels from this store's own rows rather than
    # reusing an `x_axis` list left behind by another cell, so every point
    # gets the label of the month it belongs to.
    store_months = [
        "{}/{}".format(month, year % 100)
        for year, month in zip(store_sales.year, store_sales.month)
    ]
    trace = go.Scatter(x= store_months, y= store_sales.sales, mode= 'lines+markers', name= 'Warehouse '+str(i), line=dict(width=3))
    traces.append(trace)
layout = go.Layout(autosize=True, title= 'GI - average per month', showlegend=True)
fig = go.Figure(traces, layout=layout)
iplot(fig)

# GI per store - bar chart

In [39]:
# Total sales per store, ordered from the smallest total to the largest.
store_total = (
    features.groupby(['store'])['sales']
    .sum()
    .to_frame()
    .reset_index()
)
store_total.sort_values(by=['sales'], ascending=True, inplace=True)
labels = ['Warehouse {}'.format(store_id) for store_id in store_total.store]

bar_marker = dict(
    color='rgba(255, 65, 54, 0.6)',
    line=dict(color='rgba(255, 65, 54, 1.0)', width=1),
)
trace = go.Bar(
    x=labels,
    y=store_total.sales,
    marker=bar_marker,
    name='Total GI per warehouse',
    orientation='v',
)

layout = go.Layout(autosize=True, title='Total GI by warehouse')
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

# GI BoxPlot for each store


In [36]:
# Daily sales totals per store; the Series is indexed by (store, date).
store_sum = features.groupby(['store', 'date'])['sales'].sum()

# One box per store, built from that store's daily totals.
traces = [
    go.Box(
        y=store_sum[store_id].to_frame().reset_index().sales,
        name='Store {}'.format(store_id),
        jitter=0.8,
        whiskerwidth=0.2,
        marker=dict(size=2),
        line=dict(width=1),
    )
    for store_id in range(1, 11)
]

y_axis_style = dict(
    autorange=True, showgrid=True, zeroline=True,
    gridcolor='rgb(233,233,233)', zerolinecolor='rgb(255, 255, 255)',
    zerolinewidth=2, gridwidth=1,
)
layout = go.Layout(
    title='GI BoxPlot for each store',
    yaxis=y_axis_style,
    margin=dict(l=40, r=30, b=80, t=100),
    showlegend=False,
)

fig = go.Figure(data=traces, layout=layout)
iplot(fig)

# GI by item
We have 50 different products, with total GI ranging from 335k for Item 5 to 1.6M for Item 15.

In [12]:
# Total sales per item, ordered from the largest total to the smallest.
item_total = (
    features.groupby(['item'])['sales']
    .sum()
    .to_frame()
    .reset_index()
)
item_total.sort_values(by=['sales'], ascending=False, inplace=True)
labels = ['Item {}'.format(item_id) for item_id in item_total.item]

bar_marker = dict(
    color='rgba(33, 33, 135, 0.6)',
    line=dict(color='rgba(33, 33, 135, 1.0)', width=1),
)
trace = go.Bar(
    x=labels,
    y=item_total.sales,
    marker=bar_marker,
    name='Total GI by item',
    orientation='v',
)
layout = go.Layout(autosize=True, title='GI per item (all time)')
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

# GI BoxPlot for each item


In [13]:
# Daily sales totals per item; the Series is indexed by (item, date).
item_sum = features.groupby(['item', 'date'])['sales'].sum()

# One box per item, built from that item's daily totals.
traces = [
    go.Box(
        y=item_sum[item_id].to_frame().reset_index().sales,
        name='Item {}'.format(item_id),
        jitter=0.8,
        whiskerwidth=0.2,
        marker=dict(size=2),
        line=dict(width=1),
    )
    for item_id in range(1, 51)
]

y_axis_style = dict(
    autorange=True, showgrid=True, zeroline=True,
    gridcolor='rgb(233,233,233)', zerolinecolor='rgb(255, 255, 255)',
    zerolinewidth=2, gridwidth=1,
)
layout = go.Layout(
    title='GI BoxPlot for each item',
    yaxis=y_axis_style,
    margin=dict(l=40, r=30, b=80, t=100),
    showlegend=False,
)

fig = go.Figure(data=traces, layout=layout)
iplot(fig)


## XGBOOST

Define the forecast window, then import the XGBoost library

In [14]:
import datetime

# Forecast window: the test grid is generated for each day from start_date
# up to, but not including, end_date.
start_date = datetime.date(year=2018, month=1, day=1)
end_date = datetime.date(year=2018, month=6, day=1)

In [15]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
import os
import pickle

Load the training dataset and generate the test dataset (one row per item, store and day in the forecast window)

In [16]:
from datetime import date, timedelta
import pandas as pd

# Raw training data; the `date` column is converted to datetime in a later cell.
train = pd.read_csv("train.csv")

# Accumulates the generated test rows as [id, date, store, item].
data = []
def daterange(start_date, end_date):
    """Yield each day from start_date up to, but not including, end_date.

    Nothing is yielded when end_date is on or before start_date.
    """
    span_days = int((end_date - start_date).days)
    offset = 0
    while offset < span_days:
        yield start_date + timedelta(days=offset)
        offset += 1


# Every (item, store, day) combination in the forecast window, in item-major
# order; dates are stored as "MM/DD/YYYY" strings.
grid = [
    (single_date.strftime("%m/%d/%Y"), storeId, itemId)
    for itemId in range(1, 51)
    for storeId in range(1, 11)
    for single_date in daterange(start_date, end_date)
]

# Row ids are sequential and start at 1.
data.extend(
    [row_id, day, storeId, itemId]
    for row_id, (day, storeId, itemId) in enumerate(grid, start=1)
)
test = pd.DataFrame(data, columns=['id', 'date', 'store', 'item'])


In [17]:
# Convert the `date` column of both frames to pandas datetimes.
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['date'])

In [18]:
# Add the calendar features to both frames.
# Note that `day` holds the day of the week, not the day of the month.
for frame in (train, test):
    date_parts = frame['date'].dt
    frame['month'] = date_parts.month
    frame['day'] = date_parts.dayofweek
    frame['year'] = date_parts.year

In [19]:
# Model inputs: every test column except the identifier and the raw date.
excluded = ('date', 'id')
col = [name for name in test.columns if name not in excluded]

# Name of the target column.
y = 'sales'

In [20]:
# Split the training data into a fitting part (80%) and a validation part (20%).
#
# The target column is named literally instead of through `y`: this statement
# rebinds `y` from the column name to the Series of training targets, so
# `train[y]` only worked on the first execution and failed when the cell was
# run again. `y` stays the training target because the training cell reads it.
train_x, train_cv, y, y_cv = train_test_split(train[col], train['sales'], test_size=0.2, random_state=2018)

Create the model, with all the parameters we will use to train it with XGBoost

In [21]:
def XGB_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=2017, num_rounds=2500):
    """Train an XGBoost regression booster and return it.

    Parameters
    ----------
    train_X, train_y : training features and target, passed to ``xgb.DMatrix``.
    test_X : validation features; only used when ``test_y`` is given.
    test_y : validation target, or ``None``. When given, training is monitored
        on both sets and stops early after 20 rounds without improvement.
    feature_names : optional feature names forwarded to ``xgb.DMatrix``
        (``None`` keeps the DMatrix default).
    seed_val : random seed passed to XGBoost.
    num_rounds : maximum number of boosting rounds.

    Returns
    -------
    The booster returned by ``xgb.train``.
    """
    param = {
        # 'reg:squarederror' is the current name of the deprecated 'reg:linear'.
        'objective': 'reg:squarederror',
        'eta': 0.025,
        'max_depth': 7,
        # 'verbosity': 0 replaces the deprecated 'silent': 1.
        'verbosity': 0,
        'eval_metric': 'mae',
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }

    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
        # The last entry of the watchlist is the one early stopping monitors.
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(param, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        # No validation target: train for the full number of rounds.
        model = xgb.train(param, xgtrain, num_rounds)

    return model
 

In [None]:
import pickle

# Train with the validation split so early stopping is active.
model = XGB_regressor(train_X = train_x, train_y = y, test_X = train_cv, test_y = y_cv)

# Persist the trained booster. The context manager closes (and flushes) the
# file even if pickling fails, instead of leaving the handle open.
filename = 'model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [23]:
filename = 'model.sav'
# SECURITY: pickle.load can execute arbitrary code from the file; only load a
# model file that this notebook (or another trusted source) produced.
# The context manager closes the file handle once the model is loaded.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

In [24]:
# Predict sales for the test grid, limiting the booster to its best_ntree_limit trees.
test_matrix = xgb.DMatrix(test[col])
y_test = model.predict(
    test_matrix,
    ntree_limit=model.best_ntree_limit,
)

Collect the predictions into a DataFrame, ready to be saved to a CSV file and submitted to the competition

In [25]:
# Single-column frame holding the predicted sales.
sample = pd.DataFrame({'sales': y_test})

In [26]:
# Rebinds the dtype mapping with float sales; the concatenation below does not use it.
data_type = dict(store='int8', item='int8', sales='float64')

# Put the predictions next to the test rows and keep only the output columns.
output_columns = ['id', 'date', 'store', 'item', 'sales']
test_with_sales = pd.concat([test, sample], axis=1)
prediction = test_with_sales[output_columns]
prediction

Unnamed: 0,id,date,store,item,sales
0,1,2018-01-01,1,1,12.906287
1,2,2018-01-02,1,1,14.747955
2,3,2018-01-03,1,1,14.770655
3,4,2018-01-04,1,1,14.504540
4,5,2018-01-05,1,1,16.468857
...,...,...,...,...,...
75495,75496,2018-05-27,10,50,110.352669
75496,75497,2018-05-28,10,50,75.262924
75497,75498,2018-05-29,10,50,85.707619
75498,75499,2018-05-30,10,50,86.312103


In [40]:
# Average predicted sales for each calendar month, pooled over stores and items.
monthly_df = prediction.groupby([prediction.date.dt.year, prediction.date.dt.month])['sales'].mean()
monthly_df.index = monthly_df.index.set_names(['year', 'month'])
monthly_df = monthly_df.reset_index()

# One "month/yy" label per aggregated row. Deriving the labels from the data
# keeps them aligned with the points (the hard-coded 18-24 year range produced
# far more labels than months) and avoids a loop variable named `y`, which
# overwrote the training target of the same name.
x_axis = [
    "{}/{}".format(month, year % 100)
    for year, month in zip(monthly_df.year, monthly_df.month)
]

trace = go.Scatter(x= x_axis, y= monthly_df.sales, mode= 'lines+markers', name= 'GI avg per month', line=dict(width=3))
layout = go.Layout(autosize=True, title= 'Predicted GI for chosen months- average per month', showlegend=True)
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [28]:
# Average predicted sales per (year, month, store). Only `sales` is aggregated
# so that unrelated columns are not averaged along with it.
monthly_df = prediction.groupby([prediction.date.dt.year, prediction.date.dt.month, 'store'])['sales'].mean()
monthly_df.index = monthly_df.index.set_names(['year', 'month', 'store'])
monthly_df = monthly_df.reset_index()

traces = []
# Iterate over the stores present in the data; the previous range(1, 13) added
# empty legend entries for stores 11 and 12, which are never generated.
for i in sorted(monthly_df.store.unique()):
    store_sales = monthly_df[monthly_df.store == i]
    # Labels come from this store's own rows so they always match its points.
    store_months = [
        "{}/{}".format(month, year % 100)
        for year, month in zip(store_sales.year, store_sales.month)
    ]
    # Legend label spelled "Warehouse", consistent with the other per-store charts.
    trace = go.Scatter(x= store_months, y= store_sales.sales, mode= 'lines+markers', name= 'Warehouse '+str(i), line=dict(width=3))
    traces.append(trace)
layout = go.Layout(autosize=True, title= 'Predicted GI for chosen months - average per month', showlegend=True)
fig = go.Figure(traces, layout=layout)
iplot(fig)

In [29]:
# Reload the raw training data so it has the same columns as the predictions
# (date, store, item, sales).
train = pd.read_csv("train.csv")
train['date'] = pd.to_datetime(train['date'])

# Drop the row id without mutating in place; errors='ignore' makes the cell
# safe to re-run once `id` is already gone (the in-place drop raised KeyError).
prediction = prediction.drop(columns=['id'], errors='ignore')

# Stack actuals and predictions, keeping each frame's own index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
combinedresult = pd.concat([train, prediction])
combinedresult

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13.000000
1,2013-01-02,1,1,11.000000
2,2013-01-03,1,1,14.000000
3,2013-01-04,1,1,13.000000
4,2013-01-05,1,1,10.000000
...,...,...,...,...
75495,2018-05-27,10,50,110.352669
75496,2018-05-28,10,50,75.262924
75497,2018-05-29,10,50,85.707619
75498,2018-05-30,10,50,86.312103


In [41]:
# Average sales for each calendar month over actuals and predictions combined.
monthly_df = combinedresult.groupby([combinedresult.date.dt.year, combinedresult.date.dt.month])['sales'].mean()
monthly_df.index = monthly_df.index.set_names(['year', 'month'])
monthly_df = monthly_df.reset_index()

# One "month/yy" label per aggregated row, derived from the data so the labels
# stay aligned with the points (the hard-coded 13-24 year range produced far
# more labels than months) and no loop variable named `y` leaks out.
x_axis = [
    "{}/{}".format(month, year % 100)
    for year, month in zip(monthly_df.year, monthly_df.month)
]

trace = go.Scatter(x= x_axis, y= monthly_df.sales, mode= 'lines+markers', name= 'GI avg per month', line=dict(width=3))
# Title spelling corrected ("Predicted").
layout = go.Layout(autosize=True, title= 'Combined Actual and Predicted GI - average per month', showlegend=True)
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [31]:
# Average sales per (year, month, store) over actuals and predictions combined.
# Only `sales` is aggregated so that unrelated columns are not averaged too.
monthly_df = combinedresult.groupby([combinedresult.date.dt.year, combinedresult.date.dt.month, 'store'])['sales'].mean()
monthly_df.index = monthly_df.index.set_names(['year', 'month', 'store'])
monthly_df = monthly_df.reset_index()

traces = []
# Iterate over the stores present in the data; the previous range(1, 13) added
# empty legend entries for stores 11 and 12, which do not exist.
for i in sorted(monthly_df.store.unique()):
    store_sales = monthly_df[monthly_df.store == i]
    # Labels come from this store's own rows so they always match its points.
    store_months = [
        "{}/{}".format(month, year % 100)
        for year, month in zip(store_sales.year, store_sales.month)
    ]
    trace = go.Scatter(x= store_months, y= store_sales.sales, mode= 'lines+markers', name= 'Warehouse '+str(i), line=dict(width=3))
    traces.append(trace)
# Title spelling corrected ("Predicted").
layout = go.Layout(autosize=True, title= 'Combined Actual and Predicted GI - average per month', showlegend=True)
fig = go.Figure(traces, layout=layout)
iplot(fig)