In [None]:
import numpy as np # linear algebra
import pandas as pd #data processing, CSV file I/O (e.g. pd.read_csv)
import math

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
#sns.set_style("whitegrid")

In [None]:
# Single source of truth for the competition input directory: point the
# notebook at another copy of the data by editing this one line.
DATA_DIR = '/kaggle/input/store-sales-time-series-forecasting'

# read_csv is called without parse_dates, so every 'date' column is loaded as
# plain strings; later cells compare and merge on those strings.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
stores = pd.read_csv(f'{DATA_DIR}/stores.csv')
transactions = pd.read_csv(f'{DATA_DIR}/transactions.csv')
oil = pd.read_csv(f'{DATA_DIR}/oil.csv')
holidays_events = pd.read_csv(f'{DATA_DIR}/holidays_events.csv')

 # 1. INTRODUCTION & GOAL


# 2. ANALYSIS OF TRAIN DATASET  <a class="anchor"  id="section2"></a>

In [None]:
# Print a structural summary of the training frame (columns, dtypes,
# non-null counts) before any feature is added to it.
train.info()

In [None]:
# Attach the store attributes to every training row (left join on store_nbr).
# Store attribute columns already present on `train` are dropped first, so
# re-running this cell does not produce suffixed duplicates (city_x, city_y, ...)
# that would break the later groupby('city') / groupby('state') cells.
store_attribute_cols = [col for col in stores.columns if col != 'store_nbr']
train = (
    train
    .drop(columns=store_attribute_cols, errors='ignore')
    .merge(stores, how='left', on='store_nbr')
)

## 2.1 KPI Variables <a class="anchor"  id="section2.1"></a>

In [None]:
# Headline figures of the training data, shown as a two-column plotly table.
# Each entry pairs the label displayed in the 'KPI' column with its value.
kpi_rows = [
    ('Number of Stores', train['store_nbr'].nunique()),
    ('Number of Different Products', train['family'].nunique()),
    ('Window Start Date', train['date'].min()),
    ('Window End Date', train['date'].max()),
    ('#Rows in training set', train.shape[0]),
    ('#Date Points in Train Dataset', train['date'].nunique()),
]
kpi_labels = [label for label, _ in kpi_rows]
kpi_values = [value for _, value in kpi_rows]

kpi_table = go.Table(header=dict(values=['KPI', 'Value']),
                     cells=dict(values=[kpi_labels, kpi_values]))
fig = go.Figure(data=[kpi_table])

fig.update_layout({"title": 'BASIC KPIS of TRAIN DATA'}, height=500, width=500)
fig.show()

## 2.2 CHART TIME SERIES AVG SALES ON EACH DAY <a class="anchor"  id="section2.2"></a>

In [None]:
# Per-date mean of sales and onpromotion over all rows of the training set.
# `train_aux` keeps both columns because the next chart filters on onpromotion.
train_aux = (
    train[['date', 'sales', 'onpromotion']]
    .groupby('date')
    .mean()
    .reset_index()
)

daily_sales_trace = go.Scatter(
    x=train_aux['date'],
    y=train_aux['sales'],
    marker_color='red',
    text="sales",
)
fig = go.Figure(data=daily_sales_trace)
fig.update_layout({
    "title": 'Avg Sales by date for all stores and products',
    "xaxis": {"title": "Date"},
    "yaxis": {"title": "Avg Unit Sold"},
    "showlegend": False,
})
fig.show()

## 2.3 ON PROMOTION VS AVG SALES CHART <a class="anchor"  id="section2.3"></a>

In [None]:
# Only dates whose mean onpromotion is strictly positive are plotted: the
# x axis is logarithmic, so zero values could not be placed on it.
promo_days = train_aux[train_aux['onpromotion'] > 0]

fig = px.scatter(
    promo_days,
    x="onpromotion",
    y="sales",
    color='sales',
    color_continuous_scale="earth",
    size='sales',
    log_x=True,
    size_max=30,
)

fig.update_layout({
    "title": 'Correlation between OnPromotion and Sales (total avg sales and promotion on each day)',
    "xaxis": {"title": "On Promotion"},
    "yaxis": {"title": "Sales"},
    "showlegend": False,
})
fig.show()

## 2.4 AVG SALES/ON PROMOTION BY YEAR, MONTH AND DAY OF WEEK CHARTS <a class="anchor"  id="section2.4"></a>

In [None]:
# Calendar features derived from the date column.
# The column is parsed once and the result reused (the original parsed it three
# times); train['date'] itself is left as-is because other cells compare and
# merge on its original values.
parsed_dates = pd.to_datetime(train['date'])
train['year'] = parsed_dates.dt.year
# month_name() defaults to English regardless of the process locale, which is
# what the later reindex by 'January'...'December' relies on
# (strftime("%B") would follow the locale).
train['month'] = parsed_dates.dt.month_name()
train['day_of_week'] = parsed_dates.dt.day_name()

In [None]:
# Mean sales per year, per month and per weekday, each with a bar colour.
#
# 'sales' is selected *before* .mean(): averaging the whole frame also tries to
# average the text columns (date, family, city, ...), which raises TypeError on
# pandas >= 2.0 and is wasted work on older versions.

# One colour per year, in ascending year order; the list length must equal the
# number of distinct years in `train`, otherwise the assignment raises.
df_year_s = train.groupby('year')[['sales']].mean().reset_index()
df_year_s['color'] = ['rgb(210, 251, 212)', 'rgb(165, 219,194)', 'rgb(123,188, 176)', 'rgb(85, 156,158)', 'rgb(58,124, 137)']

# Months are ranked by mean sales (ascending) so the palette encodes the rank,
# then put back into calendar order for display.
df_month_s = train.groupby('month')[['sales']].mean()
df_month_s = df_month_s.sort_values('sales', ascending=True)
df_month_s['color'] = ['#bfbf40','#abab39','#989833','#85852c','#727226','#5f5f20','#5f5f20','#4c4c19','#393913','#26260c','#131306','#000000']
new_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
df_month_s = df_month_s.reindex(new_order, axis=0)
df_month_s = df_month_s.reset_index()

# Same idea for weekdays: palette assigned by descending mean sales, rows then
# reordered Monday..Sunday.
df_day_of_week_s = train.groupby('day_of_week')[['sales']].mean()
df_day_of_week_s = df_day_of_week_s.sort_values('sales', ascending=False)
df_day_of_week_s['color'] = ['rgb(255, 0, 0)','rgb(255, 36, 36)','rgb(255, 71, 71)','rgb(255, 107, 107)','rgb(255, 143, 143)','rgb(255, 179, 179)','rgb(255, 214, 214)']
new_order_week = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
df_day_of_week_s = df_day_of_week_s.reindex(new_order_week, axis=0)
df_day_of_week_s = df_day_of_week_s.reset_index()

In [None]:
# Mean onpromotion per year, per month and per weekday, each with a bar colour.
#
# 'onpromotion' is selected *before* .mean(): averaging the whole frame also
# tries to average the text columns (date, family, city, ...), which raises
# TypeError on pandas >= 2.0 and is wasted work on older versions.

# One colour per year, in ascending year order; the list length must equal the
# number of distinct years in `train`, otherwise the assignment raises.
df_year = train.groupby('year')[['onpromotion']].mean().reset_index()
df_year['color'] = ['rgb(210, 251, 212)', 'rgb(165, 219,194)', 'rgb(123,188, 176)', 'rgb(85, 156,158)', 'rgb(58,124, 137)']

# Months are ranked by mean onpromotion (ascending) so the palette encodes the
# rank, then put back into calendar order for display.
df_month = train.groupby('month')[['onpromotion']].mean()
df_month = df_month.sort_values('onpromotion', ascending=True)
df_month['color'] = ['#bfbf40','#abab39','#989833','#85852c','#727226','#5f5f20','#5f5f20','#4c4c19','#393913','#26260c','#131306','#000000']
new_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
df_month = df_month.reindex(new_order, axis=0)
df_month = df_month.reset_index()

# Weekdays: palette assigned by descending mean onpromotion, rows then
# reordered Monday..Sunday.
df_day_of_week = train.groupby('day_of_week')[['onpromotion']].mean()
df_day_of_week = df_day_of_week.sort_values('onpromotion', ascending=False)
df_day_of_week['color'] = ['rgb(255, 0, 0)','rgb(255, 36, 36)','rgb(255, 71, 71)','rgb(255, 107, 107)','rgb(255, 143, 143)','rgb(255, 179, 179)','rgb(255, 214, 214)']
new_order_week = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
df_day_of_week = df_day_of_week.reindex(new_order_week, axis=0)
df_day_of_week = df_day_of_week.reset_index()

In [None]:
# 3x2 grid of bar charts: left column = mean sales, right column = mean
# onpromotion; rows = by year, by month, by day of week.
# subplot_titles are consumed row by row (left to right), hence the interleaved
# sales / onpromotion order below.
fig = make_subplots(rows=3, cols=2,
                    subplot_titles=("Avg Sales by Year", "Avg On Promotion by Year", "Avg Sales by Month",
                                    "Avg On Promotion by Month", "Avg Sales by Day of Week", "Avg On Promotion by Day of Week"))

# (frame, x column, y column, row, col) for each panel. Every frame carries a
# 'color' column holding one colour per bar.
panels = [
    # sales
    (df_year_s, 'year', 'sales', 1, 1),
    (df_month_s, 'month', 'sales', 2, 1),
    (df_day_of_week_s, 'day_of_week', 'sales', 3, 1),
    # onpromotion
    (df_year, 'year', 'onpromotion', 1, 2),
    (df_month, 'month', 'onpromotion', 2, 2),
    (df_day_of_week, 'day_of_week', 'onpromotion', 3, 2),
]
for frame, x_col, y_col, row, col in panels:
    # add_trace(..., row=, col=) replaces the deprecated append_trace.
    fig.add_trace(
        go.Bar(x=frame[x_col], y=frame[y_col], marker={'color': list(frame['color'])}),
        row=row, col=col,
    )

fig.update_layout(height=1000, width=1400, title_text="AVERAGE SALES & ONPROMOTION ANALYSIS",
                  title_font=dict(size=30, color='#8a8d93'), showlegend=False)
fig.show()

# 3. HOLIDAYS & EVENTS<a class="anchor"  id="section3"></a>
- Study the average sales on each day for the different types of holidays and events.

In [None]:
# Keep only the holidays/events that fall inside the training window.
# The bounds come from `train` itself rather than hard-coded dates, so the
# filter follows the data if the training file changes. Both 'date' columns are
# strings as loaded by read_csv; the comparison is chronological as long as
# they are ISO formatted (YYYY-MM-DD) -- NOTE(review): confirm the format.
# `between` is inclusive on both ends, like the original >= / <= pair.
train_start, train_end = train['date'].min(), train['date'].max()
holidays_events = holidays_events[holidays_events['date'].between(train_start, train_end)]

In [None]:
# Preview the first rows of the holiday/event table after the date filter.
holidays_events.head()

## 3.1 Average sales chart with events<a class="anchor"  id="section3.1"></a>


In [None]:
# Daily mean sales over all stores and families, with one dashed green
# vertical line per row of the holiday/event table.
train_aux = train.groupby('date')[['sales']].mean().reset_index()

sales_trace = go.Scatter(
    x=train_aux['date'],
    y=train_aux['sales'],
    marker_color='red',
    text="sales",
)
fig = go.Figure(data=sales_trace)

# One add_vline call per holiday row, in table order.
for holiday_date in holidays_events['date'].tolist():
    fig.add_vline(x=holiday_date, line_width=0.5, line_dash="dash", line_color="green")

fig.update_layout({
    "title": 'Avg Sales by date with Holidays Events',
    "xaxis": {"title": "Date"},
    "yaxis": {"title": "Avg Unit Sold"},
    "showlegend": False,
})
fig.show()

In [None]:
# Holiday/event rows joined with the daily mean sales of the same date; the
# inner join drops events on dates that have no sales figure.
df_plot = holidays_events.merge(train_aux, on='date', how='inner')

# Black Friday and Cyber Monday get their own category so they are coloured
# separately from the other event types in the scatter plot.
is_shopping_event = df_plot['description'].isin(['Black Friday', 'Cyber Monday'])
df_plot.loc[is_shopping_event, 'type'] = 'black_friday_cyber_monday'

In [None]:
#pd. set_option("display.max_rows", 300)
#df_plot

## 3.2 Avg sales on Event Dates<a class="anchor"  id="section3.2"></a>

In [None]:
# Bubble chart of daily mean sales on holiday/event dates, coloured by event
# type, with call-outs for a few notable periods.
fig = px.scatter(df_plot, x="date", y="sales", size='sales', color='type')

fig.update_layout({"title": 'Avg Sales on Holiday Events days',
                   "xaxis": {"title": "HOLIDAY EVENT DATE"},
                   "yaxis": {"title": "Avg Sales"},
                   "showlegend": True})

# Styling shared by every call-out box.
annotation_style = dict(
    xref="x", yref="y", showarrow=True, align="center",
    arrowhead=2, arrowsize=1, arrowwidth=2, arrowcolor="#636363",
    ax=0, bordercolor="#c7c7c7", borderwidth=2, borderpad=4, opacity=0.8,
)

# (x date, y value, label, vertical text offset in px, box colour).
# A negative offset puts the label above the anchor point, a positive one below.
callouts = [
    ('2014-07-05', 500, "WORLD CUP", -30, "#ca8ee8"),
    ('2016-04-20', 800, "EARTHQUAKE", -30, "#ca8ee8"),
    ('2013-12-30', 200, "CHRISTMAS 13/14", 30, "#3ce685"),
    ('2014-12-30', 200, "CHRISTMAS 14/15", 30, "#3ce685"),
    ('2015-12-30', 200, "CHRISTMAS 15/16", 30, "#3ce685"),
    ('2016-12-30', 200, "CHRISTMAS 16/17", 30, "#3ce685"),
]
for x_date, y_value, label, y_offset, box_color in callouts:
    fig.add_annotation(x=x_date, y=y_value, text=label, ay=y_offset,
                       bgcolor=box_color, **annotation_style)

fig.show()

# 4. OIL DATA<a class="anchor"  id="section4"></a>

- Ecuador is highly dependent on oil prices; therefore, the prices of some items might be affected by variations in the oil price.
- The oil industry is driven by booms and busts. Prices typically rise during periods of global economic strength during which demand outpaces supply. Prices fall when the reverse is true, and supply exceeds demand. Meanwhile, oil supply and demand are driven by a number of key factors:

    - Changes in the value of the U.S. dollar
    - Changes in the policies of the Organization of Petroleum Exporting Countries (OPEC)
    - Changes in the levels of oil production and inventory
    - The health of the global economy
    - The implementation (or collapse) of international agreements

In [None]:
# Print the structure of the oil price table (columns, dtypes, non-null counts)
# to see how many prices are missing.
oil.info()

In [None]:
# Per the original note, the price for 2013-01-01 (first element of the series)
# is missing. Linear interpolation with the default forward direction does not
# fill a leading gap, so that day is seeded with 93.14 (described as the next
# day's price) before interpolating the remaining gaps.
oil.loc[oil['date'] == '2013-01-01', 'dcoilwtico'] = 93.14

# Interpolate the price column only: calling DataFrame.interpolate on the whole
# frame also touches the object-dtype 'date' column, which is deprecated in
# recent pandas. At most 20 consecutive missing values are filled per gap.
oil['dcoilwtico'] = oil['dcoilwtico'].interpolate(method='linear', limit=20)

## 4.1 Historic Oil Prices Chart <a class="anchor"  id="section4.1"></a>

In [None]:
# Oil price (dcoilwtico) over time, after gap filling.
# This cell plots `oil` only, so the daily-sales aggregate the original
# rebuilt here without using it is omitted; the hover text now names the
# plotted quantity instead of "sales".
fig = go.Figure(data=go.Scatter(x=oil['date'],
                                y=oil['dcoilwtico'],
                                marker_color='blue', text="oil price"))

fig.update_layout({"title": 'Oil Prices Chart',
                   "xaxis": {"title": "Date"},
                   "yaxis": {"title": "Oil Price"},
                   "showlegend": False})
fig.show()

In [None]:
# Data for the scatter plot of daily mean units sold against the oil price.
# 'sales' is selected before .mean() so the text columns of `train` are never
# averaged (that raises TypeError on pandas >= 2.0).
sales_oil = train.groupby('date')['sales'].mean().reset_index()

# Left join: every sales date is kept, dates without an oil quote get NaN.
sales_oil = pd.merge(sales_oil, oil, on='date', how='left')

# Oil prices are not available for every sales date; impute them linearly
# (at most 20 consecutive dates per gap). Only the price column is
# interpolated, so the object-dtype 'date' column is never passed to interpolate.
sales_oil['dcoilwtico'] = sales_oil['dcoilwtico'].interpolate(method='linear', limit=20)

## 4.2 Avg Sales vs Oil Prices Chart <a class="anchor"  id="section4.2"></a>

In [None]:
# One bubble per date: x = oil price, y = mean units sold; bubble size and
# colour both encode the mean units sold.
fig = px.scatter(sales_oil, x="dcoilwtico", y="sales", size='sales', color='sales',
                 color_continuous_scale="pinkyl")

# The title used to mention "promotion" (copied from the promotion chart);
# it now describes what is actually plotted.
fig.update_layout({"title": 'Correlation between Oil Prices and Sales (avg sales and oil price each day)',
                   "xaxis": {"title": "Oil Price"},
                   "yaxis": {"title": "Sales"},
                   "showlegend": False})
fig.show()

- **<font size="2">The chart above shows that when oil prices are lower, the average number of units sold is higher. Therefore, oil prices will be used as a variable for training.</font>**

# 5. STORE ANALYSIS <a class="anchor"  id="section5"></a>

In [None]:
# Mean units sold per product family over all rows of `train`, sorted from the
# lowest to the highest mean.
df_family = (
    train.groupby('family')[['sales']]
    .mean()
    .sort_values('sales', ascending=True)
    .reset_index()
)

## 5.1 Top Family Products by Avg Sales <a class="anchor"  id="section5.1"></a>


In [None]:
# Horizontal bar chart of mean units sold per product family; bar length and
# colour both encode the mean.
fig = px.bar(df_family, x='sales', y='family', color='sales', color_continuous_scale="earth")
fig.update_layout({"title": 'AVG SALES FOR EACH FAMILY PRODUCT',  # was misspelled "FAMILTY"
                   "xaxis": {"title": "Avg Unit Sold"},
                   "yaxis": {"title": "Category Product"},
                   "showlegend": True},
                  width=1000,
                  height=700)

fig.show()

## 5.2 Avg Sales vs Store Number colored by Store Type <a class="anchor"  id="section5.2"></a>


In [None]:
# Mean units sold per store (all product families together), highest first,
# enriched with the attributes of each store.
df_store = (
    train.groupby('store_nbr')[['sales']]
    .mean()
    .sort_values('sales', ascending=False)
    .reset_index()
    # No `on=` given: the join uses the columns the two frames share.
    .merge(stores, how='left')
)
# Cast to str only after the merge, so the join key still has the same type in
# both frames when they are joined.
df_store['store_nbr'] = df_store['store_nbr'].astype(str)

In [None]:
# Bars follow the row order of df_store (passed through category_orders)
# and are coloured by store type.
bar_orders = {
    "store_nbr": list(df_store['store_nbr']),
    "sales": list(df_store['sales']),
}
fig = px.bar(df_store, x='store_nbr', y='sales', color='type', category_orders=bar_orders)

fig.update_layout({
    "title": 'AVG SALES FOR EACH STORE NUMBER',
    "xaxis": {"title": "STORE NUMBER"},
    "yaxis": {"title": "Avg Unit Sold"},
    "showlegend": True,
})

fig.show()

## 5.3 Number of stores in each city by type of store<a class="anchor"  id="section5.3"></a>

In [None]:
# Number of stores for every (city, type) pair, as a flat frame with the
# columns city, type, store_nbr (store_nbr holds the count).
df = (
    stores.groupby(['city', 'type'])[['store_nbr']]
    .count()
    .reset_index()
)

# Fixed colour per store type, used for the bars of the next chart.
map_colors = {'A': '#4d4d00', 'B':'#999900', 'C':'#e6e600', 'D':'#ffff00', 'E':'#ffff99'}
df['colors'] = df['type'].map(map_colors)

In [None]:
# Left: number of stores per city, one bar segment per store type.
# Right: each city's share of the total number of stores.
stores_per_city = stores['city'].value_counts()  # computed once, reused for values and labels

fig = make_subplots(rows=1, cols=2, specs=[[{"type": "bar"}, {"type": "pie"}]],
                    subplot_titles=("Number of Stores in each City by Type", "Share of Stores by City"))

# add_trace(..., row=, col=) replaces the deprecated append_trace.
fig.add_trace(go.Bar(x=df['city'], y=df['store_nbr'], text=df["type"], marker=dict(color=df['colors'])),
              row=1, col=1)

fig.add_trace(go.Pie(values=stores_per_city, labels=stores_per_city.index,
                     hoverinfo='label+percent+value', textinfo='label+percent'),
              row=1, col=2)

# styling
fig.update_xaxes(visible=True, row=1, col=1)

fig.update_layout(template="ggplot2",
                  bargap=0.4,
                  height=700,
                  width=1300,
                  showlegend=False)

fig.show()

## 5.4 Avg Sales vs On promotion in each city displaying number of stores<a class="anchor"  id="section5.4"></a>


In [None]:
# Number of stores per city.
# rename_axis + reset_index(name=...) yields the columns
# ['city', 'num_store_per_city'] on every pandas version. The previous
# rename({"index": ..., "city": ...}) relied on pre-2.0 column names; on
# pandas >= 2.0 it left no 'city' column and the merge below raised KeyError.
df_cities = (
    stores['city']
    .value_counts()
    .rename_axis('city')
    .reset_index(name='num_store_per_city')
)

# Mean sales and mean onpromotion per city. The columns are selected before
# .mean() so the text columns of `train` are never averaged.
df_store_aux = train.groupby('city')[['sales', 'onpromotion']].mean().reset_index()
df_store_aux = pd.merge(df_store_aux, df_cities, on='city', how='left')

fig = px.scatter(df_store_aux, x="sales", y="onpromotion", color='city',
                 text=df_store_aux['city'],
                 size='num_store_per_city', log_x=True, size_max=30)

fig.update_layout({"title": 'Avg Sales vs On Promotion for Each City. Size of bubbles is number of store in each city.',
                   "xaxis": {"title": " Avg Sales"},
                   "yaxis": {"title": "On Promotion"},
                   "showlegend": True})
fig.show()

## 5.5 Avg Total Units Sold by State and City <a class="anchor"  id="section5.5"></a>
- **<font size="2"> This section aims to identify the cities and states that sell the most. </font>**


In [None]:
# Mean units sold per state (all stores and product families together),
# highest first: exactly one row per state.
#
# The previous left merge with `stores` joined on 'state' and therefore
# repeated each state once per store located in it; the bar chart then stacked
# those duplicates and inflated every state with more than one store.
df_store = (
    train.groupby('state')[['sales']]
    .mean()
    .sort_values('sales', ascending=False)
    .reset_index()
)
df_store['state'] = df_store['state'].astype(str)

In [None]:
# Bar chart of the per-state aggregate. df_store['sales'] holds a mean here,
# so the title says "AVG" (it used to say "TOTAL"), and the x axis is
# labelled with what it shows (it used to say "STORE NUMBER").
fig = px.bar(df_store, x='state', y='sales')
fig.update_layout({"title": 'AVG SALES FOR EACH STATE',
                   "xaxis": {"title": "STATE"},
                   "yaxis": {"title": "Avg Unit Sold"},
                   "showlegend": True}, template="ggplot2")

fig.show()

In [None]:
# Total units sold per city (all stores and product families together),
# highest first: exactly one row per city.
#
# The previous left merge with `stores` joined on 'city' and therefore
# repeated each city once per store located in it; the bar chart then stacked
# those duplicates and multiplied the total of every multi-store city.
df_store = (
    train.groupby('city')[['sales']]
    .sum()
    .sort_values('sales', ascending=False)
    .reset_index()
)
df_store['city'] = df_store['city'].astype(str)

In [None]:
# Bar chart of total units sold per city; bars follow the row order of
# df_store (passed through category_orders).
# df_store['sales'] holds a sum here, so the y axis says "Total" (it used to
# say "Avg"), and the x axis is labelled "CITY" (it used to say "STORE NUMBER").
fig = px.bar(df_store, x='city', y='sales', category_orders={"city": list(df_store['city']),
                                                             "sales": list(df_store['sales'])
                                                             })
fig.update_layout({"title": 'TOTAL SALES FOR EACH CITY',
                   "xaxis": {"title": "CITY"},
                   "yaxis": {"title": "Total Units Sold"},
                   "showlegend": True}, template="ggplot2")

fig.show()