In [None]:
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns

warnings.filterwarnings("ignore")

In [5]:
# Make the parent of the current working directory importable so the shared
# `utility` module can be loaded from this notebook.
current_folder = os.getcwd()
parent_folder = os.path.abspath(os.path.join(current_folder, ".."))

# Guard the append so re-running this cell does not pile up duplicate entries.
if parent_folder not in sys.path:
    sys.path.append(parent_folder)

# Project-local helper module (expected as utility.py in the parent folder).
import utility

In [6]:
# Load the holiday/event calendar from the Data folder next to the parent directory.
datapath = f"{parent_folder}/Data/"
holidays = pd.read_csv(f"{datapath}holidays_events.csv")

In [7]:
# Derive calendar helper columns so holidays can be compared across years.
holidays['date'] = pd.to_datetime(holidays['date'])
holiday_dates = holidays['date'].dt
holidays['year'] = holiday_dates.year
holidays['month-day'] = holiday_dates.strftime('%m-%d')

# Lift the row-display limit so complete tables are rendered.
pd.set_option('display.max_rows', None)

# Start with national rows whose type is exactly 'Holiday'.
is_national = holidays['locale'].eq('National')
is_plain_holiday = holidays['type'].eq('Holiday')
national = holidays[is_national & is_plain_holiday]

## Type of holidays 
The holiday type tells us, for a particular store on a particular day, which of the following applies:

- holiday, celebrated
- holiday, not celebrated
- not holiday, celebrated 
- not holiday, not celebrated
- event


We don't know whether a holiday being a day off (or not) changes how it affects sales, so I'll also add "whether it's celebrated (day off)" as a feature. 

## location and description 
There are three locale levels (local, regional and national). Based on the locale, we can determine whether a particular store celebrates the holiday. 

Different holidays will likely have different shopping culture. Holidays should be considered separately. 

So at first glance, I will add four columns:
- Local holiday (0 is None, 1 - 27 are the different descriptions of the holidays.)
- Regional holiday (0 is None, 1 - 4 are four different holidays.)
- National holiday (0 is None, 1 - 29 are different holidays.)
- National Events (0 is None, 1 - 43 are different events.) 
  - An event is not recurring. Hence it should be labeled separately. 

One problem: 2012-12-31 is both Puente Primer dia del ano (Bridge) and Primer dia del ano-1 (Additional).
It turned out that Puente Primer dia del ano means "the bridge day for New Year", while Primer dia del ano-1 means "New Year's Eve".

Other than that, there's no overlapping holidays. 

Notice that "Bridge", "Work Day" and "Transfer" days change every year based on the day of the week, 
while "Additional" days only go with the holiday itself. 

# Different holidays 

Not every holiday falls on the same day every year. 

In [8]:

# month-day x year grid of national holiday names (comma-joined when several
# share a date); cells with no holiday are filled with 0.
table_national = (
    national
    .pivot_table(index='month-day', columns='year', values='description', aggfunc=', '.join)
    .fillna(0)
)
# print(table_national)

In [9]:
# Same month-day x year grid for regional rows typed 'Holiday'.
is_regional_holiday = holidays['locale'].eq('Regional') & holidays['type'].eq('Holiday')
Regional = holidays[is_regional_holiday]
table_regional = (
    Regional
    .pivot_table(index='month-day', columns='year', values='description', aggfunc=', '.join)
    .fillna(0)
)
# print(table_regional)

In [10]:
# And the same grid once more for local rows typed 'Holiday'.
is_local_holiday = holidays['locale'].eq('Local') & holidays['type'].eq('Holiday')
Local = holidays[is_local_holiday]
table_local = (
    Local
    .pivot_table(index='month-day', columns='year', values='description', aggfunc=', '.join)
    .fillna(0)
)
# print(table_local)

Not all national holidays fall on the same date every year. \
Regional holidays fall on the same date every year. 

In [11]:
# Number of distinct national holiday descriptions.
national['description'].unique().size

10

Local holidays vary wildly. 
The Fundacion de Guayaquil is celebrated in a different way every year. 

# Summary on holiday types 

Because of how complicated the different holidays are, it's easier to use holidays_events.csv as a lookup table and not try to predict the holiday dates. 

Treat every description separately. 
To deal with 2012-12-31, label it as "Primer dia del ano" for now, since New Year's Eve affects more than just a "bridge day". (If anything goes wrong, this is the "error" we introduce.)



In [12]:


# Holiday types that count as an actual day off ("celebrated").
CELEBRATED_TYPES = ['Holiday', 'Bridge', 'Additional', 'Transfer']


def _holidays_for_locale(calendar, locale, name_col, celebrated_col):
    """Return one locale's holiday rows with a name column and a 'celebrated' flag.

    Parameters
    ----------
    calendar : pd.DataFrame
        Holiday calendar; must provide the 'date', 'type', 'locale',
        'locale_name', 'description' and 'transferred' columns.
    locale : str
        Value of the 'locale' column to keep (e.g. 'Local').
    name_col : str
        Name of the output column that receives the holiday description.
    celebrated_col : str
        Name of the output boolean column; True when the row's type is in
        CELEBRATED_TYPES and the row is not flagged as transferred.

    Returns
    -------
    pd.DataFrame
        Columns ['date', name_col, 'locale_name', celebrated_col].
    """
    # Work on an explicit copy: assigning columns onto a filtered slice triggers
    # SettingWithCopyWarning (silenced here by the global warnings filter) and
    # leaves it ambiguous which frame is being written to.
    subset = calendar[calendar.locale == locale].copy()
    subset[name_col] = subset['description']
    subset[celebrated_col] = subset['type'].isin(CELEBRATED_TYPES)
    # A holiday that was moved to another date is not celebrated on this date.
    subset.loc[subset['transferred'] == True, celebrated_col] = False
    return subset[['date', name_col, 'locale_name', celebrated_col]]


# One table per locale level; the column names are kept exactly as before.
Local_holidays = _holidays_for_locale(holidays, 'Local', 'local_holiday', 'Local_celebrated')
Regional_holidays = _holidays_for_locale(holidays, 'Regional', 'Regional_holiday', 'Regional_celebrated')
National_holidays = _holidays_for_locale(holidays, 'National', 'National_holiday', 'National_celebrated')


# Study the effect of holidays 

Used the "new" merge for EDA purposes. 

In [None]:
# Load the combined training table built with the corrected holiday merge.
# (An earlier version of this cell read 'combined-2.csv' instead.)
df = pd.read_csv(f"{datapath}merged_train_alt_new.csv")

In [24]:
# Uncomment this to see a summary of the df. 
#utility.summary(df)
#utility.df_info(df)

In [25]:
# Distinct calendar/location rows that carry any national, regional or local holiday.
holiday_columns = [
    'date', 'year', 'month', 'day', 'day_of_week', 'city', 'state',
    'hol_Nat', 'hol_Nat_name', 'hol_Reg', 'hol_Reg_name',
    'hol_Loc', 'hol_loc_name', 'transferred',
]
has_any_holiday = df['hol_Nat'].eq(1) | df['hol_Reg'].eq(1) | df['hol_Loc'].eq(1)
df_hol = df.loc[has_any_holiday, holiday_columns].drop_duplicates()

# Is there any holiday in August after the 15th?
late_august = df_hol['month'].eq(8) & df_hol['day'].gt(15)
print(df_hol[late_august])

               date  year  month  day  day_of_week    city       state  \
419265   2013-08-24  2013      8   24            5  Ambato  Tungurahua   
1067913  2014-08-24  2014      8   24            6  Ambato  Tungurahua   
1716561  2015-08-24  2015      8   24            0  Ambato  Tungurahua   
2366991  2016-08-24  2016      8   24            2  Ambato  Tungurahua   

         hol_Nat hol_Nat_name  hol_Reg hol_Reg_name  hol_Loc  \
419265       0.0          NaN      0.0          NaN      1.0   
1067913      0.0          NaN      0.0          NaN      1.0   
1716561      0.0          NaN      0.0          NaN      1.0   
2366991      0.0          NaN      0.0          NaN      1.0   

                hol_loc_name  transferred  
419265   Fundacion de Ambato        False  
1067913  Fundacion de Ambato        False  
1716561  Fundacion de Ambato        False  
2366991  Fundacion de Ambato        False  


In [17]:
# Cross-check against the original holiday calendar.
holidays.loc[holidays['month-day'].eq('08-24')]

Unnamed: 0,date,type,locale,locale_name,description,transferred,year,month-day
16,2012-08-24,Holiday,Local,Ambato,Fundacion de Ambato,False,2012,08-24
69,2013-08-24,Holiday,Local,Ambato,Fundacion de Ambato,False,2013,08-24
132,2014-08-24,Holiday,Local,Ambato,Fundacion de Ambato,False,2014,08-24
187,2015-08-24,Holiday,Local,Ambato,Fundacion de Ambato,False,2015,08-24
271,2016-08-24,Holiday,Local,Ambato,Fundacion de Ambato,False,2016,08-24
327,2017-08-24,Holiday,Local,Ambato,Fundacion de Ambato,False,2017,08-24


In [26]:
# Which stores observe 'Fundacion de Ambato'?
df.loc[df['hol_loc_name'].eq('Fundacion de Ambato'), 'store_nbr'].unique()

array([23, 50])

## Observation 1:
For kaggle competition purposes: 

We only have to predict for 2017 Aug 16 - Aug 31, only **Aug 24** is a local holiday. 

And it is a **Thursday**. 

Only the city Ambato is affected with stores #23 and #50. 

## Other holiday effects 

### 1. Do all holidays affect sales for all the items in the same way?


In [None]:
#df_hol_all = df[(df.hol_Nat == 1) | (df.hol_Reg == 1) | (df.hol_Loc == 1)  ]
#df_hol_true = df_hol_all[(df_hol_all.hol_type_Holiday == 1) & (df_hol_all.transferred.isin([False, 'False']) ) ]  # Only those holidays + celebrated


In [None]:
# Check holidays:
#df_hol_all.transferred.unique()
# The transferred is not right. So for the following code I figured it manually

In [None]:
# print(df_hol_true)
#utility.summary(df_hol_all)

For the following plot: 

- For each holiday, calculate the average sales on that holiday for the stores involved.
- Calculate the average sales for the stores involved (in that state/city) on days that are not a holiday/event.
- Calculate (holiday sale / non_holiday sale) 

This process is shown for each item separately then shown in figure.


It's important to note that I use only non-zero values for average calculation. Most of the zero sales indicate the specific family is not for sale at that time. Averaging over zeros can cause errors. 

In [27]:
# Compare every regional holiday with ordinary days in the state that observes it.
# "Ordinary" rows carry no national/regional/local holiday flag and no event flag.
df_non_hol = df[(df.hol_Nat == 0) & (df.hol_Reg == 0) & (df.hol_Loc == 0) & (df.event == 0)]
df_hol_all = df[(df.hol_Nat == 1) | (df.hol_Reg == 1) | (df.hol_Loc == 1)]

all_items = df['family'].unique()

# Averages use non-zero sales only. Each mask is built on the frame it filters,
# and the per-family means come from a single groupby per holiday instead of
# rescanning the full table for every (holiday, family) pair.
hol_nonzero = df_hol_all[df_hol_all.sales != 0]
base_nonzero = df_non_hol[df_non_hol.sales != 0]

regional_ratios = {}
# Rows without a regional holiday name are skipped.
for hol in df_hol_all.hol_Reg_name.dropna().unique():
    # Use the first state listed for this holiday as its comparison region.
    state = df_hol_all.loc[df_hol_all.hol_Reg_name == hol, 'state'].unique()[0]
    hol_mean = hol_nonzero[hol_nonzero.hol_Reg_name == hol].groupby('family')['sales'].mean()
    base_mean = base_nonzero[base_nonzero.state == state].groupby('family')['sales'].mean()
    # Ratio of holiday to ordinary-day average; NaN where either side has no data.
    regional_ratios[hol] = (hol_mean / base_mean).reindex(all_items)

# One row per family (kept as a 'family' column), one column per holiday.
df_sale_regional = (
    pd.DataFrame(regional_ratios, index=all_items)
    .rename_axis('family')
    .reset_index()
)


Empty holiday name: nan


KeyboardInterrupt: 

In [None]:
#df_sale_regional

# Move 'family' into the index once; the guard keeps this cell re-runnable.
if 'family' in df_sale_regional.columns:
    df_sale_regional = df_sale_regional.set_index('family')

plt.figure(figsize=(12, 8))
# vmin/vmax are symmetric around 1.0, so a ratio of 1 ("no change") sits at the
# midpoint of the diverging colormap.
sns.heatmap(df_sale_regional, annot=True, cmap='coolwarm', fmt=".2f", vmin=-0.5, vmax=2.5)
plt.title("Ratio of Sales During Regional Holidays versus Normal days", fontsize=16)
plt.xlabel("Holiday")
plt.ylabel("Family")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()

### Regional holidays:

The **Provincialización de Cotopaxi** is a holiday celebrated in the Cotopaxi province of Ecuador on April 1 each year. It marks the anniversary of Cotopaxi being declared a province, which happened on April 1, 1851.

This holiday boosts almost all sales. 


The Provincialización for the other states didn't seem to affect the sales as much. 

But provincializacion for Santo Domingo has a strong sales boost for books. Could be due to lack of useful data points. 



### Local Holidays 

In [None]:

# Compare every local holiday with ordinary days in the city that observes it.
# Reuses df_hol_all and df_non_hol from the regional-holiday cell.
df_hol_true = df_hol_all[df_hol_all.transferred == False]  # drop transferred holidays

all_items = df['family'].unique()

# Averages use non-zero sales only. Each mask is built on the frame it filters,
# and the per-family means come from a single groupby per holiday instead of
# rescanning the full table for every (holiday, family) pair.
hol_nonzero = df_hol_true[df_hol_true.sales != 0]
base_nonzero = df_non_hol[df_non_hol.sales != 0]

local_ratios = {}
# Rows without a local holiday name are skipped.
for hol in df_hol_true.hol_loc_name.dropna().unique():
    # Use the first city listed for this holiday as its comparison area.
    city = df_hol_true.loc[df_hol_true.hol_loc_name == hol, 'city'].unique()[0]
    hol_mean = hol_nonzero[hol_nonzero.hol_loc_name == hol].groupby('family')['sales'].mean()
    base_mean = base_nonzero[base_nonzero.city == city].groupby('family')['sales'].mean()
    # Ratio of holiday to ordinary-day average; NaN where either side has no data.
    local_ratios[hol] = (hol_mean / base_mean).reindex(all_items)

# One row per family (kept as a 'family' column), one column per holiday.
df_sale_local = (
    pd.DataFrame(local_ratios, index=all_items)
    .rename_axis('family')
    .reset_index()
)



In [None]:
# uncomment this to see the actual dataset
#df_sale_local

In [None]:
#df_sale_regional



# Move 'family' into the index once; the guard keeps this cell re-runnable.
if 'family' in df_sale_local.columns:
    df_sale_local = df_sale_local.set_index('family')

plt.figure(figsize=(12, 8))
# vmin/vmax are symmetric around 1.0, so a ratio of 1 ("no change") sits at the
# midpoint of the diverging colormap.
sns.heatmap(df_sale_local, annot=True, cmap='coolwarm', fmt=".2f",
            vmin=-1, vmax=3)
plt.title("Ratio of Sales During Local Holidays vs Non holidays", fontsize=16)
plt.xlabel("Holiday")
plt.ylabel("Family")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()

### Local holiday summary

There are some isolated sales boosts that may or may not be related to the holiday itself. For example, school supplies sales are highly dependent on the time of the year. 

fundacion de quito has a huge boost for sales of all kind: 
The Foundation of Quito is celebrated on December 6, commemorating the founding of the city by Spanish conquistadors in 1534. Known as "Fiestas de Quito," this is one of the most vibrant and widely celebrated holidays in Ecuador.

In general, liquor and wine can be boosted by holidays, but not always. Celebration sales are not boosted as expected. 

A lot of these holidays involve street parties, parades, and other celebrations. The sales boost from preparing for a holiday may happen before the holiday itself. 

Another possibility is that the supplies for these holidays are not purchased in grocery stores. 

- Conclusion: 
  
  The correlation between local holidays and sales is not simple or straightforward, and the effect can happen before the holidays. 



In [None]:
# Compare every (non-transferred) national holiday with ordinary days nationwide.
# Reuses df_non_hol from the regional-holiday cell and df_hol_true from the
# local-holiday cell.
all_items = df['family'].unique()

# The ordinary-day baseline does not depend on the holiday, so compute it once.
# Averages use non-zero sales only, with the mask built on the frame it filters.
base_mean = df_non_hol[df_non_hol.sales != 0].groupby('family')['sales'].mean()
hol_nonzero = df_hol_true[df_hol_true.sales != 0]

national_ratios = {}
# Rows without a national holiday name are skipped.
for hol in df_hol_true.hol_Nat_name.dropna().unique():
    hol_mean = hol_nonzero[hol_nonzero.hol_Nat_name == hol].groupby('family')['sales'].mean()
    # Ratio of holiday to ordinary-day average; NaN where either side has no data.
    national_ratios[hol] = (hol_mean / base_mean).reindex(all_items)

# One row per family (kept as a 'family' column), one column per holiday.
df_sale_nat = (
    pd.DataFrame(national_ratios, index=all_items)
    .rename_axis('family')
    .reset_index()
)


In [None]:


# Move 'family' into the index once; the guard keeps this cell re-runnable.
if 'family' in df_sale_nat.columns:
    df_sale_nat = df_sale_nat.set_index('family')

plt.figure(figsize=(12, 8))
# vmin/vmax are symmetric around 1.0, so a ratio of 1 ("no change") sits at the
# midpoint of the diverging colormap.
sns.heatmap(df_sale_nat, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=3)
plt.title("Ratio of Sales During National Holidays vs Non holidays (Not Transferred)", fontsize=16)
plt.xlabel("Holiday")
plt.ylabel("Family")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()

### National holidays 

The correlation is much more obvious. 

- There's a sale boost for frozen food during the Christmas-new year season. Also for kitchen and grocery stuff. People are all doing family cooking at this time. 
- Almost all sales are boosted slightly during the Christmas-new year season. 
- There's a strong sale increase for baby care at recupero puente navidad (the holiday bridge), not sure if there's any reason for this. 
- Liquor, wine and beer got boosted sales for almost all holidays. Especially for "primer dia de ano ", the new year.
- The sales increase for school supplies and garden supplies are likely due to seasonality rather than holiday itself.

In [None]:
# Compare events with ordinary days nationwide.
# Reuses df_non_hol from the regional-holiday cell.
df_hol_event = df[df.event == 1]

all_items = df['family'].unique()

# The ordinary-day baseline does not depend on the event, so compute it once.
# Averages use non-zero sales only, with the mask built on the frame it filters.
base_mean = df_non_hol[df_non_hol.sales != 0].groupby('family')['sales'].mean()
event_nonzero = df_hol_event[df_hol_event.sales != 0]

# Only the first 13 distinct event names (in order of appearance) are handled
# here; the remaining names are processed in a later cell.
event_names = [name for name in df_hol_event.hol_event_name.unique()[:13] if not pd.isna(name)]

event_ratios = {}
for hol in event_names:
    hol_mean = event_nonzero[event_nonzero.hol_event_name == hol].groupby('family')['sales'].mean()
    # Ratio of event to ordinary-day average; NaN where either side has no data.
    event_ratios[hol] = (hol_mean / base_mean).reindex(all_items)

# One row per family (kept as a 'family' column), one column per event.
df_sale_event = (
    pd.DataFrame(event_ratios, index=all_items)
    .rename_axis('family')
    .reset_index()
)



In [None]:


# Move 'family' into the index once; the guard keeps this cell re-runnable.
if 'family' in df_sale_event.columns:
    df_sale_event = df_sale_event.set_index('family')

plt.figure(figsize=(12, 8))
# vmin/vmax are symmetric around 1.0, so a ratio of 1 ("no change") sits at the
# midpoint of the diverging colormap.
sns.heatmap(df_sale_event, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=3)
plt.title("Ratio of Sales During Events vs Non holidays", fontsize=16)
plt.xlabel("Event")  # columns are event names, not holidays
plt.ylabel("Family")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()

### Event effects 

- For football games, people like to purchase more drinks, especially alcohol. 

In [None]:

# Compare the remaining events with ordinary days nationwide.
# Reuses df_non_hol from the regional-holiday cell.
df_hol_event = df[df.event == 1]

all_items = df['family'].unique()

# The ordinary-day baseline does not depend on the event, so compute it once.
# Averages use non-zero sales only, with the mask built on the frame it filters.
base_mean = df_non_hol[df_non_hol.sales != 0].groupby('family')['sales'].mean()
event_nonzero = df_hol_event[df_hol_event.sales != 0]

# Distinct event names from position 13 onward (in order of appearance).
# NOTE(review): presumably these are the earthquake-related entries, since the
# resulting table is plotted under an "Earth Quake" title - confirm.
event_names = [name for name in df_hol_event.hol_event_name.unique()[13:] if not pd.isna(name)]

eq_ratios = {}
for hol in event_names:
    hol_mean = event_nonzero[event_nonzero.hol_event_name == hol].groupby('family')['sales'].mean()
    # Ratio of event to ordinary-day average; NaN where either side has no data.
    eq_ratios[hol] = (hol_mean / base_mean).reindex(all_items)

# One row per family (kept as a 'family' column), one column per event.
df_sale_eq = (
    pd.DataFrame(eq_ratios, index=all_items)
    .rename_axis('family')
    .reset_index()
)



In [28]:


# Move 'family' into the index once; the guard keeps this cell re-runnable.
# Requires df_sale_eq from the previous cell to exist.
if 'family' in df_sale_eq.columns:
    df_sale_eq = df_sale_eq.set_index('family')

plt.figure(figsize=(12, 8))
# vmin/vmax are symmetric around 1.0, so a ratio of 1 ("no change") sits at the
# midpoint of the diverging colormap.
sns.heatmap(df_sale_eq, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=3)
plt.title("Ratio of Sales During Earth Quake vs Non Holidays", fontsize=16)
plt.xlabel("Event")  # columns are event names, not holidays
plt.ylabel("Family")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()

NameError: name 'df_sale_eq' is not defined

Events: earth quake: 

I don't see a huge decrease of sales for earth quake. 

Check the same for the local festivals 

# Plot to see it all 

The following plot marks the sales overtime by family  with labels of events and national holidays. 

Since it's the sales for all the stores, I didn't include regional or local holidays. 

"On promotion" would be good information to add as well, but promotion is also store-dependent. 

In [None]:
# holidays[holidays.type == 'Event']

In [None]:
# Prepare weekday labels for the interactive plot below.

# Map numeric day_of_week to abbreviated names
day_mapping = {
    0: 'Mon',
    1: 'Tue',
    2: 'Wed',
    3: 'Thu',
    4: 'Fri',
    5: 'Sat',
    6: 'Sun'
}

# Convert only while the column is still numeric: mapping the already-converted
# names a second time would turn every value into NaN when the cell is re-run.
if pd.api.types.is_numeric_dtype(df['day_of_week']):
    df['day_of_week'] = df['day_of_week'].map(day_mapping)

In [None]:
# Ensure 'date' is in datetime format
df['date'] = pd.to_datetime(df['date'])


def _add_vertical_markers(fig, markers, label_col, y_top, color, opacity):
    """Add one dashed vertical line per row of `markers` to `fig`.

    Parameters
    ----------
    fig : plotly.graph_objects.Figure
        Figure that receives the traces (modified in place).
    markers : pd.DataFrame
        Rows to mark; must contain a datetime 'date' column and `label_col`.
    label_col : str
        Column whose value is shown, together with the date, as hover text.
    y_top : float
        Upper end of each vertical line (lines start at 0).
    color : str
        Line colour.
    opacity : float
        Trace opacity (0.0 makes the line invisible).
    """
    for _, row in markers.iterrows():
        marker_date = row['date']
        fig.add_trace(go.Scatter(
            x=[marker_date, marker_date],
            y=[0, y_top],
            mode='lines',
            line=dict(color=color, dash='dash'),
            hoverinfo='text',
            text=f"{row[label_col]} ({marker_date.date()})",
            opacity=opacity,
            showlegend=False  # Disable legend for this trace
        ))


# The marker tables do not depend on the product family, so build them once
# instead of re-filtering the full frame on every loop iteration.
event_data = df[df['event'] == 1][['date', 'hol_event_name']].drop_duplicates()
holiday_data = df[df['hol_Nat'] == 1][['date', 'hol_Nat_name']].drop_duplicates()
week_data = df[['date', 'day_of_week']].drop_duplicates()

# One interactive figure per product family
for item in df['family'].unique():
    # Total sales per date for this family
    grouped_sales = df[df['family'] == item].groupby('date')['sales'].sum().reset_index()
    y_top = grouped_sales['sales'].max()

    fig = go.Figure()

    # Sales as a line
    fig.add_trace(go.Scatter(
        x=grouped_sales['date'],
        y=grouped_sales['sales'],
        mode='lines',
        name='Sales',
        line=dict(color='blue'),
        showlegend=False  # Disable legend for this trace
    ))

    # Events: red lines at 30% opacity
    _add_vertical_markers(fig, event_data, 'hol_event_name', y_top, 'red', 0.3)
    # National holidays: green lines at 30% opacity
    _add_vertical_markers(fig, holiday_data, 'hol_Nat_name', y_top, 'green', 0.3)
    # Weekday of every date: fully transparent lines, presumably so that only
    # their hover text (the weekday name) is surfaced.
    _add_vertical_markers(fig, week_data, 'day_of_week', y_top, 'green', 0.0)

    # Update layout for interactivity and appearance
    fig.update_layout(
        title=f"{item} Sales Over Time",
        xaxis_title="Date",
        yaxis_title="Sales",
        xaxis=dict(showgrid=True),
        yaxis=dict(showgrid=True),
        template="plotly_white",
        hovermode="x unified",  # Show tooltips for all traces at the same x-coordinate
        hoverdistance=1,
        spikedistance=1,
        showlegend=False  # Disable legend for the entire figure
    )

    # Show the interactive plot
    fig.show()


Interesting findings: 

- School supplies sales have strong seasonality, and there's a trend of a huge sales increase this year. 
- There's a peak for lingerie during the earthquake. Not sure what happened. 

## Other topics:

How did black friday affect the sales? 

In [None]:
# Calendar rows described as 'Black Friday'.
holidays.loc[holidays['description'].eq('Black Friday')]

Notice that Black Friday was only introduced in 2014. 
I expect its effect on sales might change over the years. 

In [None]:
# Training rows whose event name is 'Black Friday'.
df_blackfriday = df.loc[df['hol_event_name'].eq('Black Friday')]

In [None]:
# Compare Black Friday sales, year by year, with ordinary days nationwide.
# Reuses df_non_hol from the regional-holiday cell.
all_items = df['family'].unique()

# The ordinary-day baseline does not depend on the year, so compute it once.
# Averages use non-zero sales only, with the mask built on the frame it filters.
# NOTE(review): the baseline pools all years, so any overall sales trend is
# mixed into the year-to-year comparison.
base_mean = df_non_hol[df_non_hol.sales != 0].groupby('family')['sales'].mean()
bf_nonzero = df_blackfriday[df_blackfriday.sales != 0]

bf_ratios = {}
for year in df_blackfriday.year.unique():
    year_mean = bf_nonzero[bf_nonzero.year == year].groupby('family')['sales'].mean()
    # Ratio of Black Friday to ordinary-day average; NaN where either side has no data.
    bf_ratios[year] = (year_mean / base_mean).reindex(all_items)

# One row per family (kept as a 'family' column), one column per year.
df_sale_bf = (
    pd.DataFrame(bf_ratios, index=all_items)
    .rename_axis('family')
    .reset_index()
)


In [None]:


# Move 'family' into the index once; the guard keeps this cell re-runnable.
if 'family' in df_sale_bf.columns:
    df_sale_bf = df_sale_bf.set_index('family')

plt.figure(figsize=(12, 8))
# vmin/vmax are symmetric around 1.0, so a ratio of 1 ("no change") sits at the
# midpoint of the diverging colormap.
sns.heatmap(df_sale_bf, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=3)
plt.title("Ratio of Sales During BlackFriday vs Non Holidays (Over the years)", fontsize=16)
plt.xlabel("Year")  # columns are years, not holiday names
plt.ylabel("Family")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()

Not much sales change.

Black Friday didn't change the sales too much. Grocery might not be the main shopping choice for black friday. 