In [135]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings("ignore")
import holidays
import statsmodels as sm
from scipy.stats import ttest_ind
from datetime import timedelta, datetime
# Five six-digit hex colour codes, stored without a leading "#".
# NOTE(review): matplotlib/seaborn colour specs normally require the "#" prefix —
# confirm how this list is consumed before passing it to a plotting call.
palette1 = ["efefef","3454d1","34d1bf","00a6fb","d1345b"]

In [136]:
# Load the three assignment tables and the promotion calendar from the working
# directory. `parse_dates` asks pandas to parse the listed columns as dates.
df = pd.read_csv("assignment4.1a.csv", parse_dates=["Date"])
df2 = pd.read_csv("assignment4.1b.csv",parse_dates=["Date"])
df3 = pd.read_csv("assignment4.1c.csv")
df_promo = pd.read_csv("PromotionDates.csv",parse_dates=["StartDate","EndDate"], dayfirst=False)
# Rewrite four day-first date strings (d/m/Y) into month-first form (m/d/Y).
# NOTE(review): this only has an effect if StartDate/EndDate are still plain
# strings here (i.e. parse_dates left the mixed-format columns unparsed) —
# confirm the column dtypes after read_csv.
df_promo.replace({"1/9/2015":"9/1/2015",
             "6/9/2015":"9/6/2015",
             "20/11/2015":"11/20/2015",
             "27/11/2015":"11/27/2015"},inplace=True)

In [137]:
tr_hol = holidays.country_holidays("TR")


def _days_in_periods(periods):
    """Return a list with every calendar day covered by the given promotion periods.

    Each period is looked up by its ``Period`` value in ``df_promo`` and expanded
    from ``StartDate`` to ``EndDate`` (both inclusive) with ``pd.date_range``.
    """
    days = []
    for period in periods:
        row = df_promo[df_promo.Period == period]
        start = pd.to_datetime(row["StartDate"].values[0])
        end = pd.to_datetime(row["EndDate"].values[0])
        days.extend(pd.date_range(start=start, end=end))
    return days


all_periods = df_promo.Period.tolist()

# Every day that falls inside any promotion period.
promo_dates = _days_in_periods(all_periods)

# Days of the first four promotion periods only.
# FIX: the original loop appended these days to `promo_dates` again, which left
# `ff_promo_dates` empty and duplicated the first four periods in `promo_dates`.
ff_promo_dates = _days_in_periods(all_periods[:4])

# Every promotion day that is a Saturday/Sunday (dayofweek 5/6), shifted one week ahead.
first_weekend_after_promo = [x+timedelta(days=7) for x in promo_dates if x.dayofweek in [5,6]]

sd= df.Date.min()
ed= df.Date.max()

# Split the days spanned by `df` into promotion and non-promotion days.
first_data_range = pd.date_range(start=sd,end=ed)
_promo_day_lookup = set(promo_dates)  # set membership instead of scanning the list per day
non_promo_dates = [x for x in first_data_range if x not in _promo_day_lookup]
non_promo_data = df[df.Date.isin(non_promo_dates)]
promo_data = df[~df.Date.isin(non_promo_dates)]
non_promo_data.Date.nunique(),promo_data.Date.nunique()


(179, 33)

In [138]:
def label(data=None, criterium="StoreCode"):
    """Label every ``criterium`` value as Slow / Medium / Fast by total sales.

    ``SalesQuantity`` is summed per ``criterium`` value; totals up to the 33rd
    percentile are labelled "Slow", totals up to the 66th percentile "Medium",
    and everything above "Fast".

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with a ``criterium`` column and a ``SalesQuantity`` column.
        When omitted, the notebook-level ``non_promo_data`` is used.
    criterium : str
        Column to group by, e.g. "StoreCode" or "ProductCode".

    Returns
    -------
    pandas.DataFrame
        Two columns: ``criterium`` and ``f"{criterium}_labels"``.
    """
    if data is None:
        # Looked up at call time, which is what the original body did: it read
        # the global directly and silently ignored the `data` argument.
        data = non_promo_data

    totals = data.groupby(criterium)["SalesQuantity"].sum().reset_index()
    low = totals["SalesQuantity"].quantile(0.33)
    high = totals["SalesQuantity"].quantile(0.66)

    label_column = f"{criterium}_labels"
    # NOTE: pd.cut raises if `low == high` (duplicate bin edges), e.g. when
    # most groups share the same total.
    totals[label_column] = pd.cut(
        totals["SalesQuantity"],
        bins=[-np.inf, low, high, np.inf],
        labels=["Slow", "Medium", "Fast"],
    )
    return totals[[criterium, label_column]]
# Speed labels are derived from non-promotion sales only.
store_groups = label(data=non_promo_data, criterium="StoreCode")
product_groups = label(criterium="ProductCode", data=non_promo_data)

In [139]:
# Left-join the df3 columns onto every row of df, matching on ProductCode.
merged = df.merge(df3, on="ProductCode", how="left")
merged.head(3)

Unnamed: 0,Date,StoreCode,ProductCode,SalesQuantity,ProductGroup1,ProductGroup2
0,2015-01-01,8,9,-1,H,15
1,2015-01-01,131,9,1,H,15
2,2015-01-01,144,9,2,H,15


In [140]:
# Attach the store and product speed labels, then drop every row that has a
# missing value in any column.
# NOTE: this rebinds `merged`; re-running the cell without re-running the
# previous one merges the label columns a second time.
merged = (
    merged
    .merge(store_groups, how="left", on="StoreCode")
    .merge(product_groups, how="left", on="ProductCode")
    .dropna()
)
merged.head(3)

Unnamed: 0,Date,StoreCode,ProductCode,SalesQuantity,ProductGroup1,ProductGroup2,StoreCode_labels,ProductCode_labels
0,2015-01-01,8,9,-1,H,15,Medium,Slow
1,2015-01-01,131,9,1,H,15,Fast,Slow
2,2015-01-01,144,9,2,H,15,Slow,Slow


In [141]:
# Map each product label found in `merged` to the distinct product codes carrying it.
product_types = {}
for pcode in merged["ProductCode_labels"].unique():
    pcodes = merged.loc[merged["ProductCode_labels"] == pcode, "ProductCode"].unique()
    product_types[pcode] = pcodes

# Same mapping for stores: label -> distinct store codes.
store_types = {}
for scode in merged["StoreCode_labels"].unique():
    scodes = merged.loc[merged["StoreCode_labels"] == scode, "StoreCode"].unique()
    store_types[scode] = scodes


#### Q1-c) Which items experienced the biggest sales increase during promotions?

In [142]:
import math

# Convert the number of distinct days in each data set into a week count.
# math.ceil rounds up, so a partial week is counted as a full one.
promo_day_count = promo_data["Date"].nunique()
non_promo_day_count = non_promo_data["Date"].nunique()
promotion_weeks = math.ceil(promo_day_count / 7)
non_promotion_weeks = math.ceil(non_promo_day_count / 7)
promotion_weeks, non_promotion_weeks

(5, 26)

In [143]:
def format_decimals(x):
    """Render ``x`` as a fixed-point string with three decimal places."""
    return f"{x:.3f}"
def compare_groups(container,code):
    """Compare average weekly sales inside and outside promotions per subgroup.

    Parameters
    ----------
    container : dict
        Maps a subgroup label to the collection of codes that belong to it.
    code : str
        Column holding those codes ("ProductCode" or "StoreCode"); its last
        four characters are stripped to build the row labels.

    Returns
    -------
    pandas Styler
        One row per subgroup with the non-promotion average, the promotion
        average and the percentage change; the largest change is highlighted.

    Reads the notebook-level ``promo_data``, ``non_promo_data``,
    ``promotion_weeks`` and ``non_promotion_weeks``.
    """
    entity = code[:-4]
    rows = {}
    for subgroup, members in container.items():
        promo_total = promo_data.loc[promo_data[code].isin(members), "SalesQuantity"].sum()
        base_total = non_promo_data.loc[non_promo_data[code].isin(members), "SalesQuantity"].sum()
        rows[f'{subgroup} {entity}s'] = [
            promo_total / promotion_weeks,
            base_total / non_promotion_weeks,
        ]

    table = pd.DataFrame(rows, index=["Promotion_avg", "Nonpromotion_avg"]).T
    table["PrcntChange"] = ((table["Promotion_avg"] / table["Nonpromotion_avg"]) - 1) * 100
    table = table[["Nonpromotion_avg", "Promotion_avg", "PrcntChange"]]
    styled = table.style.highlight_max("PrcntChange", color="green", axis=0)
    return styled.format(format_decimals)
compare_groups(product_types,"ProductCode")

Unnamed: 0,Nonpromotion_avg,Promotion_avg,PrcntChange
Slow Products,1039.654,1213.2,16.693
Medium Products,17852.5,19900.0,11.469
Fast Products,111856.423,140860.2,25.929


#### Q1-d) Are there stores that react more strongly to promotions?

In [144]:
# Same comparison as for products, grouped by store speed label.
compare_groups(container=store_types, code="StoreCode")

Unnamed: 0,Nonpromotion_avg,Promotion_avg,PrcntChange
Medium Stores,35268.077,43183.8,22.444
Fast Stores,73537.077,92123.8,25.275
Slow Stores,21943.423,26665.8,21.521


#### Q1-f)  Is there any significant difference between promotion impacts of the Fast versus Slow items?
#### Q1-g)  Is there any significant difference between promotion impacts of the Fast versus Slow stores? 

In [145]:
def is_it_significant(container, subgroups, code):
    """Test whether promotions affect two subgroups differently.

    For every entity in each of the two subgroups, the percentage change
    between its average weekly sales during promotions and outside promotions
    is computed. The two samples of percentage changes are then compared with
    an independent two-sample t-test (``scipy.stats.ttest_ind``).

    Parameters
    ----------
    container : dict
        Maps a subgroup label (e.g. "Fast") to the codes belonging to it.
    subgroups : sequence of str
        The two subgroup labels to compare, e.g. ``["Fast", "Slow"]``.
    code : str
        Column identifying the entity ("ProductCode" or "StoreCode"); its last
        four characters are stripped for the printed message.

    Returns
    -------
    bool-like
        True when the t-test p-value is below 0.05.

    Reads the notebook-level ``promo_data``, ``non_promo_data``,
    ``promotion_weeks`` and ``non_promotion_weeks``; prints the question, the
    verdict and the p-value.
    """
    # Compare exactly the two requested subgroups (the original hard-coded
    # "Fast" and "Slow" here regardless of `subgroups`).
    first, second = subgroups[0], subgroups[1]

    # Aggregate once per entity instead of re-filtering both frames per code.
    promo_totals = promo_data.groupby(code)["SalesQuantity"].sum()
    non_promo_totals = non_promo_data.groupby(code)["SalesQuantity"].sum()

    changes = {}
    for subgroup in (first, second):
        members = container[subgroup]
        # Entities absent from a frame have a total of 0, as in a filtered sum.
        promo_weekly = promo_totals.reindex(members, fill_value=0).to_numpy() / promotion_weeks
        non_promo_weekly = non_promo_totals.reindex(members, fill_value=0).to_numpy() / non_promotion_weeks

        with np.errstate(divide="ignore", invalid="ignore"):
            pct_change = ((promo_weekly / non_promo_weekly) - 1) * 100

        # Zero non-promotion sales yields NaN (0/0) or +/-inf (x/0); neither is
        # a usable percentage, and a single inf would make the p-value NaN.
        changes[subgroup] = pct_change[np.isfinite(pct_change)].tolist()

    rez = ttest_ind(changes[first], changes[second])
    p_value = rez[1]
    significant = p_value < 0.05

    print(f"Is there a significant difference in the effect of promotions on {subgroups[0]} and {subgroups[1]} {code[:-4]}s?")
    if significant:
        print("Yes")
    else:
        print("No")
    print("p value: {:.2g}".format(p_value))

    return significant

# Compare Fast vs Slow subgroups, first for products and then for stores.
is_it_significant(product_types, ["Fast", "Slow"], "ProductCode")
is_it_significant(store_types, ["Fast", "Slow"], "StoreCode")


Is there a significant difference in the effect of promotions on Fast and Slow Products?
Yes
p value: 0.032
Is there a significant difference in the effect of promotions on Fast and Slow Stores?
Yes
p value: 0.027


True