In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.stats import ttest_ind
from scipy.stats import f_oneway
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the per-day training records and the per-store metadata from disk.
# low_memory=False asks pandas to parse each file in one pass instead of in
# chunks, which avoids mixed-type inference within a column.
csv_options = {"low_memory": False}
train_df = pd.read_csv('DATASETS/train.csv', **csv_options)
store_df = pd.read_csv('DATASETS/store.csv', **csv_options)

In [None]:
# Attach the store-level attributes to every training row.
# how='left' keeps every row of train_df even when a store has no metadata.
# validate='many_to_one' makes pandas raise a MergeError if store_df ever
# contains a duplicated 'Store' key, which would otherwise silently
# duplicate training rows and skew every statistic computed below.
df = pd.merge(train_df, store_df, on='Store', how='left', validate='many_to_one')

# Display the dtypes pandas inferred, before any conversion is applied.
df.dtypes

In [None]:
# Report the size of the merged frame: row count first, then column count.
print(f"Rows: {df.shape[0]}", f"Columns: {df.shape[1]}", sep="\n")

In [None]:
# Parse the 'Date' column into pandas datetime values.
df['Date'] = df['Date'].pipe(pd.to_datetime)

In [None]:
# Label-encode the categorical columns.
# Each fitted encoder is kept in `label_encoders` so the integer -> original
# label mapping can be looked up afterwards (label_encoders[col].classes_)
# instead of being assumed when interpreting results.
# NOTE: re-running this cell on already-encoded data refits the encoders on
# the stringified integer codes, so inspect classes_ after a fresh run.
cols = ['StateHoliday', 'StoreType', 'Assortment', 'PromoInterval']
label_encoders = {}
for col in cols:
    encoder = LabelEncoder()
    # Cast to str first so every value is encoded as an ordinary label;
    # missing values are stringified too and therefore get their own code.
    df[col] = encoder.fit_transform(df[col].astype(str))
    label_encoders[col] = encoder

In [None]:
# Cast the flag columns to plain integers in a single vectorised assignment.
cols = ['Open', 'Promo', 'Promo2', 'SchoolHoliday']
df[cols] = df[cols].astype(int)

In [None]:
# Re-check the column dtypes now that the date parsing, label encoding and
# integer casts above have been applied.
df.dtypes

In [None]:
# Preview the first rows of the merged and converted frame.
df.head()

# Hypothesis Testing

## Conclusion 1: Stores with ongoing promotions have higher sales.

**Statistical method**: Two-sample t-test

**Null (H0)**: The mean sales for stores with Promo = 0 is equal to the mean sales for stores with Promo = 1.

**Alternative (Ha)**: The mean sales for stores with Promo = 1 is greater than that of stores with Promo = 0.

In [None]:
# Partition the rows by promotion flag.
promo_df = df.loc[df["Promo"].eq(1)]
nopromo_df = df.loc[df["Promo"].eq(0)]

# Welch's t-test (equal_var=False), one-sided: is mean Sales with a promotion
# greater than mean Sales without one?
# NOTE(review): rows are not filtered on 'Open', so any closed-day rows are
# part of both samples - confirm this is intended.
welch_result = ttest_ind(
    promo_df["Sales"],
    nopromo_df["Sales"],
    equal_var=False,
    alternative='greater',
)
pval = welch_result.pvalue
print(f"P-value = {pval}")

We have found that the p-value is **effectively 0**, so we reject the null hypothesis: there is very strong evidence that sales are higher with promotions than without. The plot below shows the same pattern: promotions tend to come with a higher median and a higher overall level of sales.

In [None]:
# Box plot of the Sales distribution for each value of the Promo flag,
# drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="Promo", y="Sales", data=df, ax=ax)
ax.set_title("Sales Distribution: Promo vs No Promo", fontsize=14)
ax.set_xlabel("Promo (0 = No, 1 = Yes)", fontsize=12)
ax.set_ylabel("Sales", fontsize=12)
ax.grid(True, linestyle='--', alpha=0.5)
plt.show()

## Conclusion 2: Sales amounts are strongly correlated with customer counts

**Statistical method**: Correlation analysis (Pearson)

In [None]:
# Pearson correlation between daily sales and daily customer count.
sales = df["Sales"]
customers = df["Customers"]
corr = sales.corr(customers, method="pearson")
print(f"Pearson Correlation Coefficient: {corr}")

We have found a correlation coefficient of 0.895, which is very close to 1. This indicates a strong positive linear relationship between the number of customers and total sales: as the number of customers increases, sales tend to increase as well. This is also supported by the accompanying scatterplot.

In [None]:
# Scatter of Sales against Customers with a fitted linear regression line.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    x="Customers",
    y="Sales",
    data=df,
    # Skip the bootstrapped confidence band: by default seaborn refits the
    # regression on many resamples of the whole frame, which is slow on a
    # large frame and yields a band too thin to see.
    ci=None,
    # Small translucent markers keep dense regions readable.
    scatter_kws={"s": 5, "alpha": 0.3},
    # Contrasting colour so the fitted line is visible on top of the points.
    line_kws={"color": "red"},
    ax=ax,
)
ax.set_title("Sales vs. Number of Customers", fontsize=16)
ax.set_xlabel("Number of Customers")
ax.set_ylabel("Sales")
ax.grid(True, linestyle="--", alpha=0.5)
plt.show()

## Conclusion 3: Store type has a significant impact on average sales

**Statistical method**: One-way ANOVA

**Null Hypothesis (H0)**: There is no significant difference in mean daily sales across different store types.  
**Alternative Hypothesis (Ha)**: At least one store type has mean daily sales that differ significantly from the others.

In [None]:
# Collect one Sales series per store type (missing values removed) and run a
# one-way ANOVA across those groups.
sales_type = [
    type_rows['Sales'].dropna()
    for _, type_rows in df.groupby('StoreType')
]

res = f_oneway(*sales_type)
p_value = res.pvalue
print(f"P-value: {p_value}")

We have found that the p-value is **effectively 0**. There is very strong evidence that store type has a significant effect on average sales, meaning that at least one store type differs significantly from the others in mean daily sales.

In [None]:
# Bar chart of mean Sales for each (label-encoded) store type, without error
# bars, drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='StoreType', y='Sales', data=df, errorbar=None, ax=ax)
ax.set_title("Mean Sales by Store Type", fontsize=16)
ax.set_xlabel("Store Type")
ax.set_ylabel("Mean Sales")
ax.grid(True, linestyle='--', alpha=0.5)
plt.show()

Let's run a Post-Hoc Analysis using **Tukey's HSD on Store Type**

In [None]:
# Tukey's HSD post-hoc test: which pairs of store types differ in mean Sales?
# Rows with missing Sales are dropped first so this test runs on the same
# observations as the one-way ANOVA, which applies dropna() to each group.
tukey_input = df[['Sales', 'StoreType']].dropna(subset=['Sales'])
tukey = pairwise_tukeyhsd(
    endog=tukey_input['Sales'],
    groups=tukey_input['StoreType'],
    alpha=0.05,
)
# Backward-compatible alias: the result was previously stored under the
# misspelled name `turkey`, which other cells may still reference.
turkey = tukey
print(tukey.summary())

Recall that our label encoder mapped store types a, b, c, d to 0, 1, 2, 3, respectively. The Tukey results therefore tell us that Store Type b has significantly higher sales than all other types. Store Types a and c are statistically similar in sales performance. Store Type d underperforms when compared to the rest of the store types.