In [None]:
# import libraries
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns 

# display settings & code formatting
pd.options.display.max_columns = 999
%matplotlib inline
%load_ext nb_black

# project paths
# The project root is taken to be the parent of the current working directory
# (assumes the notebook is run from a direct subfolder of the project).
# os.path.join is used instead of manual string concatenation so that a
# working directory already ending in a separator does not yield a doubled one.
project_root_dir = os.path.normpath(os.path.join(os.getcwd(), os.pardir))

# "data" folder under the project root; created if it does not exist yet.
data_path = os.path.join(project_root_dir, "data")
os.makedirs(data_path, exist_ok=True)

# "images" folder under the project root; created if it does not exist yet.
image_path = os.path.join(project_root_dir, "images")
os.makedirs(image_path, exist_ok=True)

def load_data(filename, data_path=data_path):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file, e.g. "train.csv".
    data_path : str, optional
        Folder that contains the file. Defaults to the module-level
        ``data_path`` as it was when this function was defined.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(os.path.join(data_path, filename))

def save_dataframe(df, filename, file_path=data_path):
    """Write a DataFrame to a CSV file, leaving out the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to write.
    filename : str
        Name of the output CSV file.
    file_path : str, optional
        Folder to write into. Defaults to the module-level ``data_path`` as
        it was when this function was defined.
    """
    destination = os.path.join(file_path, filename)
    df.to_csv(destination, index=False)


In [None]:
# Load the five CSV files in one pass; the names on the left line up
# positionally with the file names on the right.
train, test, stores, features, sample_submission = (
    load_data(csv_name)
    for csv_name in (
        "train.csv",
        "test.csv",
        "stores.csv",
        "features.csv",
        "sampleSubmission.csv",
    )
)

In [None]:
train.head()

In [None]:
# Report the (rows, columns) shape of each loaded table.
print(f"Training Data shape: {train.shape}")
print(f"Test Data shape: {test.shape}")
print(f"Stores Data shape: {stores.shape}")
print(f"Features Data shape: {features.shape}")

## Distribution of weekly Sales

In [None]:
# Histogram of the raw weekly sales values.
fig = go.Figure(data=[go.Histogram(x=train["Weekly_Sales"])])
fig.update_layout(
    title="Weekly Sales at Walmart",
    xaxis_title="Weekly Sales",
    yaxis_title="Count",
)
fig.show()

We can see that the distribution is heavily right-skewed, which means there are many stores with low or medium weekly sales and few stores with very large sales. This might be because some stores are small and others are bigger. The location of the store also matters: in smaller towns we expect sales to be lower than in prime urban locations. The population density of the area can also be a factor. In fact, the stores CSV file contains information about the size of each store. 

Let's also supplement the histogram with a numerical summary to get a better understanding of this distribution.

In [None]:
train["Weekly_Sales"].describe()

The mean weekly sales is around $\$16000$ with a standard deviation of $\$22700$, which is large and again indicates that there is a lot of variability in weekly sales. The median also highlights this fact: it is around $\$7500$, which is less than half of the mean. The maximum weekly sales is $\$693000$, and the minimum is $-\$4988$, which might be because more items (by dollar amount) were returned than sold. 

A better way to visualize this distribution is to take the log of the weekly sales. Taking the log will make the distribution look more normal.

In [None]:
# Histogram of log10 applied to every weekly sales value as-is
# (zero and negative sales are not filtered out here).
fig = go.Figure(data=[go.Histogram(x=np.log10(train["Weekly_Sales"]))])
fig.update_layout(
    title="Log of Weekly Sales at Walmart",
    xaxis_title="Log base 10 of Weekly Sales",
    yaxis_title="Count",
)
fig.show()

In [None]:
np.log10(train["Weekly_Sales"]).describe()

You can see that we are getting a "divide by zero" warning, the mean is -inf, and there are some nan values. This is because we cannot take the log of 0 or of negative numbers. In NumPy, as in other statistical environments such as R, the log of 0 is -inf and the log of a negative number is nan. 

In [None]:
np.log10(0)

In [None]:
# log10 of a negative number is undefined; NumPy returns nan.
np.log10(-2)

So, let's fix this by only taking the log of values that are greater than or equal to 1. 

In [None]:
# Histogram of log10 weekly sales, restricted to rows with sales of at least 1.
fig = go.Figure(
    data=[
        go.Histogram(
            x=np.log10(train.loc[train["Weekly_Sales"] >= 1, "Weekly_Sales"])
        )
    ]
)
fig.update_layout(
    title="Log of Weekly Sales at Walmart",
    xaxis_title="Log base 10 of Weekly Sales",
    yaxis_title="Count",
)
fig.show()

In [None]:
# Summary statistics of log10 weekly sales for rows with sales of at least 1.
train["Weekly_Sales"].loc[lambda sales: sales >= 1].pipe(np.log10).describe()

We can see that after taking the log, the distribution looks mostly normal, though slightly negatively skewed. The mean and median are both somewhere around 4. We can take the antilog of these values to get them back on the original scale.

In [None]:
# Back-transform the mean and median of the log10 sales (rows with sales >= 1)
# instead of typing rounded exponents in by hand, so the printed figures
# always match the data that was actually loaded.
log10_sales = np.log10(train.loc[train["Weekly_Sales"] >= 1, "Weekly_Sales"])
print("Mean of weekly sales:", np.round(10 ** log10_sales.mean(), 0))
print("Median of weekly sales:", np.round(10 ** log10_sales.median(), 0))

The median is still about the same as before, but the mean has dropped a lot, from $\$16000$ to $\$5000$. Taking the log reduced the effect of extreme values. The median is now larger than the mean because of the negatively skewed distribution. So a better measure of central tendency for this weekly sales data is the median, as it is not affected by extreme values. 

## Merge the stores data

Let's merge the stores data with the training and test sets and look at the weekly sales data by store size. 

In [None]:
print(train.shape)
print(test.shape)

In [None]:
# Attach the store attributes to every sales row. validate="many_to_one"
# makes pandas raise a MergeError if a Store id appears more than once in
# `stores`, which would otherwise silently multiply rows in the result.
train = pd.merge(train, stores, how="left", on="Store", validate="many_to_one")
test = pd.merge(test, stores, how="left", on="Store", validate="many_to_one")

In [None]:
print(train.shape)
print(test.shape)

In [None]:
train.head()

In [None]:
train.isnull().sum()

## Distribution of weekly sales by store type

In [None]:
train["Type"].value_counts()

In [None]:
train["Type"].value_counts(normalize=True) * 100

51% of the data is from type A stores, 39% from type B and 10% from type C. 

In [None]:
train.groupby("Type")["Size"].mean().round(0)

We can see that type A stores are bigger on average than type B stores, and type B stores are bigger than type C stores.

In [None]:
# Median weekly sales per store type, rounded to 0 decimals, with Type kept
# as a regular column.
median_sales_type = (
    train.groupby("Type", as_index=False)["Weekly_Sales"].median().round(0)
)
median_sales_type

In [None]:
# Bar chart with one bar per store type showing its median weekly sales.
fig = go.Figure(
    data=[go.Bar(x=median_sales_type["Type"], y=median_sales_type["Weekly_Sales"])]
)
fig.update_layout(
    title="Median Weekly Sales BY Store Type",
    xaxis_title="Store Type",
    yaxis_title="Median Weekly Sales",
)
fig.show()

As we expected, the bigger stores have more sales than the smaller stores. Now, let's also make a side-by-side boxplot, which works best when we want to explore the relationship between a categorical and a numerical feature. 

In [None]:
fig = px.box(train, x="Type", y="Weekly_Sales")
fig.show()

We can see that there are lots of outliers in the data. So, let's first create a new column holding the log base 10 of weekly sales, as we might need it again later. 

In [None]:
# take log10 of weekly sales where sale >= 1 otherwise 0
# np.where evaluates both branches for every row, so the log is taken on a
# copy clipped at 1. Rows below 1 therefore never feed zero or negative values
# to np.log10 (which emitted RuntimeWarnings); the condition still sets them to 0.
train["log10_Weekly_Sales"] = np.where(
    train["Weekly_Sales"] >= 1, np.log10(train["Weekly_Sales"].clip(lower=1)), 0
)

In [None]:
fig = px.box(train, x="Type", y="log10_Weekly_Sales")
fig.show()

It is clear from the above figure that there are more outliers in type A and B stores than in type C, which means that there are a few type A and B stores that are performing far worse than the rest of the stores in their group. The variability in weekly sales is also much higher in type C stores than in type A and B stores. Let's isolate the outlier stores among the type A and B stores.

In [None]:
# Rows from type A or B stores whose log10 weekly sales are at most 2
train.loc[train["Type"].isin(["A", "B"]) & train["log10_Weekly_Sales"].le(2)]

We can dive deeper into these data to understand the reasons for this, either by combining them with other data or by talking with someone who manages these stores to get some context behind their poor performance. 

Let's also make a scatter plot to see if we can find some important trend in the data.

In [None]:
fig = px.scatter(train, x="Size", y="log10_Weekly_Sales", color="Type", trendline="ols")
fig.show()

In [None]:
fig = px.scatter(train, x="Size", y="Weekly_Sales", color="Type", trendline="ols")
fig.show()

In [None]:
train[["Weekly_Sales", "Size"]].corr()

Based on the plot and the correlation matrix, we can see that there is only a very weak correlation between the size of a store and the weekly sales it generates. We can't say that as the size of a store increases, its sales also increase or decrease; the two are essentially unrelated. 