In [None]:
pip install plotly

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

# Data Cleaning

In [None]:
# Read the three input tables (train, test and store) from the local data/ folder.
csv_paths = ('data/train.csv', 'data/test.csv', 'data/store.csv')
df_train, df_test, df_store = (pd.read_csv(path) for path in csv_paths)

In [None]:
# Preview the first rows of the raw training table.
df_train.head()

In [None]:
# Merge df_store and df_train
# Joining on the shared 'Store' column name keeps a single key column, so the
# 'key_0' / 'Store_x' / 'Store_y' artefacts produced by array-valued join keys never
# appear and no drop/rename clean-up is needed.
# validate='many_to_one' raises if df_store holds a duplicated Store id, which would
# otherwise silently multiply training rows.
df = df_train.merge(df_store, how='left', on='Store', validate='many_to_one')
df.shape

In [None]:
# Preview the merged train + store table.
df.head()

In [None]:
# Normalise every column label to lower case.
cols = list(map(str.lower, df.columns))
df.columns = cols

In [None]:
# Show dtypes and non-null counts after the column rename.
df.info()

In [None]:
# Parse the date column once, then derive the calendar features from the parsed series.
parsed_dates = pd.to_datetime(df['date'])
df = df.assign(
    timestamp=parsed_dates,
    year=parsed_dates.dt.year,
    month=parsed_dates.dt.month,
    day=parsed_dates.dt.day,
    dayofyear=parsed_dates.dt.dayofyear,
)

In [None]:
# Check the newly added timestamp / year / month / day / dayofyear columns.
df.head()

In [None]:
# List the distinct raw values of the stateholiday column.
df['stateholiday'].unique()

In [None]:
# Frequency of each stateholiday value before clean-up.
df.stateholiday.value_counts()

In [None]:
# Map the integer 0 to the string '0' so the column uses a single label for that value.
# The result is assigned back: replace(..., inplace=True) on the df['stateholiday']
# selection is chained assignment, which does not update df under pandas copy-on-write
# (and the warning about it is suppressed at the top of this notebook).
df['stateholiday'] = df['stateholiday'].replace({0: '0'})

In [None]:
# Frequency of each stateholiday value after the 0 -> '0' replacement.
df.stateholiday.value_counts()

In [None]:
# Summary statistics, one row per column, rounded to two decimals.
df.describe().T.round(2)

### Handling NaNs

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Store rows that have no CompetitionDistance value.
df_store.loc[df_store['CompetitionDistance'].isna()]

In [None]:
# fill NaN with a median value
# The result is assigned back: fillna(..., inplace=True) on the df['competitiondistance']
# selection is chained assignment, which leaves df untouched under pandas copy-on-write.
df['competitiondistance'] = df['competitiondistance'].fillna(df['competitiondistance'].median())
df['competitiondistance'].isnull().sum()

In [None]:
# Stores without a CompetitionOpenSinceYear value ...
tmp = df_store.loc[df_store['CompetitionOpenSinceYear'].isna()]
# ... and the shape of the subset whose CompetitionDistance is not 0.
tmp.loc[tmp['CompetitionDistance'].ne(0)].shape

In [None]:
# Rows without a promo2sinceweek value ...
tmp = df.loc[df['promo2sinceweek'].isna()]
# ... and the shape of the subset where promo2 is not 0.
tmp.loc[tmp['promo2'].ne(0)].shape

In [None]:
# replace NA's by 0 (applies to every column that still has missing values)
df = df.fillna(value=0)

In [None]:
# Confirm that no missing values remain in df.
df.isnull().sum()

In [None]:
# Re-check dtypes and non-null counts after filling the gaps.
df.info()

In [None]:
# Count missing values per column of the test table.
df_test.isnull().sum()

In [None]:
# Test rows whose Open flag is missing.
df_test.loc[df_test["Open"].isna()]

In [None]:
# Fill the missing Open flags with 1.
# Only the Open column is filled: a frame-wide fillna(1) would also write 1 into any
# other column that happens to contain a gap. The null count in the next cell shows
# whether anything else is still missing.
df_test["Open"] = df_test["Open"].fillna(1)

In [None]:
# Verify the test table after filling the missing values.
df_test.isnull().sum()

### Handling Data Types

In [None]:
# Store the Open flag of the test table as an integer.
df_test = df_test.astype({"Open": int})

In [None]:
# Check the dtypes of the test table after the cast.
df_test.info()

### Handling Categorical Data

In [None]:
# One encoder instance, re-fitted for each column it is applied to below;
# after the last fit_transform it only holds the classes of that last column.
le = LabelEncoder()

In [None]:
# One-hot encode assortment; drop_first=True omits the first category, so one
# indicator column per remaining category is produced, named after the category value.
ass_typ = pd.get_dummies(df["assortment"], drop_first=True)
# Append the indicator columns to df. NOTE: re-running this cell appends them again.
df = pd.concat([df,ass_typ], axis=1)


In [None]:
# Integer-encode storetype into a new storetype_cat column.
df = df.assign(storetype_cat=le.fit_transform(df["storetype"]))

In [None]:
# Integer-encode stateholiday into a new stateholiday_cat column (re-fits the shared encoder).
df = df.assign(stateholiday_cat=le.fit_transform(df["stateholiday"]))

In [None]:
# Give the 'b' assortment indicator column a descriptive name.
df = df.rename({"b": "ass_extra"}, axis="columns")

In [None]:
# Give the 'c' assortment indicator column a descriptive name.
df = df.rename({"c": "ass_extended"}, axis="columns")

In [None]:
# Distinct integer codes produced for stateholiday.
df["stateholiday_cat"].unique()

In [None]:
# Check dtypes and columns after adding the encoded features.
df.info()

In [None]:
# Frequencies of the original stateholiday labels ...
df.stateholiday.value_counts()

In [None]:
# ... and of their integer codes, for comparison with the label frequencies.
df.stateholiday_cat.value_counts()

# EDA

## Question: Are the Promos effective?

In [None]:
# Summary statistics of sales for each store type.
df.groupby('storetype')['sales'].describe()

In [None]:
# Total customers and sales per store type.
# Several columns must be selected with a list: the bare ['customers', 'sales'] form is
# a tuple key, which pandas deprecated and rejects from version 2.0 onwards.
df.groupby('storetype')[['customers', 'sales']].sum()

In [None]:
# sales trends
# sns.factorplot was renamed to catplot and has since been removed from seaborn.
# catplot defaults to kind="strip", so kind="point" is passed explicitly to keep the
# point plot that factorplot drew by default.
sns.catplot(data = df, x = 'month', y = "sales",
            kind = 'point',
            col = 'storetype',
            palette = 'plasma',
            hue = 'storetype',
            row = 'promo',
            )

## Finding

Storetype B has the highest sales numbers, with the largest variance. All storetypes show increased sales numbers towards Christmas.

Stores that have run a promo show higher sales. However, storetypes a, c and d show a dip towards Easter if they have run a promo, which is not the case for stores without a promo.

In [None]:
# customers trends
# sns.factorplot was renamed to catplot and has since been removed from seaborn.
# catplot defaults to kind="strip", so kind="point" is passed explicitly to keep the
# point plot that factorplot drew by default.
sns.catplot(data = df, x = 'month', y = "customers",
            kind = 'point',
            col = 'storetype',
            palette = 'plasma',
            hue = 'storetype',
            row = 'promo',
            )

## Finding

Storetype B has the highest number of customers, with the largest variance. All storetypes show an increase in customers towards Christmas. This trend is stronger if they have run a promo.

The same dip towards Easter for storetypes a, c and d can also be seen in the customer numbers.

In [None]:
# sale per customer trends
# Rows with zero customers are treated as missing so the ratio becomes NaN instead of
# inf, which would otherwise distort the plotted means.
df['salepercustomer'] = df['sales'] / df['customers'].replace(0, np.nan)
# sns.factorplot was renamed to catplot and has since been removed from seaborn.
# catplot defaults to kind="strip", so kind="point" is passed explicitly to keep the
# point plot that factorplot drew by default.
sns.catplot(data = df, x = 'month', y = "salepercustomer",
            kind = 'point',
            col = 'storetype',
            palette = 'plasma',
            hue = 'storetype',
            row = 'promo',
            )

## Finding

Sales per customer:
- Storetype b seems to be where customers only buy small items in low numbers (possibly a train-station location?).
- Storetype d customers buy the largest quantity.
- Storetypes a and c are very similar.

In [None]:
# weekday trends (customers)
# sns.factorplot was renamed to catplot and has since been removed from seaborn.
# catplot defaults to kind="strip", so kind="point" is passed explicitly to keep the
# point plot that factorplot drew by default.
sns.catplot(data = df, x = 'dayofweek', y = "customers",
            kind = 'point',
            col = 'storetype',
            palette = 'plasma',
            hue = 'storetype',
            row = 'promo',
            )

## Finding

Promos are run only during the work week; there is no promo on Saturday or Sunday.

- Storetype b is also open on Sundays, which supports the train-station hypothesis.
- Storetype a has a lower number of customers on Saturdays, while storetypes c and d have an increased number.

In [None]:
# weekday trends (sales)
# sns.factorplot was renamed to catplot and has since been removed from seaborn.
# catplot defaults to kind="strip", so kind="point" is passed explicitly to keep the
# point plot that factorplot drew by default.
sns.catplot(data = df, x = 'dayofweek', y = "sales",
            kind = 'point',
            col = 'storetype',
            palette = 'plasma',
            hue = 'storetype',
            row = 'promo',
            )

Sales numbers and customer numbers show similar trends.
The highest sales and customer numbers occur on Mondays if a promo was run.

## Conclusion: Are the promos effective?
- Promos are run only during the work week; there is no promo on Saturday or Sunday.
- Storetype B has the highest number of customers, with the largest variance.
- Storetype B has the highest sales numbers, with the largest variance.
- All storetypes show increased sales numbers towards Christmas.
- Stores that have run a promo show higher sales. However, storetypes a, c and d show a dip towards Easter if they have run a promo, which is not the case for stores without a promo.

In [None]:
## NOTE: date, (assortment, storetype and stateholiday, assortment) still have to be removed. The ones in parentheses have already been replaced by categorical variables.

In [None]:
# Monthly sales, one row per store type and one column per weekday.
# sns.factorplot was renamed to catplot and has since been removed from seaborn.
# catplot defaults to kind="strip", so kind="point" is passed explicitly to keep the
# point plot that factorplot drew by default.
sns.catplot(data = df, x = 'month', y = "sales",
            kind = 'point',
            col = 'dayofweek',
            palette = 'plasma',
            hue = 'storetype',
            row = 'storetype',
            )

This shows the sales per storetype across all months for each weekday.

In [None]:
# stores which are opened on Sundays
open_on_sunday = df['open'].eq(1) & df['dayofweek'].eq(7)
df.loc[open_on_sunday, 'store'].unique()

These are the stores that are open on Sundays.

In [None]:
# Average the two competition columns per calendar day before plotting.
# Grouping by df.index (one unique label per row) puts every row in its own group, so
# nothing is averaged, and .mean() over the whole frame fails on the string columns in
# pandas >= 2.0. Grouping the two needed columns by the parsed timestamp gives the
# per-date average the titles describe, on a real datetime axis.
daily_competition = (
    df.groupby('timestamp')[['competitiondistance', 'competitionopensincemonth']]
      .mean()
      .rename_axis('date')
      .reset_index()
)

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(18,8))
sns.lineplot(x = "date", y = "competitiondistance", data=daily_competition, ax=axes[0])
sns.lineplot(x = "date", y = "competitionopensincemonth", data=daily_competition, ax=axes[1])

axes[0].set_title("Date vs Competiton Distance (on average)")
axes[1].set_title("Date vs Competiton Open Since Month (on average)")
plt.show()

In [None]:
# Per-date averages of all numeric columns, indexed by date.
# numeric_only=True is required because the frame also holds string columns, on which
# mean() raises in pandas >= 2.0. Grouping directly by the parsed timestamp avoids
# copying the full frame and yields a real datetime index.
temp_df = df.groupby('timestamp').mean(numeric_only=True).rename_axis('date')

# Expose the date index as a regular column for plotting.
daily_means = temp_df.reset_index()

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18,8))
sns.lineplot(x = "date", y = "competitiondistance" , data=daily_means, ax=axes[0])
# Both panels use the averaged frame; the second one previously plotted the raw df,
# which contradicts its "(on average)" title.
sns.lineplot(x = "date", y = "competitionopensincemonth"  , data=daily_means, ax=axes[1])

axes[0].set_title("Date vs Competiton Distance (on average)")
axes[1].set_title("Date vs Competiton Open Since Month (on average)")
plt.show()



In [None]:
# Summary statistics of the store table, one row per column.
df_store.describe().T

In [None]:
# Two stacked panels: competition distance (top) and competition opening month (bottom) per store id.
fig, axes = plt.subplots(2, 1, figsize=(18,8))
top_ax, bottom_ax = axes
sns.lineplot(data=df_store, x="Store", y="CompetitionDistance", ax=top_ax)
sns.lineplot(data=df_store, x="Store", y="CompetitionOpenSinceMonth", ax=bottom_ax)

In [None]:
# Switch seaborn to its "dark" style (a global setting, so later plots use it too),
# then draw the distribution of CompetitionDistance across stores.
sns.set_style("dark")
sns.histplot(data=df_store, x="CompetitionDistance")


In [None]:
# Number of stores per store type, split by assortment.
assortment_ax = sns.countplot(
    data=df_store,
    x="StoreType",
    hue="Assortment",
    order=["a","b","c","d"],
    palette="rocket_r",
)
assortment_ax.set_title("Number of Different Assortments per Store Type")

In [None]:
# List the columns currently available in df.
df.columns

In [None]:
# Combined key: assortment value directly followed by the store type value.
df = df.assign(assperstoretype=df['assortment'].add(df['storetype']))

In [None]:
# Mean sales for every assortment/store-type combination, as a one-column frame named 'mean'.
temp = df.groupby('assperstoretype')['sales'].mean().to_frame(name='mean')
temp

In [None]:
#fig2 = px.scatter(df, x='store', y='sales', color='assperstoretype')
#fig2.show()

In [None]:
# NOTE(review): the 'extraperstoretype' column is not created anywhere in the visible
# notebook; it has to be defined before this plot can be re-enabled.
#fig3 = px.scatter(df, x='store', y='sales', color='extraperstoretype')
#fig3.show()

In [None]:
# Combined key: "<ass_extended equals 1>_<storetype>", e.g. "True_a".
df = df.assign(
    extendedperstoretype=df['ass_extended'].eq(1).astype(str).add('_').add(df['storetype'])
)

In [None]:
#fig4 = px.scatter(df, x='store', y='sales', color='extendedperstoretype')
#fig4.show()

In [None]:
#fig2.write_html("images/sales_store.html")
#fig3.write_html("images/sales_extraperstoretype.html")
#fig4.write_html("images/sales_extendedperstoretype.html")