## Retail Data Analysis

## André de Paula Galhardo

### andregalhardo@tuta.io

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries imported above.
warnings.filterwarnings("ignore")

In [None]:
# Load the features table; the path is relative to the working directory.
df_features = pd.read_csv('Features data set.csv')


In [None]:
# Load the sales table; the path is relative to the working directory.
df_sales = pd.read_csv('sales data-set.csv')

In [None]:
# Load the stores table; the path is relative to the working directory.
df_stores = pd.read_csv('stores data-set.csv')

In [None]:
# Preview the first rows of the features table.
df_features.head()

In [None]:
# Attach the feature columns to each sales row that shares the same
# Date, Store and IsHoliday values (inner join, pandas' default).
df_sales = pd.merge(df_sales, df_features, on=['Date', 'Store', 'IsHoliday'])

In [None]:
# Preview the first rows of the stores table.
df_stores.head()

In [None]:
# Attach the store-level columns to each sales row via the Store key
# (inner join, pandas' default).
df_sales = pd.merge(df_sales, df_stores, on='Store')

In [None]:
# Preview the merged sales table.
df_sales.head()

## 1 - Data Exploration

In [None]:
# Shorter working name; df and df_sales refer to the same DataFrame object
# (no copy is made here).
df = df_sales

In [None]:
# Total number of elements (rows x columns).
df.size

In [None]:
# (rows, columns) of the merged table.
df.shape

In [None]:
# Column names, non-null counts and dtypes.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

## 2 - Data Cleaning

In [None]:
# Replace missing values with 0 and keep the result. fillna returns a new
# DataFrame, so without the assignment the cleaned data would be discarded.
# From here on df no longer aliases df_sales, which stays untouched.
df = df.fillna(0)
# Show the cleaned frame as the cell output.
df

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

## 3 - Data Transformation

In [None]:
# Inspect the column dtypes before converting Date.
df.info()

In [None]:
# Parse Date into datetime values; the explicit format is day/month/year.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

In [None]:
# Re-inspect the dtypes after the Date conversion.
df.info()

## 4 - Data Analysis

In [None]:
# Pairwise correlation between the numeric columns only.
df.corr(numeric_only=True)

In [None]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Summary statistics of the merged table.
df.describe()

In [None]:
# Bar chart of weekly sales aggregated per store (one bar per store).
plt.figure(figsize=(12, 6))
sns.set_style('whitegrid')
# errorbar=None switches the error bars off; it replaces the `ci` argument,
# which seaborn deprecated in v0.12.
sns.barplot(x='Store', y='Weekly_Sales', data=df, errorbar=None)
plt.title("Weekly Sales per Store")
plt.xlabel('Store')
plt.ylabel('Weekly Sales')

### Top 10 Stores by Average Weekly Sales

In [None]:
# Ten stores with the highest average weekly sales, as a one-column frame.
store_means = df.groupby('Store')['Weekly_Sales'].mean()
top10 = store_means.nlargest(10).to_frame()
top10

### Fuel Price and Weekly Sales

In [None]:
# Summary statistics of the Fuel_Price column.
df['Fuel_Price'].describe()

In [None]:
# Scatter of weekly sales against fuel price with a fitted regression line.
plt.figure(figsize=(10, 6))
ax = sns.regplot(x=df['Fuel_Price'], y=df['Weekly_Sales'])
ax.set_title('Fuel Price and Weekly Sales')
ax.set_xlabel('Fuel Price')
ax.set_ylabel('Weekly Sales')

### Weekly Sales and Temperature

In [None]:
# Summary statistics of the Temperature column.
df['Temperature'].describe()

In [None]:
# Scatter of weekly sales against temperature with a fitted regression line.
sns.regplot(x='Temperature', y='Weekly_Sales', data=df)

### Weekly Sales and Unemployment

In [None]:
# Summary statistics of the Unemployment column.
df['Unemployment'].describe()

In [None]:
# Scatter of weekly sales against unemployment with a fitted regression line.
sns.regplot(x='Unemployment', y='Weekly_Sales', data=df)

### Weekly Sales per Date

In [None]:
# Weekly sales over time, aggregated per date by seaborn's lineplot.
plt.figure(figsize=(12, 6))
ax = sns.lineplot(x='Date', y='Weekly_Sales', data=df)
ax.set_title('Weekly Sales per Date')

In [None]:
# Ten dates with the highest average weekly sales, as a one-column frame.
date_means = df.groupby('Date')['Weekly_Sales'].mean()
top10_sales = date_means.nlargest(10).to_frame()
top10_sales

In [None]:
# Calendar month number taken from the Date column (requires Date to have
# been converted to datetime already).
df['Month'] = df['Date'].dt.month

In [None]:
# Bar chart of Weekly_Sales grouped by Month (seaborn aggregates each group,
# using the mean by default).
sns.barplot(x='Month', y='Weekly_Sales', data=df)


### Comparison of Holidays and Non-Holidays

In [None]:
# Rows whose IsHoliday flag compares equal to 1.
holiday_mask = df['IsHoliday'] == 1
df_holydays = df[holiday_mask]

In [None]:
# Average weekly sales over the holiday rows.
df_holydays['Weekly_Sales'].mean()

In [None]:
# Summary statistics of weekly sales over the holiday rows.
df_holydays['Weekly_Sales'].describe()

In [None]:
# Rows whose IsHoliday flag compares equal to 0.
non_holiday_mask = df['IsHoliday'] == 0
df_non_holidays = df[non_holiday_mask]

In [None]:
# Average weekly sales over the non-holiday rows.
df_non_holidays['Weekly_Sales'].mean()

In [None]:
# Summary statistics of weekly sales over the non-holiday rows.
df_non_holidays['Weekly_Sales'].describe()

In [None]:
# Bar chart of weekly sales aggregated per holiday date.
plt.figure(figsize=(10, 6))
sns.set_style('whitegrid')
# errorbar=None switches the error bars off; it replaces the `ci` argument,
# which seaborn deprecated in v0.12.
sns.barplot(x=df_holydays['Date'], y=df_holydays['Weekly_Sales'], errorbar=None)
# Vertical tick labels keep the date labels from overlapping.
plt.xticks(rotation=90)
plt.title("Weekly Sales per Holidays")
plt.xlabel('Holidays')
plt.ylabel('Weekly Sales')

In [None]:
# Weekly sales over time for the non-holiday rows.
plt.figure(figsize=(10, 6))
sns.set_style('whitegrid')
sns.lineplot(x=df_non_holidays['Date'], y=df_non_holidays['Weekly_Sales'])
plt.xticks(rotation=90)
plt.title("Weekly Sales non Holidays")
# The x axis carries the dates of non-holiday weeks, so label it as such.
plt.xlabel('Date')
plt.ylabel('Weekly Sales')

## Mean Weekly Sales by Store and Type

In [None]:
df_stores_total = df.groupby('Weekly_Sales')['Store'].mean()
df_stores_total = df_stores_total.value_counts()
df_stores_total = pd.DataFrame(df_stores_total)


In [None]:
df_stores_total = df_stores_total.rename(columns={'count': 'mean'})


In [None]:
df_stores_total = df_stores_total.merge(df, on='Store')
df_stores_total = df_stores_total[['Store', 'mean', 'Type']]

In [None]:
ax = sns.scatterplot(x='mean', y='Store', hue='Type', data=df_stores_total)
ax.set_xlim(1000, 10000)
plt.title('Mean of Weekly Sales per Type of Store')
plt.xlabel('Mean')

In [None]:
# Weekly sales (x) against date (y), coloured by store type; the x axis is
# limited to the 0-250000 range.
ax = sns.scatterplot(data=df, x='Weekly_Sales', y='Date', hue='Type')
ax.set_xlim(0, 250000)
ax.set_title('Weekly Sales per Type of Store')
ax.set_xlabel('Weekly Sales')


In [None]:
# Weekly sales as a function of store size.
ax = sns.lineplot(x=df['Size'], y=df['Weekly_Sales'])
ax.set_title('Weekly Sales per Size of Store')
ax.set_xlabel('Size of Store')