# Import the required libraries

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy import stats 
from termcolor import colored 
import urllib.parse 
from sqlalchemy import create_engine 
import pylab 

In [2]:
df=pd.read_csv("./data/retail_store_inventory.csv")
df

Unnamed: 0,Date,Store ID,Product ID,Category,Region,Inventory Level,Units Sold,Units Ordered,Demand Forecast,Price,Discount,Weather Condition,Holiday/Promotion,Competitor Pricing,Seasonality
0,2022-01-01,S001,P0001,Groceries,North,231,127,55,135.47,33.50,20,Rainy,0,29.69,Autumn
1,2022-01-01,S001,P0002,Toys,South,204,150,66,144.04,63.01,20,Sunny,0,66.16,Autumn
2,2022-01-01,S001,P0003,Toys,West,102,65,51,74.02,27.99,10,Sunny,1,31.32,Summer
3,2022-01-01,S001,P0004,Toys,North,469,61,164,62.18,32.72,10,Cloudy,1,34.74,Autumn
4,2022-01-01,S001,P0005,Electronics,East,166,14,135,9.26,73.64,0,Sunny,0,68.95,Summer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73095,2024-01-01,S005,P0016,Furniture,East,96,8,127,18.46,73.73,20,Snowy,0,72.45,Winter
73096,2024-01-01,S005,P0017,Toys,North,313,51,101,48.43,82.57,10,Cloudy,0,83.78,Autumn
73097,2024-01-01,S005,P0018,Clothing,West,278,36,151,39.65,11.11,10,Rainy,0,10.91,Winter
73098,2024-01-01,S005,P0019,Toys,East,374,264,21,270.52,53.14,20,Rainy,0,55.80,Spring


# Data Prepocessing 

In [3]:
df.shape 

(73100, 15)

In [4]:
df.dtypes

Date                   object
Store ID               object
Product ID             object
Category               object
Region                 object
Inventory Level         int64
Units Sold              int64
Units Ordered           int64
Demand Forecast       float64
Price                 float64
Discount                int64
Weather Condition      object
Holiday/Promotion       int64
Competitor Pricing    float64
Seasonality            object
dtype: object

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73100 entries, 0 to 73099
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Date                73100 non-null  object 
 1   Store ID            73100 non-null  object 
 2   Product ID          73100 non-null  object 
 3   Category            73100 non-null  object 
 4   Region              73100 non-null  object 
 5   Inventory Level     73100 non-null  int64  
 6   Units Sold          73100 non-null  int64  
 7   Units Ordered       73100 non-null  int64  
 8   Demand Forecast     73100 non-null  float64
 9   Price               73100 non-null  float64
 10  Discount            73100 non-null  int64  
 11  Weather Condition   73100 non-null  object 
 12  Holiday/Promotion   73100 non-null  int64  
 13  Competitor Pricing  73100 non-null  float64
 14  Seasonality         73100 non-null  object 
dtypes: float64(3), int64(5), object(7)
memory usage: 8.4+

#### Descriptive Statistics of Numerical Columns 

In [6]:
print(colored("Descriptive Statistics of Numerical Columns :\n\n",'blue',attrs=['bold']))
df.describe()

[1m[34mDescriptive Statistics of Numerical Columns :

[0m


Unnamed: 0,Inventory Level,Units Sold,Units Ordered,Demand Forecast,Price,Discount,Holiday/Promotion,Competitor Pricing
count,73100.0,73100.0,73100.0,73100.0,73100.0,73100.0,73100.0,73100.0
mean,274.469877,136.46487,110.004473,141.49472,55.135108,10.009508,0.497305,55.146077
std,129.949514,108.919406,52.277448,109.254076,26.021945,7.083746,0.499996,26.191408
min,50.0,0.0,20.0,-9.99,10.0,0.0,0.0,5.03
25%,162.0,49.0,65.0,53.67,32.65,5.0,0.0,32.68
50%,273.0,107.0,110.0,113.015,55.05,10.0,0.0,55.01
75%,387.0,203.0,155.0,208.0525,77.86,15.0,1.0,77.82
max,500.0,499.0,200.0,518.55,100.0,20.0,1.0,104.94


#### Descriptive Statistics of Categorical Columns: 

In [7]:
print(colored("Descriptive Statistics of Categorical Columns :\n\n",'blue',attrs=['bold']),df.describe(include=object).T)

[1m[34mDescriptive Statistics of Categorical Columns :

[0m                    count unique         top   freq
Date               73100    731  2022-01-01    100
Store ID           73100      5        S001  14620
Product ID         73100     20       P0001   3655
Category           73100      5   Furniture  14699
Region             73100      4        East  18349
Weather Condition  73100      4       Sunny  18290
Seasonality        73100      4      Spring  18317


#### Checking the Number of Duplicates in the Dataset

In [8]:
duplicate_values=df.duplicated(subset=None,keep='first').sum()
print(' Number of Duplicate values: ',duplicate_values)

 Number of Duplicate values:  0


#### Checking the Number and Percentage of Missing Values in the Dataset

In [9]:
missing=df.isna().sum().sort_values(ascending=False)
print(colored("Number of Missing Values\n\n",'blue',attrs=['bold']),missing)

[1m[34mNumber of Missing Values

[0m Date                  0
Store ID              0
Product ID            0
Category              0
Region                0
Inventory Level       0
Units Sold            0
Units Ordered         0
Demand Forecast       0
Price                 0
Discount              0
Weather Condition     0
Holiday/Promotion     0
Competitor Pricing    0
Seasonality           0
dtype: int64


In [10]:
print(colored('Number of Unique Values:\n\n','blue',attrs=['bold']),df.nunique())

[1m[34mNumber of Unique Values:

[0m Date                    731
Store ID                  5
Product ID               20
Category                  5
Region                    4
Inventory Level         451
Units Sold              498
Units Ordered           181
Demand Forecast       31608
Price                  8999
Discount                  5
Weather Condition         4
Holiday/Promotion         2
Competitor Pricing     9751
Seasonality               4
dtype: int64


In [12]:
df['Date'] = pd.to_datetime(df['Date'])
df['Month'] = df['Date'].dt.strftime('%b')
df 

Unnamed: 0,Date,Store ID,Product ID,Category,Region,Inventory Level,Units Sold,Units Ordered,Demand Forecast,Price,Discount,Weather Condition,Holiday/Promotion,Competitor Pricing,Seasonality,Month
0,2022-01-01,S001,P0001,Groceries,North,231,127,55,135.47,33.50,20,Rainy,0,29.69,Autumn,Jan
1,2022-01-01,S001,P0002,Toys,South,204,150,66,144.04,63.01,20,Sunny,0,66.16,Autumn,Jan
2,2022-01-01,S001,P0003,Toys,West,102,65,51,74.02,27.99,10,Sunny,1,31.32,Summer,Jan
3,2022-01-01,S001,P0004,Toys,North,469,61,164,62.18,32.72,10,Cloudy,1,34.74,Autumn,Jan
4,2022-01-01,S001,P0005,Electronics,East,166,14,135,9.26,73.64,0,Sunny,0,68.95,Summer,Jan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73095,2024-01-01,S005,P0016,Furniture,East,96,8,127,18.46,73.73,20,Snowy,0,72.45,Winter,Jan
73096,2024-01-01,S005,P0017,Toys,North,313,51,101,48.43,82.57,10,Cloudy,0,83.78,Autumn,Jan
73097,2024-01-01,S005,P0018,Clothing,West,278,36,151,39.65,11.11,10,Rainy,0,10.91,Winter,Jan
73098,2024-01-01,S005,P0019,Toys,East,374,264,21,270.52,53.14,20,Rainy,0,55.80,Spring,Jan


## Exploratory Data Analysis

### First Moment Business Decision or Measures of Central Tendency.
The first moment in business decision-making, often referred to as the mean or expected value, provides a critical measure of central tendency. It offers valuable insights into the anticipated outcomes, allowing organizations to assess risks, allocate resources, and optimize strategies based on a reliable benchmark. By focusing on the first moment, companies can make more informed choices to enhance efficiency, profitability, and overall performance.

- Provides mean, median, and mode dataset statistics.
- Mean represents the data's average, sensitive to outliers.
- Median signifies the dataset's central value.
- Mode identifies the most frequently occurring value.
- If mean, median, and mode align, the data exhibits a normal distribution; otherwise, it deviates from normality.

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73100 entries, 0 to 73099
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                73100 non-null  datetime64[ns]
 1   Store ID            73100 non-null  object        
 2   Product ID          73100 non-null  object        
 3   Category            73100 non-null  object        
 4   Region              73100 non-null  object        
 5   Inventory Level     73100 non-null  int64         
 6   Units Sold          73100 non-null  int64         
 7   Units Ordered       73100 non-null  int64         
 8   Demand Forecast     73100 non-null  float64       
 9   Price               73100 non-null  float64       
 10  Discount            73100 non-null  int64         
 11  Weather Condition   73100 non-null  object        
 12  Holiday/Promotion   73100 non-null  int64         
 13  Competitor Pricing  73100 non-null  float64   

In [14]:
df[["Inventory Level", "Units Sold", "Units Ordered", "Demand Forecast", "Price", "Discount", "Competitor Pricing"]].mean()

Inventory Level       274.469877
Units Sold            136.464870
Units Ordered         110.004473
Demand Forecast       141.494720
Price                  55.135108
Discount               10.009508
Competitor Pricing     55.146077
dtype: float64

In [16]:
df[["Inventory Level", "Units Sold", "Units Ordered", "Demand Forecast", "Price", "Discount", "Competitor Pricing"]].median()

Inventory Level       273.000
Units Sold            107.000
Units Ordered         110.000
Demand Forecast       113.015
Price                  55.050
Discount               10.000
Competitor Pricing     55.010
dtype: float64

In [17]:
df.mode()

Unnamed: 0,Date,Store ID,Product ID,Category,Region,Inventory Level,Units Sold,Units Ordered,Demand Forecast,Price,Discount,Weather Condition,Holiday/Promotion,Competitor Pricing,Seasonality,Month
0,2022-01-01,S001,P0001,Furniture,East,168.0,40.0,56.0,21.84,31.2,20.0,Sunny,0.0,37.00,Spring,Jan
1,2022-01-02,S002,P0002,,,,,,46.06,,,,,46.39,,
2,2022-01-03,S003,P0003,,,,,,,,,,,,,
3,2022-01-04,S004,P0004,,,,,,,,,,,,,
4,2022-01-05,S005,P0005,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,2023-12-28,,,,,,,,,,,,,,,
727,2023-12-29,,,,,,,,,,,,,,,
728,2023-12-30,,,,,,,,,,,,,,,
729,2023-12-31,,,,,,,,,,,,,,,


### Second Moment Business Decision or Measures of Dispersion
The Second Moment in business decision-making refers to assessing the variability or risk associated with a particular choice. It involves understanding the spread or dispersion of potential outcomes. Analyzing the second moment helps businesses make informed decisions by evaluating the range of possible results, which is crucial for risk management and resource allocation.

- Encompasses Variance, Standard Deviation, and Range.
- Offers insights into data dispersion within the dataset.
- Variance measures the mean of squared deviations from the mean.
- Standard Deviation is the square root of the Variance.
- Range quantifies the gap between the dataset's maximum and minimum values.

In [20]:
df[["Inventory Level", "Units Sold", "Units Ordered", "Demand Forecast", "Price", "Discount", "Competitor Pricing"]].var()

Inventory Level       16886.876218
Units Sold            11863.437048
Units Ordered          2732.931607
Demand Forecast       11936.453225
Price                   677.141602
Discount                 50.179461
Competitor Pricing      685.989846
dtype: float64