In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Plot defaults shared by every figure in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Location of the source workbook, relative to this notebook's directory.
DATA_PATH = (
    Path("../..")
    / "[DATASET C] Retail Supply Chain Sales Analysis"
    / "Retail-Supply-Chain-Sales-Analysis.xlsx"
)

# Lower-case alias used by the loading cell.
data_path = DATA_PATH

## Load data

In [None]:
# Read both sheets with one call so the workbook is opened and parsed once;
# passing a list to `sheet_name` returns a dict keyed by sheet name.
sheets = pd.read_excel(
    data_path,
    sheet_name=['Retails Order Full Dataset', 'Calendar Table'],
)
retail = sheets['Retails Order Full Dataset']
calendar = sheets['Calendar Table']

# Normalize column names to snake_case: trim surrounding whitespace,
# lower-case, then turn spaces and hyphens into underscores.
# `regex=False` makes the replacements literal on every pandas version
# (the default for `regex` differs between pandas 1.x and 2.x).
for df in (retail, calendar):
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace('-', '_', regex=False)
    )

In [4]:
# Coerce every date column to pandas datetime so comparisons and
# date arithmetic behave consistently across both tables.
for frame, date_column in (
    (retail, 'order_date'),
    (retail, 'ship_date'),
    (calendar, 'date'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

## Exploring dataset

In [5]:
# Summary statistics (count, mean, min, quartiles, max, std) for the
# numeric and datetime columns of `retail`, displayed as the cell output.
retail.describe()

Unnamed: 0,row_id,order_date,ship_date,postal_code,sales,quantity,discount,profit
count,9994.0,9994,9994,9994.0,9994.0,9994.0,9994.0,9994.0
mean,4997.5,2016-04-11 07:17:44.078447104,2016-05-15 21:54:47.332399360,55190.379428,229.858001,3.789574,0.156203,28.656896
min,1.0,2014-01-02 00:00:00,2014-01-15 00:00:00,1040.0,0.444,1.0,0.0,-6599.978
25%,2499.25,2015-05-01 00:00:00,2015-06-20 00:00:00,23223.0,17.28,2.0,0.0,1.72875
50%,4997.5,2016-05-30 00:00:00,2016-07-21 00:00:00,56430.5,54.49,3.0,0.2,8.6665
75%,7495.75,2017-04-09 00:00:00,2017-06-02 00:00:00,90008.0,209.94,5.0,0.2,29.364
max,9994.0,2017-12-30 00:00:00,2018-05-01 00:00:00,99301.0,22638.48,14.0,0.8,8399.976
std,2885.163629,,,32063.69335,623.245101,2.22511,0.206452,234.260108


Nhìn chung, các cột dữ liệu số như `quantity` và `discount` không có bất thường về logic nghiệp vụ: `quantity` nằm trong khoảng 1–14 và `discount` nằm trong khoảng 0–0,8.

In [6]:
# Check for missing values: info() prints each column's non-null count and
# dtype, so a column whose count is below the total row count contains nulls.
retail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   row_id               9994 non-null   int64         
 1   order_id             9994 non-null   object        
 2   order_date           9994 non-null   datetime64[ns]
 3   ship_date            9994 non-null   datetime64[ns]
 4   ship_mode            9994 non-null   object        
 5   customer_id          9994 non-null   object        
 6   customer_name        9994 non-null   object        
 7   segment              9994 non-null   object        
 8   country              9994 non-null   object        
 9   city                 9994 non-null   object        
 10  state                9994 non-null   object        
 11  postal_code          9994 non-null   int64         
 12  region               9994 non-null   object        
 13  retail_sales_people  9994 non-nul

In [7]:
# Sanity check: an order cannot ship before it was placed.
# Build a boolean mask for the offending rows, then count them.
shipped_before_ordered = retail['ship_date'].lt(retail['order_date'])
inconsistent_dates = retail.loc[shipped_before_ordered]
print(f"Number of inconsistent date entries: {len(inconsistent_dates)}")

Number of inconsistent date entries: 0


In [9]:
# Show the normalized column names of `retail`.
retail.columns

Index(['row_id', 'order_id', 'order_date', 'ship_date', 'ship_mode',
       'customer_id', 'customer_name', 'segment', 'country', 'city', 'state',
       'postal_code', 'region', 'retail_sales_people', 'product_id',
       'category', 'sub_category', 'product_name', 'returned', 'sales',
       'quantity', 'discount', 'profit'],
      dtype='object')

In [10]:
# Print the distinct values of each categorical column so that spelling
# variants or inconsistent labels can be spotted by eye.
categorical_columns = [
    'region', 'country', 'city', 'state',
    'segment', 'category', 'sub_category', 'product_name',
]
for col, values in retail[categorical_columns].items():
    unique_values = values.unique()
    print(f"Unique values in '{col}': {unique_values}\n")
    

Unique values in 'region': ['South' 'West' 'Central' 'East']

Unique values in 'country': ['United States']

Unique values in 'city': ['Henderson' 'Los Angeles' 'Fort Lauderdale' 'Concord' 'Seattle'
 'Fort Worth' 'Madison' 'West Jordan' 'San Francisco' 'Fremont'
 'Philadelphia' 'Orem' 'Houston' 'Richardson' 'Naperville' 'Melbourne'
 'Eagan' 'Westland' 'Dover' 'New Albany' 'New York City' 'Troy' 'Chicago'
 'Gilbert' 'Springfield' 'Jackson' 'Memphis' 'Decatur' 'Durham' 'Columbia'
 'Rochester' 'Minneapolis' 'Portland' 'Saint Paul' 'Aurora' 'Charlotte'
 'Orland Park' 'Urbandale' 'Columbus' 'Bristol' 'Wilmington' 'Bloomington'
 'Phoenix' 'Roseville' 'Independence' 'Pasadena' 'Newark' 'Franklin'
 'Scottsdale' 'San Jose' 'Edmond' 'Carlsbad' 'San Antonio' 'Monroe'
 'Fairfield' 'Grand Prairie' 'Redlands' 'Hamilton' 'Westfield' 'Akron'
 'Denver' 'Dallas' 'Whittier' 'Saginaw' 'Medina' 'Dublin' 'Detroit'
 'Tampa' 'Santa Clara' 'Lakeville' 'San Diego' 'Brentwood' 'Chapel Hill'
 'Morristown' 'Cincin

Các cột dữ liệu dạng danh mục không có giá trị bị trùng lặp do khác biệt về cách viết; các tên riêng đều ở định dạng chuẩn và nhất quán.

In [None]:
# Write the cleaned tables as CSV into a `processed/` folder that sits
# beside the source workbook. The folder is derived from DATA_PATH instead
# of being re-typed, so the input and output locations cannot drift apart.
PROCESSED_DIR = DATA_PATH.parent / 'processed'
os.makedirs(PROCESSED_DIR, exist_ok=True)  # no-op if the folder already exists

# index=False: the RangeIndex carries no information worth persisting.
retail.to_csv(PROCESSED_DIR / 'retail_cleaned.csv', index=False)
calendar.to_csv(PROCESSED_DIR / 'calendar_cleaned.csv', index=False)