In [1]:
import pandas as pd

# Read the sales export; pandas is asked to parse 'sale_date' as dates while loading.
df = pd.read_csv(
    "sales/expert_sales_data.csv",
    parse_dates=['sale_date'],
)

# Echo (rows, columns) so a short or malformed load is visible straight away.
print("✅ CSV Loaded. Shape:", df.shape)

# ---------------------------
# 1. Missing values
# ---------------------------
# Per-column count of null cells across the whole frame.
missing = df.isnull().sum()
print("\n--- Missing Values per Column ---")
if missing.any():
    # List only the columns that actually contain nulls.
    print(missing.loc[missing > 0])
else:
    print("No missing values detected.")

# Null counts for the columns singled out as important; printed even when zero.
important_cols = ['styles', 'supplier_contact', 'price_after_offer']
print("\n--- Important Columns Missing ---")
print(df.loc[:, important_cols].isnull().sum())

# ---------------------------
# 2. Duplicates
# ---------------------------
# Count repeated identifiers. With keep="first" the first occurrence of a value
# is not marked, so each count is the number of extra rows per repeated id.
dup_sale_id, dup_txn_id = (
    df[id_col].duplicated(keep="first").sum()
    for id_col in ('sale_id', 'transaction_id')
)
print("\n--- Duplicates ---")
print(f"Duplicated sale_id: {dup_sale_id}")
print(f"Duplicated transaction_id: {dup_txn_id}")

# ---------------------------
# 3. Numeric column validation
# ---------------------------
# A value counts as invalid when it is missing or cannot be interpreted as a number.
# pd.to_numeric(errors="coerce") turns unparseable entries into NaN, so a single
# isna() test covers both cases and runs vectorised.
#
# Why not an element-wise isinstance(x, (int, float)) test: when a column holds
# even one unparseable entry, read_csv typically loads the whole column as text.
# Every row then fails the isinstance test, the count equals the frame length,
# and the preview shows ordinary rows instead of the offending ones.
numeric_cols = ['unit_price', 'total_price', 'sold_quantity', 'discount_value', 'price_after_offer']
for col in numeric_cols:
    coerced = pd.to_numeric(df[col], errors="coerce")
    # Index the original frame so the preview shows the raw (uncoerced) values.
    invalid = df[coerced.isna()]
    print(f"\nColumn '{col}' - Invalid values: {len(invalid)}")
    if len(invalid) > 0:
        # Preview a few offending rows with enough context to locate them.
        print(invalid[[col,'product_name','sale_date']].head(5))

# ---------------------------
# Optional: basic stats for numeric columns
# ---------------------------
# describe() summary for the columns listed in numeric_cols.
print("\n--- Numeric Columns Stats ---")
numeric_view = df.loc[:, numeric_cols]
print(numeric_view.describe())


✅ CSV Loaded. Shape: (131044, 32)

--- Missing Values per Column ---
holiday_name    87139
dtype: int64

--- Important Columns Missing ---
styles               0
supplier_contact     0
price_after_offer    0
dtype: int64

--- Duplicates ---
Duplicated sale_id: 0
Duplicated transaction_id: 0

Column 'unit_price' - Invalid values: 0

Column 'total_price' - Invalid values: 0

Column 'sold_quantity' - Invalid values: 0

Column 'discount_value' - Invalid values: 0

Column 'price_after_offer' - Invalid values: 0

--- Numeric Columns Stats ---
          unit_price    total_price  sold_quantity  discount_value  \
count  131044.000000  131044.000000  131044.000000        131044.0   
mean     2124.098426    2464.377743       1.160412             0.0   
std      1305.622596    1841.429284       0.418590             0.0   
min       267.750000     267.750000       1.000000             0.0   
25%      1122.000000    1236.000000       1.000000             0.0   
50%      1723.300000    1873.400000  