In [4]:
import pandas as pd
import numpy as np

# Load the transformed dataset. The path is relative to the current working
# directory, so run this from the directory that contains "transformed/".
TRANSFORMED_CSV = "transformed/transformed_full.csv"
df_full = pd.read_csv(TRANSFORMED_CSV)

# Report banner; the trailing "\n" leaves one blank line before the sections.
print("========== DATA QUALITY CHECKS ==========\n")

# ---------- ORDER_ID ----------
print("🆔 ORDER_ID")
order_ids = df_full['order_id']
# Each check is evaluated lazily, right before its line is printed, so the
# report lines come out in exactly the order listed here.
order_id_checks = (
    ("Dtype", lambda col: col.dtype),
    ("Missing", lambda col: col.isna().sum()),
    ("Duplicates", lambda col: col.duplicated().sum()),
    ("Unique", lambda col: col.nunique()),
    ("Monotonic Increasing", lambda col: col.is_monotonic_increasing),
)
for label, check in order_id_checks:
    print(f"- {label}:", check(order_ids))
print()

# ---------- CUSTOMER_NAME ----------
print("👤 CUSTOMER_NAME")
customer_names = df_full['customer_name']
# Work on a string view of the column so the `.str` checks run regardless of
# how the cells are stored; missing cells are reported separately below.
customer_names_text = customer_names.astype(str)
# True where a value starts or ends with a whitespace character.
customer_edge_space = customer_names_text.str.contains(r'^\s|\s$', regex=True)
# True where nothing is left once surrounding whitespace is removed.
customer_blank = customer_names_text.str.strip() == ''

print("- Dtype:", customer_names.dtype)
print("- Missing:", customer_names.isna().sum())
print("- Unique:", customer_names.nunique(dropna=True))
print("- Leading/Trailing spaces:", customer_edge_space.sum())
print("- Blank values (after strip):", customer_blank.sum())
print()

# ---------- PRODUCT ----------
print("📦 PRODUCT")
products = df_full['product']
# Work on a string view of the column so the `.str` checks run regardless of
# how the cells are stored; missing cells are reported separately below.
products_text = products.astype(str)
# True where a value starts or ends with a whitespace character.
product_edge_space = products_text.str.contains(r'^\s|\s$', regex=True)
# True where nothing is left once surrounding whitespace is removed.
product_blank = products_text.str.strip() == ''

print("- Dtype:", products.dtype)
print("- Missing:", products.isna().sum())
print("- Unique:", products.nunique(dropna=True))
print("- Leading/Trailing spaces:", product_edge_space.sum())
print("- Blank values (after strip):", product_blank.sum())
print()

# ---------- QUANTITY ----------
print("🔢 QUANTITY")
quantities = df_full['quantity']
# NOTE(review): unlike unit_price, this column is not coerced to numeric
# first, so the comparison below relies on it already being numeric.
negative_quantity = quantities.lt(0)

print("- Dtype:", quantities.dtype)
print("- Missing:", quantities.isna().sum())
print("- Negative values:", negative_quantity.sum())
# The "\n" puts the describe() table on its own lines under the label.
print("- Descriptive Stats:\n", quantities.describe())
print()

# ---------- UNIT_PRICE ----------
print("💰 UNIT_PRICE")
# Coerce the column in place: cells that cannot be parsed as numbers become
# NaN, so the "Missing" count below includes them as well.
df_full['unit_price'] = pd.to_numeric(df_full['unit_price'], errors='coerce')
unit_prices = df_full['unit_price']

print("- Dtype:", unit_prices.dtype)
print("- Missing:", unit_prices.isna().sum())
print("- Negative values:", unit_prices.lt(0).sum())
print("- Zero values:", unit_prices.eq(0).sum())
# The "\n" puts the describe() table on its own lines under the label.
print("- Descriptive Stats:\n", unit_prices.describe())
print()

# ---------- ORDER_DATE ----------
print("🗓 ORDER_DATE")
# Latest calendar year accepted as plausible. The printed label is built from
# this same value so the label and the comparison cannot drift apart.
# NOTE(review): this is a fixed year ceiling, not a comparison with today's
# date; update it (or switch to a "now"-based check) as time passes.
MAX_PLAUSIBLE_YEAR = 2026

# Parse in place: values that cannot be parsed as dates become NaT, so the
# "Missing (after parse)" count below includes them.
df_full['order_date'] = pd.to_datetime(df_full['order_date'], errors='coerce')
order_dates = df_full['order_date']
beyond_ceiling = order_dates.dt.year > MAX_PLAUSIBLE_YEAR

print("- Dtype:", order_dates.dtype)
print("- Missing (after parse):", order_dates.isna().sum())
print(f"- Future Dates (>{MAX_PLAUSIBLE_YEAR}):", beyond_ceiling.sum())
print("- Min:", order_dates.min())
print("- Max:", order_dates.max())
print()

# ---------- REGION ----------
print("🌍 REGION")
regions = df_full['region']
# Work on a string view of the column so the `.str` checks run regardless of
# how the cells are stored; missing cells are reported separately below.
regions_text = regions.astype(str)
# True where a value starts or ends with a whitespace character.
region_edge_space = regions_text.str.contains(r'^\s|\s$', regex=True)
# True where nothing is left once surrounding whitespace is removed.
region_blank = regions_text.str.strip() == ''

print("- Dtype:", regions.dtype)
print("- Missing:", regions.isna().sum())
print("- Unique:", regions.nunique(dropna=True))
print("- Leading/Trailing spaces:", region_edge_space.sum())
print("- Blank values (after strip):", region_blank.sum())
print()



🆔 ORDER_ID
- Dtype: int64
- Missing: 0
- Duplicates: 0
- Unique: 100
- Monotonic Increasing: False

👤 CUSTOMER_NAME
- Dtype: object
- Missing: 0
- Unique: 7
- Leading/Trailing spaces: 0
- Blank values (after strip): 0

📦 PRODUCT
- Dtype: object
- Missing: 0
- Unique: 4
- Leading/Trailing spaces: 0
- Blank values (after strip): 0

🔢 QUANTITY
- Dtype: float64
- Missing: 0
- Negative values: 0
- Descriptive Stats:
 count    100.00000
mean       1.97000
std        0.70288
min        1.00000
25%        1.00000
50%        2.00000
75%        2.00000
max        3.00000
Name: quantity, dtype: float64

💰 UNIT_PRICE
- Dtype: float64
- Missing: 0
- Negative values: 0
- Zero values: 0
- Descriptive Stats:
 count    100.000000
mean     530.000000
std      211.057941
min      250.000000
25%      250.000000
50%      500.000000
75%      750.000000
max      750.000000
Name: unit_price, dtype: float64

🗓 ORDER_DATE
- Dtype: datetime64[ns]
- Missing (after parse): 0
- Future Dates (>2026): 0
- Min: 2024-