# 02_Data_Cleaning.ipynb

## 1. Identify & quantify missing values  
## 2. Define imputation strategies  
## 3. Detect & handle outliers


In [4]:
import pandas as pd

# Read the raw transactions CSV (path is relative to the notebook's working directory)
df = pd.read_csv("../data/raw/transactions.csv")

# 1️⃣ Tally null cells column by column, then report the per-column totals
missing_counts = df.isna().sum()
print("Missing values per column:\n", missing_counts)

# 2️⃣ Keep only the numeric-dtype columns; the IQR rule below is applied to these
num_df = df.select_dtypes(include="number")

# 3️⃣ Per-column quartiles and the interquartile range between them
Q1 = num_df.quantile(0.25)
Q3 = num_df.quantile(0.75)
IQR = Q3 - Q1

# Fences sit 1.5 * IQR below the first quartile and above the third quartile
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A value is flagged when it falls strictly outside its column's fences;
# summing the boolean mask gives the number of flagged values per column
below_lower = num_df.lt(lower_fence)
above_upper = num_df.gt(upper_fence)
outliers = (below_lower | above_upper).sum()
print("\nOutliers per numeric column (IQR method):\n", outliers)


Missing values per column:
 order_id       0
customer_id    0
order_date     0
price          0
quantity       0
dtype: int64

Outliers per numeric column (IQR method):
 order_id    0
price       0
quantity    0
dtype: int64
