# 02 - Data Cleaning
## Nusantara Food Watch - Clean and Validate Data

**Purpose:** Clean the extracted data: remove outliers and handle missing values.

**Input:** CSV files from `data/interim/`

**Output:** Cleaned CSV files in `data/processed/`

---

## Setup

In [None]:
# Add project root to Python path
import sys
from pathlib import Path

# Locate the project root: the nearest directory, starting at the current
# working directory and walking upwards, that contains the ``src/data_analysis``
# package this notebook imports from. This keeps the notebook working whether
# the kernel is started in the project root or somewhere below it.
# If no such directory exists, fall back to the parent of the working
# directory, which is what this cell originally assumed.
_cwd = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (_cwd, *_cwd.parents)
        if (candidate / "src" / "data_analysis").is_dir()
    ),
    _cwd.parent,
)

# Keep the root at the front of sys.path exactly once, so re-running this
# cell does not pile up duplicate entries.
_root_entry = str(project_root)
sys.path[:] = [entry for entry in sys.path if entry != _root_entry]
sys.path.insert(0, _root_entry)

print(f"📁 Project root: {project_root}")

In [None]:
# Standard imports
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Our custom utilities
from src.data_analysis.utils import (
    DataSaver, 
    save_csv,
    setup_plot_style,
    check_missing_values,
    detect_outliers_iqr
)

from src.data_analysis.config import (
    INTERIM_DIR, 
    PROCESSED_DIR, 
    FIGURES_DIR
)

# Setup plotting style
setup_plot_style()
%matplotlib inline

print("‚úÖ Imports complete!")
print(f"\nüìÅ Working directories:")
print(f"   Input (Interim): {INTERIM_DIR}")
print(f"   Output (Processed): {PROCESSED_DIR}")
print(f"   Figures: {FIGURES_DIR}")

## Configuration

In [None]:
# Input file from interim folder
INPUT_FILE = 'extracted_data.csv'  # Change this to your interim file

# Output file for processed folder
OUTPUT_FILE = 'cleaned_data.csv'

# Cleaning parameters
REMOVE_NULLS = True
REMOVE_OUTLIERS = True
OUTLIER_METHOD = 'iqr'  # 'iqr' or 'zscore'
IQR_MULTIPLIER = 1.5  # Standard is 1.5

# Fail fast on a typo in the method name instead of letting it surface
# later in whatever cleaning logic consumes this setting.
_SUPPORTED_OUTLIER_METHODS = ('iqr', 'zscore')
if OUTLIER_METHOD not in _SUPPORTED_OUTLIER_METHODS:
    raise ValueError(
        f"OUTLIER_METHOD must be one of {_SUPPORTED_OUTLIER_METHODS}, "
        f"got {OUTLIER_METHOD!r}"
    )

# Summarise the active configuration
print(f"📥 Input: {INPUT_FILE}")
print(f"📤 Output: {OUTPUT_FILE}")
print(f"🧹 Remove nulls: {REMOVE_NULLS}")
print(f"🧹 Remove outliers: {REMOVE_OUTLIERS}")
print(f"🧹 Outlier method: {OUTLIER_METHOD}")

## Load Data

In [None]:
# Load the configured input CSV from the interim folder
df = pd.read_csv(INTERIM_DIR / INPUT_FILE)

# Quick sanity report: row count, dimensions and column names
print(f"✅ Loaded {len(df):,} records")
print(f"📊 Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")

In [None]:
# Show the first five rows as a quick look at the loaded data
df.head(n=5)

---
## Your Analysis Here

Use the cells below for your data cleaning logic.

In [None]:
# Example: Check missing values
# missing = check_missing_values(df)
# print(missing)

In [None]:
# Example: Detect outliers
# outliers, lower, upper = detect_outliers_iqr(df, 'harga')
# print(f"Outliers: {outliers.sum()}")

---
## Save Cleaned Data

In [None]:
# Example: Save to processed folder
# save_csv(df_clean, OUTPUT_FILE, processed=True)