# 🌴🌊 SawitFlood Lab - Exploratory Data Analysis

**Analisis Keterkaitan Deforestasi Kelapa Sawit dan Risiko Banjir di Sumatra**

Notebook ini melakukan eksplorasi data awal untuk:
1. Memahami distribusi dan karakteristik data
2. Mengidentifikasi pola dan korelasi
3. Mendeteksi outlier dan data quality issues
4. Visualisasi spasial dan temporal


In [None]:
# Import libraries
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display and plotting defaults.
# The matplotlib style sheet is applied before the seaborn palette, keeping
# the original order of the two calls.
pd.options.display.max_columns = 50
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_palette("husl")

# The project root is taken to be the parent of the current working
# directory; it is placed at the front of the import path.
PROJECT_ROOT = Path.cwd().parent
project_root_entry = str(PROJECT_ROOT)
sys.path.insert(0, project_root_entry)
print(f"Project root: {PROJECT_ROOT}")

## 1. Load Data


In [None]:
# Load the preprocessed analysis dataset.
# Three on-disk formats are probed in priority order (GeoPackage, Parquet,
# CSV); the first file that exists is loaded. If none exists, the dataset is
# built with the project's DatasetBuilder.
processed_dir = PROJECT_ROOT / "data" / "processed"

gpkg_path = processed_dir / "analysis_dataset.gpkg"
parquet_path = processed_dir / "analysis_dataset.parquet"
csv_path = processed_dir / "analysis_dataset.csv"

# gdf stays None unless a spatial source (GeoPackage or the builder) is used.
gdf = None
df = None

# (label, path, reader, is_spatial) in the order they should be tried.
sources = (
    ("GeoPackage", gpkg_path, gpd.read_file, True),
    ("Parquet", parquet_path, pd.read_parquet, False),
    ("CSV", csv_path, pd.read_csv, False),
)

for label, source_path, reader, is_spatial in sources:
    if not source_path.exists():
        continue
    if is_spatial:
        gdf = reader(source_path)
        # Tabular copy without the geometry column for non-spatial analysis.
        df = gdf.drop(columns=["geometry"])
    else:
        df = reader(source_path)
    print(f"Loaded {label}: {source_path}")
    break
else:
    # No candidate file was found: fall back to building the dataset.
    print("No data found. Run data pipeline first.")
    from src.data.build_dataset import DatasetBuilder

    builder = DatasetBuilder()
    gdf = builder.build_analysis_dataset()
    df = gdf.drop(columns=["geometry"])

print(f"\nDataset shape: {df.shape}")
df.head()

## 2. Data Quality Check


In [None]:
# Data quality check: column dtypes / non-null counts and missing values.
print("Dataset Info:")
print("=" * 50)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

# Per-column missing-value counts and their share of all rows (in percent).
missing = df.isnull().sum()
missing_pct = (missing / len(df) * 100).round(2)
if missing.sum() > 0:
    print("\nColumns with missing values:")
    has_missing = missing > 0
    # Show the count next to the percentage for the affected columns only.
    print(
        pd.DataFrame(
            {
                "missing": missing[has_missing],
                "missing_pct": missing_pct[has_missing],
            }
        )
    )
else:
    print("\n✅ No missing values found!")

## 3. Descriptive Statistics


In [None]:
# Summary statistics for every numeric column, rounded to two decimals.
# numeric_cols is kept as a notebook-level name; the correlation cell reads it.
numeric_cols = df.select_dtypes(include=[np.number]).columns
(
    df.loc[:, numeric_cols]
    .describe()
    .round(2)
)

## 4. Distribution Visualizations


In [None]:
# Plot histograms (with KDE) of the key features that are present in df.
key_features = [
    "forest_loss_cumulative_pct",
    "palm_oil_pct",
    "rainfall_annual_mean_mm",
    "risk_probability",
]
available_features = [f for f in key_features if f in df.columns]

if available_features:
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    axes = axes.flatten()

    # zip stops at the shorter sequence, so at most four features are drawn.
    for ax, feature in zip(axes, available_features):
        sns.histplot(df[feature], kde=True, ax=ax, color="steelblue")
        ax.set_title(f"Distribution of {feature}")
        ax.axvline(df[feature].mean(), color="red", linestyle="--", label="Mean")
        ax.legend()

    # The grid always has four axes; hide the ones without a feature so the
    # figure does not show empty frames when some columns are missing.
    for unused_ax in axes[len(available_features):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()
else:
    print("None of the key features are present in the dataset; nothing to plot.")

## 5. Correlation Analysis


In [None]:
# Correlation heatmap of the numeric columns.
# Listwise deletion: only rows complete across every numeric column are kept.
numeric_df = df[numeric_cols].dropna()
if len(numeric_df.columns) < 2:
    print("Correlation analysis skipped: fewer than two numeric columns.")
elif len(numeric_df) < 2:
    # dropna() can remove every row when missing values are spread across
    # columns; a correlation matrix of fewer than two rows is all NaN and
    # would render as a blank heatmap.
    print(
        "Correlation analysis skipped: fewer than two complete rows remain "
        "after dropping rows with missing values."
    )
else:
    corr_matrix = numeric_df.corr()

    plt.figure(figsize=(12, 10))
    # Mask the upper triangle (diagonal included) so each pair appears once.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    sns.heatmap(
        corr_matrix, mask=mask, annot=True, cmap="RdYlGn_r", center=0, fmt=".2f", linewidths=0.5
    )
    plt.title("Feature Correlation Matrix", fontsize=14)
    plt.tight_layout()
    plt.show()

## 6. Next Steps

Based on this EDA, proceed to:
1. **Model Training**: `02_modeling_risk.ipynb`
2. **SHAP Analysis**: `03_xai_shap_analysis.ipynb`

---
*SawitFlood Lab - Environmental Risk Analysis*
