# Session 4.1: Data Cleaning & Transformation

**Durasi:** 60 menit  
**Dataset:** RUP 2025

## Tujuan Pembelajaran
- Mendeteksi dan menangani missing values
- Mendeteksi dan menangani outliers
- Encoding categorical variables
- Feature engineering
- Data transformation techniques

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Visualization settings
# Apply the 'seaborn-v0_8-whitegrid' matplotlib style sheet to all figures below.
plt.style.use('seaborn-v0_8-whitegrid')
# Make seaborn's 'husl' palette the default colour cycle (set after the style sheet).
sns.set_palette('husl')

# Confirmation message so it is obvious the setup cell ran to completion.
print("✅ Libraries imported successfully!")

## 2. Load Dataset

In [None]:
# Location of the RUP 2025 parquet file, relative to this notebook
data_path = Path('../../../datasets/rup/RUP-PaketPenyedia-Terumumkan-2025.parquet')

# Keep the raw frame untouched for the final report; clean on a separate copy
df_original = pd.read_parquet(data_path)
df = df_original.copy()

# Deep memory usage (includes object/string payloads), converted bytes -> MB
_memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"Dataset shape: {df.shape}")
print(f"Memory usage: {_memory_mb:.2f} MB")

# Last expression of the cell: renders the first rows as the cell output
df.head()

## 3. Data Quality Assessment

### 3.1 Check Data Info

In [None]:
# Banner line reused by both sections of this cell
_rule = "=" * 50

# Structural overview printed by pandas (dtypes, non-null counts, memory)
print(_rule)
print("DATA INFO")
print(_rule)
df.info()

# Numbered listing of every column name, starting at 1
print("\n" + _rule)
print("COLUMN NAMES")
print(_rule)
for position, column_name in enumerate(df.columns, start=1):
    print(f"{position:2d}. {column_name}")

### 3.2 Missing Values Analysis

In [None]:
# Tally nulls per column once; the count and the percentage both reuse it
_null_counts = df.isnull().sum().values

missing = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': _null_counts,
    'Missing_Percentage': (_null_counts / len(df) * 100).round(2),
    'Dtype': df.dtypes.values
})

# Keep only the columns that actually have gaps, worst offenders first
missing = (
    missing.loc[missing['Missing_Count'] > 0]
    .sort_values('Missing_Percentage', ascending=False)
)

_rule = "=" * 70
print(_rule)
print("MISSING VALUES SUMMARY")
print(_rule)
print(missing.to_string(index=False))
print(f"\nTotal columns with missing values: {len(missing)}")

# Horizontal bar chart of the missing share; skipped when nothing is missing
if not missing.empty:
    fig, ax = plt.subplots(figsize=(12, 6))
    _pct_values = missing['Missing_Percentage']
    ax.barh(missing['Column'], _pct_values, color='coral')
    ax.set_xlabel('Missing Percentage (%)', fontweight='bold')
    ax.set_title('Missing Values by Column', fontsize=14, fontweight='bold', pad=20)
    ax.grid(axis='x', alpha=0.3)

    # Label each bar with its percentage; bars are drawn at y = 0, 1, 2, ...
    for bar_pos, pct in enumerate(_pct_values):
        ax.text(pct, bar_pos, f' {pct:.1f}%', va='center', fontweight='bold')

    plt.tight_layout()
    plt.show()

### 3.3 Handling Missing Values

In [None]:
# Strategy 1: Drop columns with > 50% missing
# `missing` is the per-column summary table built in the analysis cell above.
high_missing_cols = missing[missing['Missing_Percentage'] > 50]['Column'].tolist()
print(f"Columns to drop (>50% missing): {high_missing_cols}")

if high_missing_cols:
    df_clean = df.drop(columns=high_missing_cols)
    print(f"Dropped {len(high_missing_cols)} columns")
else:
    df_clean = df.copy()
    print("No columns to drop")

# Strategy 2: Fill numeric missing with median
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# df_clean[col]: the chained in-place form operates on a temporary object, is
# deprecated in pandas 2.x and leaves df_clean unchanged under Copy-on-Write.
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if df_clean[col].isnull().any():
        median_val = df_clean[col].median()
        df_clean[col] = df_clean[col].fillna(median_val)
        print(f"Filled {col} missing values with median: {median_val:,.0f}")

# Strategy 3: Fill categorical missing with 'Unknown'
# Only object-dtype columns are handled here; other dtypes (e.g. datetime or
# category) keep their missing values and show up in the verification below.
categorical_cols = df_clean.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if df_clean[col].isnull().any():
        df_clean[col] = df_clean[col].fillna('Unknown')
        print(f"Filled {col} missing values with 'Unknown'")

# Verify
print(f"\n✅ Missing values after cleaning: {df_clean.isnull().sum().sum()}")

## 4. Outlier Detection & Treatment

### 4.1 Visualize Outliers - Pagu

In [None]:
# Box plot and histogram of pagu, plus summary statistics
if 'pagu' in df_clean.columns:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    box_ax, hist_ax = axes

    # Left panel: box plot, values scaled from rupiah to millions of rupiah
    box_ax.boxplot(df_clean['pagu'] / 1_000_000, vert=True)
    box_ax.set_ylabel('Pagu (Juta Rupiah)', fontweight='bold')
    box_ax.set_title('Box Plot: Pagu (with outliers)', fontweight='bold')
    box_ax.grid(axis='y', alpha=0.3)

    # Right panel: 50-bin histogram on the same scale
    hist_ax.hist(df_clean['pagu'] / 1_000_000, bins=50, color='steelblue', edgecolor='black')
    hist_ax.set_xlabel('Pagu (Juta Rupiah)', fontweight='bold')
    hist_ax.set_ylabel('Frequency', fontweight='bold')
    hist_ax.set_title('Histogram: Pagu Distribution', fontweight='bold')
    hist_ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Descriptive statistics in the original (unscaled) unit
    print("Pagu Statistics:")
    print(df_clean['pagu'].describe())

### 4.2 IQR Method for Outlier Detection

In [None]:
def detect_outliers_iqr(data: pd.DataFrame, column: str):
    """
    Detect outliers in one column using the IQR (Tukey fence) method.

    A row is an outlier when its value lies below Q1 - 1.5 * IQR or above
    Q3 + 1.5 * IQR. A summary is printed as a side effect.

    Args:
        data: DataFrame containing `column`.
        column: Name of the numeric column to inspect.

    Returns:
        Tuple `(outliers, lower_bound, upper_bound)` where `outliers` is the
        subset of `data` whose `column` value falls outside the fences.
    """
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Missing values compare as False on both sides, so they are never flagged
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]

    # Guard the percentage so an empty frame reports 0% instead of raising
    total_rows = len(data)
    outlier_pct = len(outliers) / total_rows * 100 if total_rows else 0.0

    print(f"Column: {column}")
    print(f"Q1 (25th percentile): {Q1:,.0f}")
    print(f"Q3 (75th percentile): {Q3:,.0f}")
    print(f"IQR: {IQR:,.0f}")
    print(f"Lower Bound: {lower_bound:,.0f}")
    print(f"Upper Bound: {upper_bound:,.0f}")
    print(f"Number of outliers: {len(outliers)} ({outlier_pct:.2f}%)")

    return outliers, lower_bound, upper_bound

# Detect outliers in pagu
# Runs only when the column exists; keeps the flagged rows and both IQR fences
# as notebook-level variables (outliers, lower, upper).
if 'pagu' in df_clean.columns:
    outliers, lower, upper = detect_outliers_iqr(df_clean, 'pagu')

### 4.3 Z-Score Method for Outlier Detection

In [None]:
def detect_outliers_zscore(data: pd.DataFrame, column: str, threshold: float = 3):
    """
    Detect outliers in one column using the Z-score method.

    A value is an outlier when its absolute z-score exceeds `threshold`.
    Missing values are ignored when computing the mean and standard deviation
    and are never flagged. A summary is printed as a side effect.

    Args:
        data: DataFrame containing `column`.
        column: Name of the numeric column to inspect.
        threshold: Absolute z-score above which a value counts as an outlier.

    Returns:
        Boolean NumPy array with one entry per row of `data`, in row order,
        so it can be used directly as a positional mask on `data`.
    """
    # Score the full column rather than a dropna() copy so the mask keeps the
    # same length and row order as `data`; nan_policy='omit' excludes NaNs from
    # the statistics and leaves NaN in their positions.
    values = data[column].astype(float).to_numpy()
    z_scores = np.abs(stats.zscore(values, nan_policy='omit'))

    # NaN > threshold evaluates to False, so missing rows are not flagged
    outliers_mask = z_scores > threshold

    # Guard the percentage so an empty frame reports 0% instead of raising
    total_rows = len(data)
    outlier_count = outliers_mask.sum()
    outlier_pct = outlier_count / total_rows * 100 if total_rows else 0.0

    print(f"Column: {column}")
    print(f"Z-score threshold: {threshold}")
    print(f"Number of outliers: {outlier_count} ({outlier_pct:.2f}%)")

    return outliers_mask

# Detect with Z-score
# Runs only when the column exists; uses the function's default threshold of 3.
if 'pagu' in df_clean.columns:
    zscore_outliers = detect_outliers_zscore(df_clean, 'pagu')

### 4.4 Handle Outliers - Capping (Winsorization)

In [None]:
# Option 1: Winsorize pagu by clipping it to its 1st-99th percentile range
if 'pagu' in df_clean.columns:
    # Work on a copy so df_clean keeps the uncapped values
    df_capped = df_clean.copy()

    # Clipping limits taken from the data itself
    lower_cap, upper_cap = (df_capped['pagu'].quantile(q) for q in (0.01, 0.99))
    df_capped['pagu_capped'] = df_capped['pagu'].clip(lower=lower_cap, upper=upper_cap)

    raw_pagu = df_clean['pagu']
    capped_pagu = df_capped['pagu_capped']
    print(f"Original pagu range: {raw_pagu.min():,.0f} - {raw_pagu.max():,.0f}")
    print(f"Capped pagu range: {capped_pagu.min():,.0f} - {capped_pagu.max():,.0f}")

    # Side-by-side histograms (in millions of rupiah): before vs. after capping
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    panels = [
        (axes[0], raw_pagu, 'coral', 'Before Capping'),
        (axes[1], capped_pagu, 'lightgreen', 'After Capping (1%-99%)'),
    ]
    for panel, series, colour, title in panels:
        panel.hist(series / 1_000_000, bins=50, color=colour, edgecolor='black', alpha=0.7)
        panel.set_title(title, fontweight='bold')
        panel.set_xlabel('Pagu (Juta Rp)')
        panel.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

## 5. Data Transformation

### 5.1 Log Transformation for Skewed Data

In [None]:
if 'pagu' in df_clean.columns:
    # log1p computes log(1 + x), so a pagu of zero maps to 0 instead of -inf
    df_clean['pagu_log'] = np.log1p(df_clean['pagu'])

    # Skewness of both versions, reused in the plot titles and the printout
    skew_raw = df_clean['pagu'].skew()
    skew_log = df_clean['pagu_log'].skew()

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    raw_ax, log_ax = axes

    # Left: original values in millions of rupiah
    raw_ax.hist(df_clean['pagu'] / 1_000_000, bins=50, color='steelblue', edgecolor='black')
    raw_ax.set_title(f'Original Distribution\nSkewness: {skew_raw:.2f}', fontweight='bold')
    raw_ax.set_xlabel('Pagu (Juta Rp)')
    raw_ax.set_ylabel('Frequency')

    # Right: log-transformed values
    log_ax.hist(df_clean['pagu_log'], bins=50, color='coral', edgecolor='black')
    log_ax.set_title(f'Log Transformed\nSkewness: {skew_log:.2f}', fontweight='bold')
    log_ax.set_xlabel('Log(Pagu)')
    log_ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

    print(f"Original skewness: {skew_raw:.3f}")
    print(f"Log transformed skewness: {skew_log:.3f}")

### 5.2 Normalization & Standardization

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

if 'pagu' in df_clean.columns:
    # Scalers expect 2-D input, hence the single-column DataFrame
    pagu_frame = df_clean[['pagu']]

    # Min-max normalization: rescales pagu into the 0-1 range
    scaler_minmax = MinMaxScaler()
    df_clean['pagu_normalized'] = scaler_minmax.fit_transform(pagu_frame)

    # Standardization: centres pagu on 0 with unit standard deviation
    scaler_standard = StandardScaler()
    df_clean['pagu_standardized'] = scaler_standard.fit_transform(pagu_frame)

    # Print summary statistics of all three versions for comparison
    summaries = [
        ("Original Pagu:", 'pagu'),
        ("\nNormalized Pagu (0-1):", 'pagu_normalized'),
        ("\nStandardized Pagu (mean=0, std=1):", 'pagu_standardized'),
    ]
    for heading, column_name in summaries:
        print(heading)
        print(df_clean[column_name].describe())

## 6. Encoding Categorical Variables

### 6.1 Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

if 'metode_pengadaan' in df_clean.columns:
    # Fit the encoder and store the integer code of every row
    le = LabelEncoder()
    method_codes = le.fit_transform(df_clean['metode_pengadaan'])
    df_clean['metode_encoded'] = method_codes

    # Category -> code lookup, rebuilt from the fitted encoder
    mapping = {
        category: code
        for category, code in zip(le.classes_, le.transform(le.classes_))
    }
    print("Label Encoding Mapping:")
    for category, code in sorted(mapping.items(), key=lambda pair: pair[1]):
        print(f"  {code}: {category}")

    # One row per distinct (category, code) pair as a sanity check
    print("\nSample comparison:")
    pairs = df_clean[['metode_pengadaan', 'metode_encoded']].drop_duplicates()
    print(pairs.head(10))

### 6.2 One-Hot Encoding

In [None]:
if 'metode_pengadaan' in df_clean.columns:
    # Restrict to the five most frequent methods to keep the dummy columns few
    method_counts = df_clean['metode_pengadaan'].value_counts()
    top_methods = method_counts.head(5).index
    in_top_methods = df_clean['metode_pengadaan'].isin(top_methods)
    df_subset = df_clean[in_top_methods].copy()

    # One indicator column per remaining method, named "metode_<value>"
    df_onehot = pd.get_dummies(df_subset['metode_pengadaan'], prefix='metode')

    print("One-Hot Encoded columns:")
    print(df_onehot.columns.tolist())

    print("\nSample:")
    print(df_onehot.head())

    # Shape before and after attaching the indicator columns
    combined = pd.concat([df_subset, df_onehot], axis=1)
    print(f"\nOriginal shape: {df_subset.shape}")
    print(f"After one-hot encoding: {combined.shape}")

### 6.3 Frequency Encoding

In [None]:
if 'metode_pengadaan' in df_clean.columns:
    # Replace each method by the number of rows that use it
    freq_map = df_clean['metode_pengadaan'].value_counts().to_dict()
    df_clean['metode_frequency'] = df_clean['metode_pengadaan'].map(freq_map)

    # Distinct (method, frequency) pairs, most frequent first
    distinct_pairs = df_clean[['metode_pengadaan', 'metode_frequency']].drop_duplicates()
    most_frequent = distinct_pairs.sort_values('metode_frequency', ascending=False).head(10)

    print("Frequency Encoding:")
    print(most_frequent)

## 7. Feature Engineering

### 7.1 Date Features

In [None]:
if 'tanggal_buat_paket' in df_clean.columns:
    # Parse the creation date; unparseable entries become NaT (errors='coerce')
    df_clean['tanggal'] = pd.to_datetime(df_clean['tanggal_buat_paket'], errors='coerce')
    tanggal_dt = df_clean['tanggal'].dt

    # Calendar components
    df_clean['year'] = tanggal_dt.year
    df_clean['month'] = tanggal_dt.month
    df_clean['quarter'] = tanggal_dt.quarter
    df_clean['day_of_week'] = tanggal_dt.dayofweek
    df_clean['day_name'] = tanggal_dt.day_name()

    # 0/1 flags; dayofweek counts Monday as 0, so 5 and 6 are Saturday/Sunday
    df_clean['is_weekend'] = df_clean['day_of_week'].isin([5, 6]).astype(int)
    for flag_name in ('is_month_end', 'is_quarter_end', 'is_year_end'):
        df_clean[flag_name] = getattr(tanggal_dt, flag_name).astype(int)

    print("Date Features Created:")
    date_features = [
        'tanggal', 'year', 'month', 'quarter', 'day_of_week', 'day_name',
        'is_weekend', 'is_month_end', 'is_quarter_end', 'is_year_end',
    ]
    print(df_clean[date_features].head(10))

### 7.2 Binning - Create Categories

In [None]:
if 'pagu' in df_clean.columns:
    # Create pagu categories (bin edges in rupiah: 100M, 500M, 1B, 5B)
    bins = [0, 100_000_000, 500_000_000, 1_000_000_000, 5_000_000_000, float('inf')]
    labels = ['Sangat Kecil', 'Kecil', 'Sedang', 'Besar', 'Sangat Besar']

    # include_lowest=True closes the first interval on the left, so a pagu of
    # exactly 0 lands in 'Sangat Kecil' instead of becoming NaN (pd.cut bins
    # are right-closed and left-open by default).
    df_clean['pagu_category'] = pd.cut(
        df_clean['pagu'], bins=bins, labels=labels, include_lowest=True
    )

    # Count once, in category order, so the table and the chart agree
    category_counts = df_clean['pagu_category'].value_counts().sort_index()

    print("Pagu Categories:")
    print(category_counts)

    # Visualize: bars run from the smallest to the largest category
    fig, ax = plt.subplots(figsize=(10, 6))
    category_counts.plot(kind='bar', ax=ax, color='steelblue', edgecolor='black')
    ax.set_title('Distribusi Kategori Pagu', fontsize=14, fontweight='bold', pad=20)
    ax.set_xlabel('Kategori Pagu', fontweight='bold')
    ax.set_ylabel('Jumlah Paket', fontweight='bold')
    ax.grid(axis='y', alpha=0.3)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

### 7.3 Text Features (if applicable)

In [None]:
# List columns whose name suggests free text (contains 'nama' or 'keterangan')
_text_keywords = ('nama', 'keterangan')
text_cols = [
    col for col in df_clean.columns
    if any(keyword in col.lower() for keyword in _text_keywords)
]
print(f"Potential text columns: {text_cols}")

if 'nama_paket' in df_clean.columns:
    package_names = df_clean['nama_paket']

    # Character count of each package name
    df_clean['nama_paket_length'] = package_names.str.len()

    # Number of whitespace-separated tokens in each package name
    df_clean['nama_paket_word_count'] = package_names.str.split().str.len()

    derived_cols = ['nama_paket_length', 'nama_paket_word_count']

    print("Text Features:")
    print(df_clean[['nama_paket'] + derived_cols].head())

    print("\nText Statistics:")
    print(df_clean[derived_cols].describe())

## 8. Data Quality Report

In [None]:
# Summary of what the cleaning pipeline changed, comparing the untouched
# df_original against the working frame df_clean.
print("=" * 70)
print("DATA CLEANING SUMMARY REPORT")
print("=" * 70)

# Compare column sets in both directions: the net difference in column count
# under-reports additions whenever cleaning also dropped columns.
new_cols = set(df_clean.columns) - set(df_original.columns)
dropped_cols = set(df_original.columns) - set(df_clean.columns)

print("\n1. DATASET SIZE")
print(f"   Original: {df_original.shape}")
print(f"   After Cleaning: {df_clean.shape}")
print(f"   Rows removed: {len(df_original) - len(df_clean)}")
print(f"   Columns dropped: {len(dropped_cols)}")
print(f"   Columns added: {len(new_cols)}")

print("\n2. MISSING VALUES")
print(f"   Original: {df_original.isnull().sum().sum()}")
print(f"   After Cleaning: {df_clean.isnull().sum().sum()}")

print("\n3. DUPLICATES")
print(f"   Duplicate rows: {df_clean.duplicated().sum()}")

print("\n4. NEW FEATURES CREATED")
for col in sorted(new_cols):
    print(f"   - {col}")

print("\n5. DATA TYPES")
print(df_clean.dtypes.value_counts())

print("\n" + "=" * 70)

## 9. Export Cleaned Data

In [None]:
# Create output directory
# parents=True also creates missing intermediate directories; without it
# mkdir raises FileNotFoundError when the parent path does not exist yet.
output_dir = Path('../../../data')
output_dir.mkdir(parents=True, exist_ok=True)

# Export cleaned data (the DataFrame index is not written to the file)
output_path = output_dir / 'rup_cleaned.parquet'
df_clean.to_parquet(output_path, index=False)
print(f"✅ Cleaned data saved to: {output_path}")
# Report the on-disk size in MB
print(f"   File size: {output_path.stat().st_size / 1024**2:.2f} MB")

## 10. Key Takeaways

### Missing Values:
- ✅ Drop columns with >50% missing
- ✅ Fill numeric with median/mean
- ✅ Fill categorical with mode/'Unknown'

### Outliers:
- ✅ IQR method for detection
- ✅ Z-score method for detection
- ✅ Capping/Winsorization for treatment
- ✅ Log transformation for skewed data

### Encoding:
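- ✅ Label Encoding untuk ordinal (catatan: `LabelEncoder` memberi kode menurut urutan alfabet; gunakan pemetaan manual atau `OrdinalEncoder` bila urutan kategori penting)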
- ✅ One-Hot Encoding untuk nominal
- ✅ Frequency Encoding untuk high cardinality

### Feature Engineering:
- ✅ Date features (year, month, quarter, etc.)
- ✅ Binning untuk categories
- ✅ Text features (length, word count)
- ✅ Derived features dari existing data