# Session 3.1: Static Visualization dengan Matplotlib & Seaborn

**Durasi:** 45 menit  
**Dataset:** RUP 2025

## Tujuan Pembelajaran
- Membuat visualisasi static dengan Matplotlib
- Menggunakan Seaborn untuk statistical plots
- Melakukan kustomisasi chart untuk presentasi
- Best practices visualisasi data

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Global plotting defaults for the notebook: matplotlib's bundled
# seaborn-style darkgrid theme, seaborn's 'husl' palette as the colour cycle,
# and a default figure size / base font size set through rcParams.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Suppress all warnings so they do not clutter the cell output.
import warnings
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully!")

## 2. Load Dataset RUP

In [None]:
# Read the RUP 2025 parquet file into `df`, the DataFrame used by the later
# cells. The path is relative, so it is resolved against the current working
# directory.
data_path = Path('../../../datasets/rup/RUP-PaketPenyedia-Terumumkan-2025.parquet')
df = pd.read_parquet(data_path)

# Quick sanity check: dimensions and column names, then a preview of the
# first rows (the last expression is what the notebook displays).
dataset_shape = df.shape
column_names = list(df.columns)
print(f"Dataset shape: {dataset_shape}")
print(f"\nColumns: {column_names}")
df.head(5)

## 3. Matplotlib Basics

### 3.1 Histogram - Distribusi Pagu

In [None]:
# Keep only rows strictly below the 95th percentile of `pagu` so a handful of
# extreme values do not squash the histogram. `pagu_filtered` is reused by the
# dashboard cell later in this notebook.
pagu_cutoff = df['pagu'].quantile(0.95)
pagu_filtered = df.loc[df['pagu'] < pagu_cutoff, 'pagu']

# Histogram of the filtered values, rescaled by 1e6 to match the
# "Juta Rupiah" (millions of rupiah) x-axis label.
fig, ax = plt.subplots(figsize=(12, 6))

ax.hist(pagu_filtered / 1_000_000, bins=50, color='steelblue', edgecolor='black', alpha=0.7)
ax.set_title('Distribusi Pagu Pengadaan RUP 2025\n(< Percentile 95)',
             fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Pagu (Juta Rupiah)', fontsize=12)
ax.set_ylabel('Frekuensi', fontsize=12)
ax.grid(True, alpha=0.3)

# Mean and median of the *filtered* values, in the same unit as the x-axis.
mean_pagu = pagu_filtered.mean() / 1_000_000
median_pagu = pagu_filtered.median() / 1_000_000

# The unit is spelled out as "Juta": this notebook's bar chart uses the "M"
# suffix for Miliar (billions), so "M" on these million-scaled values would
# be ambiguous.
ax.axvline(mean_pagu, color='red', linestyle='--', linewidth=2,
           label=f'Mean: Rp {mean_pagu:.2f} Juta')
ax.axvline(median_pagu, color='green', linestyle='--', linewidth=2,
           label=f'Median: Rp {median_pagu:.2f} Juta')
ax.legend()

plt.tight_layout()
plt.show()

print(f"Mean Pagu: Rp {mean_pagu:.2f} Juta")
print(f"Median Pagu: Rp {median_pagu:.2f} Juta")

### 3.2 Bar Chart - Top 10 Satker by Total Pagu

In [None]:
# Total pagu per satker, keeping the 10 largest totals in descending order.
top_satker = df.groupby('nama_satker')['pagu'].sum().sort_values(ascending=False).head(10)

# Names longer than 60 characters are cut and marked with an ellipsis so the
# tick labels stay readable.
satker_labels = [name[:60] + '...' if len(name) > 60 else name for name in top_satker.index]

# Horizontal bar chart; values are rescaled by 1e9 to match the
# "Miliar Rupiah" (billions of rupiah) x-axis label.
fig, ax = plt.subplots(figsize=(12, 8))

bars = ax.barh(range(len(top_satker)), top_satker.values / 1_000_000_000, color='coral')
ax.set_yticks(range(len(top_satker)))
ax.set_yticklabels(satker_labels)
# barh puts position 0 at the bottom; flip the axis so the largest satker
# (rank 1) is listed first, at the top.
ax.invert_yaxis()
ax.set_xlabel('Total Pagu (Miliar Rupiah)', fontsize=12, fontweight='bold')
ax.set_title('Top 10 Satker by Total Pagu Pengadaan', fontsize=14, fontweight='bold', pad=20)
ax.grid(axis='x', alpha=0.3)

# Value labels at the end of each bar ("M" = Miliar, same unit as the x-axis).
# bar_label replaces the manual ax.text loop and matches the count plot cell.
ax.bar_label(bars, fmt='Rp %.2fM', padding=3, fontsize=9, fontweight='bold')

plt.tight_layout()
plt.show()

### 3.3 Pie Chart - Distribusi Metode Pengadaan

In [None]:
# Number of packages per procurement method (value_counts sorts descending);
# the 7 most frequent methods get their own slice.
metode_all = df['metode_pengadaan'].value_counts()
metode_count = metode_all.head(7)

# A pie chart normalises over the values it is given, so plotting only the
# top 7 would overstate every percentage. Everything outside the top 7 is
# folded into a single "Lainnya" (other) slice so the percentages describe
# the share of all counted packages.
lainnya_count = int(metode_all.iloc[7:].sum())
pie_labels = [str(label) for label in metode_count.index]
pie_values = metode_count.tolist()
if lainnya_count > 0:
    pie_labels.append('Lainnya')
    pie_values.append(lainnya_count)

# Create pie chart
fig, ax = plt.subplots(figsize=(10, 8))

colors = sns.color_palette('pastel')
wedges, texts, autotexts = ax.pie(pie_values,
                                  labels=pie_labels,
                                  autopct='%1.1f%%',
                                  startangle=90,
                                  colors=colors)

# Styling: category labels slightly larger, percentage labels bold black
for text in texts:
    text.set_fontsize(10)
for autotext in autotexts:
    autotext.set_color('black')
    autotext.set_fontweight('bold')
    autotext.set_fontsize(9)

# Only mention the "Lainnya" slice in the title when it is actually drawn.
pie_subtitle = '(Top 7 Methods + Lainnya)' if lainnya_count > 0 else '(Top 7 Methods)'
ax.set_title(f'Distribusi Metode Pengadaan\n{pie_subtitle}', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

print("\nJumlah Paket per Metode:")
print(metode_count)
if lainnya_count > 0:
    print(f"Lainnya: {lainnya_count}")

### 3.4 Line Chart - Trend Pengumuman Over Time

In [None]:
# Daily count of packages by creation date. The whole chart is skipped when
# the source column is absent from the dataset.
if 'tanggal_buat_paket' not in df.columns:
    print("Column 'tanggal_buat_paket' not found in dataset")
else:
    # Parse the column into datetimes; unparseable values become NaT
    # (errors='coerce'). The result is stored as a new `tanggal` column on df.
    df['tanggal'] = pd.to_datetime(df['tanggal_buat_paket'], errors='coerce')

    # One row count per calendar date
    daily_trend = df.groupby(df['tanggal'].dt.date).size()
    trend_dates = daily_trend.index
    trend_counts = daily_trend.values

    # Line with small markers plus a translucent fill underneath
    fig, ax = plt.subplots(figsize=(14, 6))
    ax.plot(trend_dates, trend_counts, color='darkblue', linewidth=2, marker='o', markersize=3)
    ax.fill_between(trend_dates, trend_counts, alpha=0.3)

    ax.set_xlabel('Tanggal', fontsize=12)
    ax.set_ylabel('Jumlah Paket', fontsize=12)
    ax.set_title('Trend Pengumuman Paket Pengadaan per Hari', fontsize=14, fontweight='bold', pad=20)
    ax.grid(True, alpha=0.3)
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

## 4. Seaborn Statistical Plots

### 4.1 Box Plot - Pagu by Metode Pengadaan

In [None]:
# Restrict to the 5 most frequent procurement methods and to rows strictly
# below the 90th percentile of `pagu` (computed over the whole dataset).
# `df_filtered` is reused by the violin plot cell.
top_methods = df['metode_pengadaan'].value_counts().head(5).index
pagu_p90 = df['pagu'].quantile(0.90)
df_filtered = df[df['metode_pengadaan'].isin(top_methods) & (df['pagu'] < pagu_p90)]

# Create box plot
fig, ax = plt.subplots(figsize=(14, 7))

sns.boxplot(data=df_filtered, x='metode_pengadaan', y='pagu', ax=ax, palette='Set2')
ax.set_title('Distribusi Pagu by Metode Pengadaan\n(Top 5 Methods, < Percentile 90)',
             fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Metode Pengadaan', fontsize=12, fontweight='bold')
# Ticks are shown in millions (see the formatter below), so the unit lives in
# the axis label. A bare "M" suffix is avoided because this notebook's bar
# chart uses "M" for Miliar (billions).
ax.set_ylabel('Pagu (Juta Rupiah)', fontsize=12, fontweight='bold')
# Restyle the existing tick labels in place rather than re-setting them.
plt.setp(ax.get_xticklabels(), rotation=15, ha='right')

# Format y-axis ticks as whole millions of rupiah
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{x/1e6:.0f}'))

plt.tight_layout()
plt.show()

### 4.2 Violin Plot - Pagu Distribution

In [None]:
# Violin plot of `pagu` per procurement method, drawn from `df_filtered`,
# which is built in the box plot cell (that cell must have been run first).
fig, ax = plt.subplots(figsize=(14, 7))

sns.violinplot(data=df_filtered, x='metode_pengadaan', y='pagu', ax=ax, palette='muted')
ax.set_title('Violin Plot: Distribusi Pagu by Metode Pengadaan',
             fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Metode Pengadaan', fontsize=12, fontweight='bold')
# Ticks are shown in millions (see the formatter below), so the unit lives in
# the axis label. A bare "M" suffix is avoided because this notebook's bar
# chart uses "M" for Miliar (billions).
ax.set_ylabel('Pagu (Juta Rupiah)', fontsize=12, fontweight='bold')
# Restyle the existing tick labels in place rather than re-setting them.
plt.setp(ax.get_xticklabels(), rotation=15, ha='right')
# Format y-axis ticks as whole millions of rupiah
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{x/1e6:.0f}'))

plt.tight_layout()
plt.show()

### 4.3 Count Plot - Paket per Metode

In [None]:
# The ten most frequent procurement methods, most common first; this order
# is passed to countplot so only these categories are drawn.
top10_order = df['metode_pengadaan'].value_counts().head(10).index

# Horizontal count plot: one bar per method, length = number of packages
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=df,
    y='metode_pengadaan',
    order=top10_order,
    palette='viridis',
    ax=ax,
)
ax.set_xlabel('Jumlah Paket', fontsize=12, fontweight='bold')
ax.set_ylabel('Metode Pengadaan', fontsize=12, fontweight='bold')
ax.set_title('Top 10 Metode Pengadaan by Jumlah Paket', fontsize=14, fontweight='bold', pad=20)

# Write the integer count next to every bar
for bar_group in ax.containers:
    ax.bar_label(bar_group, fmt='%d', padding=3, fontsize=9, fontweight='bold')

plt.tight_layout()
plt.show()

### 4.4 Heatmap - Correlation Matrix

In [None]:
# Collect the names of all numeric columns in df
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numeric columns: {numeric_cols}")

# A correlation matrix needs at least two numeric columns
if len(numeric_cols) <= 1:
    print("Not enough numeric columns for correlation matrix")
else:
    # Pairwise correlation between the numeric columns
    corr = df[numeric_cols].corr()

    # Annotated heatmap on a diverging colour map centred at zero
    heatmap_options = dict(
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr, ax=ax, **heatmap_options)
    ax.set_title('Correlation Matrix - Numeric Variables', fontsize=14, fontweight='bold', pad=20)

    plt.tight_layout()
    plt.show()

## 5. Subplots - Multiple Charts

In [None]:
# 2x2 dashboard combining four of the earlier charts in a single figure.
# Relies on `pagu_filtered` from the histogram cell.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Dashboard Analisis RUP 2025', fontsize=16, fontweight='bold', y=1.00)

# Chart 1: Histogram of pagu, rescaled by 1e6 to match the "Juta Rp" label
axes[0, 0].hist(pagu_filtered / 1_000_000, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
axes[0, 0].set_title('Distribusi Pagu', fontweight='bold')
axes[0, 0].set_xlabel('Pagu (Juta Rp)')
axes[0, 0].set_ylabel('Frekuensi')
axes[0, 0].grid(True, alpha=0.3)

# Chart 2: Top 5 Satker by total pagu, rescaled by 1e9 ("Miliar Rp")
top5_satker = df.groupby('nama_satker')['pagu'].sum().sort_values(ascending=False).head(5)
# Add the ellipsis only when a name was actually cut at 40 characters
top5_labels = [name[:40] + '...' if len(name) > 40 else name for name in top5_satker.index]
axes[0, 1].barh(range(len(top5_satker)), top5_satker.values / 1_000_000_000, color='coral')
axes[0, 1].set_yticks(range(len(top5_satker)))
axes[0, 1].set_yticklabels(top5_labels, fontsize=8)
# barh puts position 0 at the bottom; flip so the largest satker is on top
axes[0, 1].invert_yaxis()
axes[0, 1].set_title('Top 5 Satker by Pagu', fontweight='bold')
axes[0, 1].set_xlabel('Total Pagu (Miliar Rp)')

# Chart 3: Package counts for the 5 most frequent procurement methods.
# value_counts is computed once and shared with the pie chart below.
metode_counts_all = df['metode_pengadaan'].value_counts()
metode_top5 = metode_counts_all.head(5)
axes[1, 0].bar(range(len(metode_top5)), metode_top5.values, color='lightgreen', edgecolor='black')
axes[1, 0].set_xticks(range(len(metode_top5)))
axes[1, 0].set_xticklabels(metode_top5.index, rotation=45, ha='right', fontsize=8)
axes[1, 0].set_title('Top 5 Metode Pengadaan', fontweight='bold')
axes[1, 0].set_ylabel('Jumlah Paket')
axes[1, 0].grid(axis='y', alpha=0.3)

# Chart 4: Pie chart. A pie normalises over the values it is given, so the
# methods outside the top 5 are folded into a "Lainnya" (other) slice to keep
# the percentages relative to all counted packages.
metode_pie = metode_top5
dash_pie_labels = [str(label) for label in metode_pie.index]
dash_pie_values = metode_pie.tolist()
dash_other_count = int(metode_counts_all.iloc[5:].sum())
if dash_other_count > 0:
    dash_pie_labels.append('Lainnya')
    dash_pie_values.append(dash_other_count)
axes[1, 1].pie(dash_pie_values, labels=dash_pie_labels, autopct='%1.1f%%', startangle=90)
dash_pie_title = 'Distribusi Metode (Top 5 + Lainnya)' if dash_other_count > 0 else 'Distribusi Metode (Top 5)'
axes[1, 1].set_title(dash_pie_title, fontweight='bold')

plt.tight_layout()
plt.show()

## 6. Export Visualizations

In [None]:
# Make sure the output folder exists (no error if it is already there)
output_dir = Path('../outputs')
output_dir.mkdir(exist_ok=True)

# Save the figure currently bound to `fig` (the most recently created one)
# as a high-resolution PNG with tight bounding box.
dashboard_png = output_dir / 'rup_dashboard.png'
fig.savefig(dashboard_png, dpi=300, bbox_inches='tight')
print(f"✅ Chart saved to {dashboard_png}")

## 7. Summary & Key Takeaways

### Matplotlib:
- ✅ Histogram untuk distribusi data
- ✅ Bar chart untuk perbandingan kategori
- ✅ Line chart untuk trend temporal
- ✅ Pie chart untuk proporsi
- ✅ Subplots untuk multiple visualizations

### Seaborn:
- ✅ Box plot untuk distribusi dengan outliers
- ✅ Violin plot untuk distribusi density
- ✅ Count plot untuk categorical frequency
- ✅ Heatmap untuk correlation matrix

### Best Practices:
- ✅ Gunakan color palettes yang konsisten
- ✅ Tambahkan title, labels, dan legends
- ✅ Filter outliers untuk visualisasi yang lebih baik
- ✅ Export dengan high resolution (dpi=300)

## 8. Exercises

**Coba buat visualisasi berikut:**

1. Scatter plot: Hubungan antara 2 variabel numerik
2. Stacked bar chart: Metode pengadaan by kategori
3. Area chart: Cumulative paket over time
4. Pair plot: Multiple variables relationship

**Tips:**
- Eksperimen dengan color palettes berbeda
- Tambahkan annotations untuk insights penting
- Gunakan grid dan styling untuk readability