# GAA Budget Machine Learning - Quick Start

This notebook demonstrates how to use the ML pipeline for budget analysis.

## Contents
1. Load the GAA budget data
2. Explore the data distribution
3. Feature engineering
4. View model results (forecasts, anomalies, clusters)
5. Next steps

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Plot defaults: seaborn's white-grid theme and a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load GAA Budget Data

In [None]:
# Load the main GAA dataset (path is relative to the notebook's working directory)
df = pd.read_parquet('../../gaa.parquet')

# Convert to plain Python values before printing: a list of NumPy scalars is
# rendered element-by-element with repr(), which under NumPy >= 2 shows up as
# e.g. `np.int64(2020)` instead of `2020`.
# NOTE(review): presumably 'year' is an integer column — confirm against the data
years_available = sorted(df['year'].unique().tolist())

print(f"Dataset shape: {df.shape}")
print(f"Years available: {years_available}")
print(f"Total budget: ₱{df['amt'].sum():,.2f}")

# Last top-level expression of the cell, so Jupyter renders it as a table
df.head()

## 2. Explore Data Distribution

In [None]:
# Total amount per year, flattened back into a two-column frame
yearly_budget = df.groupby('year')['amt'].sum().reset_index()

# One figure, two side-by-side panels
fig, (ax_year, ax_dept) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: yearly totals, scaled from pesos to trillions of pesos
ax_year.bar(yearly_budget['year'], yearly_budget['amt'] / 1e12)
ax_year.set_xlabel('Year')
ax_year.set_ylabel('Budget (Trillion ₱)')
ax_year.set_title('Total Budget by Year')
ax_year.grid(axis='y', alpha=0.3)

# Right panel: the ten departments with the largest summed amount
top_depts = df.groupby('uacs_dpt_dsc')['amt'].sum().nlargest(10)
top_depts.plot(kind='barh', ax=ax_dept)
ax_dept.set_xlabel('Budget (₱)')
ax_dept.set_title('Top 10 Departments')

fig.tight_layout()
plt.show()

## 3. Feature Engineering

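Run the feature engineering script (`python ../features/feature_engineering.py`) to create ML-ready features. The cell below loads the generated features if they already exist.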

In [None]:
# Load the engineered features if they have already been generated.
# To (re)create them, run from the command line:
#     python ../features/feature_engineering.py
features_path = Path('../features/budget_features.parquet')

if features_path.exists():
    df_features = pd.read_parquet(features_path)

    # Build the list of added columns once and report its length, so the
    # printed count always matches the names shown below. A plain difference
    # of column counts is wrong whenever the feature frame lacks (or renames)
    # some of the raw columns.
    new_cols = [col for col in df_features.columns if col not in df.columns]

    print(f"Loaded features: {df_features.shape}")
    print(f"New features added: {len(new_cols)}")

    # Show a sample of the new features; the ellipsis is only appended when
    # the list was actually truncated to its first ten entries.
    suffix = "..." if len(new_cols) > 10 else ""
    print(f"\nNew feature columns: {new_cols[:10]}{suffix}")
else:
    print("Features not yet generated. Run: python ../features/feature_engineering.py")

## 4. View Model Results

Load and visualize the outputs of the trained models: budget forecasts, detected anomalies, and spending clusters.

In [None]:
# Load forecast predictions written by the ML pipeline.
# `predictions_dir` is also used by the anomaly and clustering cells below.
predictions_dir = Path('../predictions')

if predictions_dir.exists():
    forecast_files = list(predictions_dir.glob('budget_forecast_*.parquet'))

    if forecast_files:
        # "Latest" means the last path in lexicographic order.
        # NOTE(review): assumes the filename suffix sorts chronologically — confirm
        latest_forecast = sorted(forecast_files)[-1]
        df_forecast = pd.read_parquet(latest_forecast)

        print(f"Loaded forecast: {latest_forecast.name}")
        print(f"Predictions shape: {df_forecast.shape}")
        print(f"Total predicted budget: ₱{df_forecast['predicted_amt'].sum():,.2f}")

        # Jupyter only auto-renders the last *top-level* expression of a cell;
        # a bare `df_forecast.head()` nested in this branch would be discarded,
        # so the preview is shown explicitly. `display` is provided by the
        # IPython kernel and needs no import inside a notebook.
        display(df_forecast.head())
    else:
        print("No forecast files found")
else:
    print("Predictions directory not found. Run the ML pipeline first.")

In [None]:
# Load anomaly detection results.
# Relies on `predictions_dir` defined in the forecast cell above.
anomaly_files = list(predictions_dir.glob('anomalies_*.parquet'))

if anomaly_files:
    # "Latest" means the last path in lexicographic order.
    # NOTE(review): assumes the filename suffix sorts chronologically — confirm
    latest_anomalies = sorted(anomaly_files)[-1]
    df_anomalies = pd.read_parquet(latest_anomalies)

    print(f"Loaded anomalies: {latest_anomalies.name}")
    print(f"Number of anomalies: {len(df_anomalies)}")

    # Plot how many rows fall under each anomaly type; skipped when the
    # results file carries no 'anomaly_type' column.
    if 'anomaly_type' in df_anomalies.columns:
        plt.figure(figsize=(10, 5))
        df_anomalies['anomaly_type'].value_counts().plot(kind='bar')
        plt.xlabel('Anomaly Type')
        plt.ylabel('Count')
        plt.title('Anomaly Detection Results')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    # Jupyter only auto-renders the last *top-level* expression of a cell; a
    # bare `df_anomalies.head()` nested in this branch would be discarded, so
    # the preview is shown explicitly. `display` is provided by the IPython
    # kernel and needs no import inside a notebook.
    display(df_anomalies.head())
else:
    print("No anomaly files found")

In [None]:
# Load clustering results.
# Relies on `predictions_dir` defined in the forecast cell above.
cluster_files = list(predictions_dir.glob('spending_clusters_*.parquet'))

if cluster_files:
    # "Latest" means the last path in lexicographic order.
    # NOTE(review): assumes the filename suffix sorts chronologically — confirm
    latest_clusters = sorted(cluster_files)[-1]
    df_clusters = pd.read_parquet(latest_clusters)

    print(f"Loaded clusters: {latest_clusters.name}")
    print(f"Number of entities clustered: {len(df_clusters)}")

    # Two-panel figure: cluster sizes on the left, optional PCA scatter on the right
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    df_clusters['cluster_label'].value_counts().plot(kind='bar')
    plt.xlabel('Cluster')
    plt.ylabel('Count')
    plt.title('Cluster Distribution')
    plt.xticks(rotation=45, ha='right')

    # The scatter is only drawn when both PCA columns are present in the file
    if 'pca_1' in df_clusters.columns and 'pca_2' in df_clusters.columns:
        plt.subplot(1, 2, 2)
        scatter = plt.scatter(df_clusters['pca_1'], df_clusters['pca_2'],
                              c=df_clusters['cluster_id'], cmap='viridis', alpha=0.6)
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        plt.title('Cluster Visualization (PCA)')
        plt.colorbar(scatter, label='Cluster ID')

    plt.tight_layout()
    plt.show()

    # Jupyter only auto-renders the last *top-level* expression of a cell; a
    # bare `df_clusters.head()` nested in this branch would be discarded, so
    # the preview is shown explicitly. `display` is provided by the IPython
    # kernel and needs no import inside a notebook.
    display(df_clusters.head())
else:
    print("No cluster files found")

## 5. Next Steps

- Experiment with different model parameters
- Add custom features based on domain knowledge
- Integrate predictions into the dashboard
- Set up automated retraining pipeline
- Create custom visualizations