In [5]:
import numpy as np
import pandas as pd
import os
import sys
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(project_root)
from src.data_enrich import DataEnrichment

In [6]:
initial_df = pd.read_csv("../data/raw/ethiopia_fi_unified_data.csv")

In [7]:
# 1. Initialize your class with your current data
# (Assuming your initial CSV is loaded as 'initial_df')
profiler = DataEnrichment(initial_df)


# 2. Define the Enriched Array of Objects
new_data = [
    # --- ACCESS PILLAR: Infrastructure & Surveys ---
    {
        'record_id': 'REC_0011', 'record_type': 'observation', 'pillar': 'ACCESS',
        'indicator': 'Mobile Subscription Penetration', 'indicator_code': 'ACC_MOBILE_PEN',
        'value_numeric': 61.4, 'observation_date': '2025-12-31', 'source_name': 'DataReportal',
        'source_url': 'https://datareportal.com/reports/digital-2026-ethiopia', 'confidence': 'high',
        'notes': '93.7M connections / 152.7M population'
    },
    {
        'record_id': 'REC_0012', 'record_type': 'observation', 'pillar': 'ACCESS',
        'indicator': 'Fayda Digital ID Enrollment', 'indicator_code': 'ACC_FAYDA',
        'value_numeric': 12000000.0, 'observation_date': '2025-02-28', 'source_name': 'World Bank',
        'confidence': 'high', 'notes': 'Over 12 million registered as of Feb 2025'
    },
    {
        'record_id': 'REC_0013', 'record_type': 'observation', 'pillar': 'ACCESS',
        'indicator': 'Account ownership, total (% age 15+)', 'indicator_code': 'ACC_OWNERSHIP',
        'value_numeric': 49.0, 'observation_date': '2024-12-31', 'source_name': 'World Bank Findex 2025',
        'source_url': 'https://www.worldbank.org/en/publication/globalfindex', 'confidence': 'high',
        'notes': 'Stagnation noted: grew only 3% since 2021 despite digital account surge.'
    },
    {
        'record_id': 'REC_0014', 'record_type': 'observation', 'pillar': 'ACCESS',
        'indicator': 'Account ownership, female (% age 15+)', 'indicator_code': 'ACC_OWN_FEMALE',
        'value_numeric': 41.6, 'observation_date': '2024-12-31', 'source_name': 'World Bank Findex 2025',
        'confidence': 'high', 'notes': 'Gender gap persists at approx 14%.'
    },
    {
        'record_id': 'REC_0015', 'record_type': 'observation', 'pillar': 'ACCESS',
        'indicator': 'Registered Mobile Money Agents', 'indicator_code': 'ACC_MM_AGENTS',
        'value_numeric': 500000.0, 'observation_date': '2025-06-30', 'source_name': 'NBE Annual Report 2024',
        'source_url': 'https://nbe.gov.et/', 'confidence': 'high',
        'notes': 'Massive expansion of agent network to over half a million.'
    },
    {
        'record_id': 'REC_0016', 'record_type': 'observation', 'pillar': 'ACCESS',
        'indicator': 'Mobile Phone Ownership (% adults)', 'indicator_code': 'ACC_PHONE_OWN',
        'value_numeric': 58.0, 'observation_date': '2024-12-31', 'source_name': 'ITU DataHub',
        'source_url': 'https://datahub.itu.int/', 'confidence': 'high',
        'notes': 'Critical barrier: Phone ownership is significantly lower than SSA average (81%).'
    },
    {
        'record_id': 'REC_0017', 'record_type': 'observation', 'pillar': 'ACCESS',
        'indicator': 'Smartphone Penetration Rate', 'indicator_code': 'ACC_SMARTPHONE',
        'value_numeric': 21.7, 'observation_date': '2025-10-01', 'source_name': 'DataReportal 2026',
        'source_url': 'https://datareportal.com/reports/digital-2026-ethiopia', 'confidence': 'high',
        'notes': 'Significant barrier: only ~22% have internet-enabled smartphones.'
    },

    # --- USAGE PILLAR: Transactions & Adoption ---
    {
        'record_id': 'REC_0018', 'record_type': 'observation', 'pillar': 'USAGE',
        'indicator': 'P2P Transaction Count', 'indicator_code': 'USG_P2P_COUNT',
        'value_numeric': 128300000.0, 'observation_date': '2025-07-07', 'source_name': 'EthSwitch',
        'source_url': 'https://ethswitch.com/', 'confidence': 'high',
        'notes': '128.3 million P2P transactions, +158% YoY growth'
    },
    {
        'record_id': 'REC_0019', 'record_type': 'observation', 'pillar': 'USAGE',
        'indicator': 'P2P Transaction Value', 'indicator_code': 'USG_P2P_VALUE',
        'value_numeric': 577700000000.0, 'observation_date': '2025-07-07', 'source_name': 'EthSwitch',
        'source_url': 'https://ethswitch.com/', 'confidence': 'high',
        'notes': 'ETB 577.7 billion total value'
    },
    {
        'record_id': 'REC_0020', 'record_type': 'observation', 'pillar': 'USAGE',
        'indicator': 'Mobile Money Account Ownership (% age 15+)', 'indicator_code': 'USG_MM_ACC',
        'value_numeric': 19.4, 'observation_date': '2024-12-31', 'source_name': 'IMF FAS 2025',
        'source_url': 'https://data.imf.org/', 'confidence': 'high',
        'notes': 'Quadrupled from 4.7% in 2021. Primary growth engine.'
    },
    {
        'record_id': 'REC_0021', 'record_type': 'observation', 'pillar': 'USAGE',
        'indicator': 'Total Digital Transaction Value (ETB)', 'indicator_code': 'USG_DIG_VAL',
        'value_numeric': 7700000000000.0, 'observation_date': '2024-07-07', 'source_name': 'NBE Annual Report',
        'confidence': 'high', 'notes': 'Reached 7.7 Trillion ETB in FY2023/24.'
    },
    {
        'record_id': 'REC_0022', 'record_type': 'observation', 'pillar': 'USAGE',
        'indicator': 'M-Pesa Active Users (90-day)', 'indicator_code': 'USG_MPESA_ACTIVE',
        'value_numeric': 5000000.0, 'observation_date': '2026-01-21', 'source_name': 'Safaricom Ethiopia',
        'confidence': 'high', 'notes': 'Measuring private sector competition impact.'
    },

    # --- POLICY TARGETS & EVENTS ---
    {
        'record_id': 'REC_0023', 'record_type': 'target', 'pillar': 'ACCESS',
        'indicator': 'Target Account Ownership Rate', 'indicator_code': 'ACC_OWN_TGT',
        'value_numeric': 70.0, 'observation_date': '2027-12-31', 'source_name': 'NFIS-II/III Projections',
        'confidence': 'medium', 'notes': 'Ambitious target set by NBE for 2027.'
    },
    {
        'record_id': 'REC_0024', 'record_type': 'event', 'category': 'policy',
        'event_name': 'National ID (Fayda) Mandate', 'event_date': '2025-01-01',
        'notes': 'Mandating Fayda for financial services to streamline KYC.'
    },
    # Add these to your 'new_data' list in the script
{
    'record_id': 'REC_0025', 'record_type': 'observation', 'pillar': 'USAGE',
    'indicator': 'Active Mobile Money Accounts (90-day)', 'indicator_code': 'USG_MM_ACTIVE',
    'value_numeric': 60.0, 'unit': 'Millions', 'observation_date': '2025-12-31', 
    'source_name': 'NBE / Digital Ethiopia 2030 Strategy', 'confidence': 'high',
    'notes': 'Significant leap from 12M in 2020 to 60M+ by end of 2025.'
},
{
    'record_id': 'REC_0026', 'record_type': 'observation', 'pillar': 'ACCESS',
    'indicator': 'Bank Account to Fayda Linkage Rate', 'indicator_code': 'ACC_FAYDA_BANK_LINK',
    'value_numeric': 100.0, 'unit': 'Percent', 'observation_date': '2026-03-30', 
    'source_name': 'NBE Directive', 'confidence': 'high',
    'notes': 'NBE mandate for all accounts to be linked to Fayda by March 2026 (EC 2018).'
},
# Add these to your 'new_data' list in the script
{
    'record_id': 'REC_0027', 'record_type': 'observation', 'pillar': 'USAGE',
    'indicator': 'Active Mobile Money Accounts (90-day)', 'indicator_code': 'USG_MM_ACTIVE',
    'value_numeric': 60.0, 'unit': 'Millions', 'observation_date': '2025-12-31', 
    'source_name': 'NBE / Digital Ethiopia 2030 Strategy', 'confidence': 'high',
    'notes': 'Significant leap from 12M in 2020 to 60M+ by end of 2025.'
},
{
    'record_id': 'REC_0028', 'record_type': 'observation', 'pillar': 'ACCESS',
    'indicator': 'Bank Account to Fayda Linkage Rate', 'indicator_code': 'ACC_FAYDA_BANK_LINK',
    'value_numeric': 100.0, 'unit': 'Percent', 'observation_date': '2026-03-30', 
    'source_name': 'NBE Directive', 'confidence': 'high',
    'notes': 'NBE mandate for all accounts to be linked to Fayda by March 2026 (EC 2018).'
}
]



In [8]:
import os
import pandas as pd

# 1. Define the full schema as per the starter dataset
original_columns = [
    "record_id", "record_type", "category", "pillar", "indicator", "indicator_code", 
    "indicator_direction", "value_numeric", "value_text", "value_type", "unit", 
    "observation_date", "period_start", "period_end", "fiscal_year", "gender", 
    "location", "region", "source_name", "source_type", "source_url", "confidence", 
    "related_indicator", "relationship_type", "impact_direction", "impact_magnitude", 
    "impact_estimate", "lag_months", "evidence_basis", "comparable_country", 
    "collected_by", "collection_date", "original_text", "notes"
]

# 2. Enrich the data in memory
profiler.enrich_data(new_data)

# 3. Align the dataframe to the full schema 
# This adds all the missing columns (category, fiscal_year, etc.) and fills them with NaN
profiler.df = profiler.df.reindex(columns=original_columns)

# 4. Prepare the directory
output_dir = "../data/processed"
output_file = "ethiopia_fi_unified_data.csv"
full_path = os.path.join(output_dir, output_file)

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# 5. Save the data
# We use index=False because record_id is already a column in your original_columns list
profiler.df.to_csv(full_path, index=False)

print(f"âœ… Data successfully saved with full schema to: {full_path}")
print(f"ðŸ“Š Summary of records by Type/Pillar:\n{profiler.get_summary()}")

--- Enrichment Success ---
Current Columns: ['record_id', 'record_type', 'category', 'pillar', 'indicator', 'indicator_code', 'indicator_direction', 'value_numeric', 'value_text', 'value_type', 'unit', 'observation_date', 'period_start', 'period_end', 'fiscal_year', 'gender', 'location', 'region', 'source_name', 'source_type', 'source_url', 'confidence', 'related_indicator', 'relationship_type', 'impact_direction', 'impact_magnitude', 'impact_estimate', 'lag_months', 'evidence_basis', 'comparable_country', 'collected_by', 'collection_date', 'original_text', 'notes', 'event_name', 'event_date']
Total Rows: 43
âœ… Data successfully saved with full schema to: ../data/processed\ethiopia_fi_unified_data.csv
ðŸ“Š Summary of records by Type/Pillar:
record_type  pillar
observation  ACCESS    19
             GENDER     2
             USAGE      7
target       ACCESS     3
             GENDER     1
dtype: int64
