# Task 1: Data Exploration & Enrichment
Week 10 Challenge – Forecasting Financial Inclusion in Ethiopia  
Dororo – January 31, 2026

In [1]:
# Cell 1: Imports & Paths
import os
from datetime import datetime, timedelta, timezone

import pandas as pd

# Project layout is resolved relative to the current working directory:
# the project root is taken to be the parent of wherever the kernel was started.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
RAW_DIR, PROCESSED_DIR = (
    os.path.join(PROJECT_ROOT, 'data', _stage) for _stage in ('raw', 'processed')
)

# The processed folder is an output location, so create it if it is not there yet.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Echo the resolved locations so a wrong working directory is obvious immediately.
for _label, _location in (
    ("Project root:", PROJECT_ROOT),
    ("Raw data:", RAW_DIR),
    ("Processed save to:", PROCESSED_DIR),
):
    print(_label, _location)

Project root: c:\Users\JERUSALEM\ethiopia-financial-inclusion-forecast
Raw data: c:\Users\JERUSALEM\ethiopia-financial-inclusion-forecast\data\raw
Processed save to: c:\Users\JERUSALEM\ethiopia-financial-inclusion-forecast\data\processed


In [2]:
# Cell 2: Convert Excel to CSV if needed (run once)
# A workbook is exported only when its .xlsx is present and the matching .csv
# is not, so re-running this cell never overwrites an existing CSV.
unified_xlsx = os.path.join(RAW_DIR, 'ethiopia_fi_unified_data.xlsx')
unified_csv = os.path.join(RAW_DIR, 'ethiopia_fi_unified_data.csv')

ref_xlsx = os.path.join(RAW_DIR, 'reference_codes.xlsx')
ref_csv = os.path.join(RAW_DIR, 'reference_codes.csv')

# (source workbook, target csv, sheet to export, message printed after export)
_conversions = (
    (unified_xlsx, unified_csv, 'ethiopia_fi_unified_data', "Converted unified .xlsx → .csv"),
    (ref_xlsx, ref_csv, 'reference_codes', "Converted reference .xlsx → .csv"),
)

for _xlsx_path, _csv_path, _sheet, _done_msg in _conversions:
    # Nothing to do when there is no workbook, or the CSV was already produced.
    if not os.path.exists(_xlsx_path) or os.path.exists(_csv_path):
        continue
    _sheet_frame = pd.read_excel(_xlsx_path, sheet_name=_sheet)
    _sheet_frame.to_csv(_csv_path, index=False, encoding='utf-8')
    print(_done_msg)

In [3]:
# Cell 3: Load data
# Reads the unified dataset into `df` and the reference codes into `ref`.
# A missing file is reported with a hint and then re-raised: every later cell
# needs `df`, so continuing would either fail with an unrelated NameError or
# silently reuse a stale `df` left in the kernel by an earlier run.
try:
    df = pd.read_csv(unified_csv, low_memory=False)
    ref = pd.read_csv(ref_csv)
except FileNotFoundError as e:
    print("File missing:", e)
    print("Please ensure .csv files are in data/raw/ (convert from .xlsx if needed)")
    raise
else:
    # Only reached when both files were read successfully.
    print("Success! Unified data shape:", df.shape)
    print("\nRecord types distribution:")
    print(df['record_type'].value_counts(dropna=False))

Success! Unified data shape: (43, 34)

Record types distribution:
record_type
observation    30
event          10
target          3
Name: count, dtype: int64


In [4]:
# Cell 4: Basic exploration
print("\nColumns:", df.columns.tolist())

# Keep only observation records and parse their dates; values that cannot be
# parsed become NaT instead of raising (errors='coerce').
obs = df.loc[df['record_type'].eq('observation')].assign(
    observation_date=lambda frame: pd.to_datetime(frame['observation_date'], errors='coerce')
)

obs_dates = obs['observation_date']
print("\nDate range (observations):", obs_dates.min().date(), "to", obs_dates.max().date())
print("Unique indicators:", obs['indicator_code'].unique())

# Account ownership over time: sorted by date, rows without a numeric value dropped
acc_own = obs.loc[obs['indicator_code'] == 'ACC_OWNERSHIP']
print("\nAccount Ownership Trajectory:")
trajectory_cols = ['observation_date', 'value_numeric', 'gender', 'location', 'source_name']
acc_own_by_date = acc_own[trajectory_cols].sort_values('observation_date')
display(acc_own_by_date.dropna(subset=['value_numeric']))

# Mobile money account rate, shown in stored row order
mm_acc = obs.loc[obs['indicator_code'] == 'ACC_MM_ACCOUNT']
print("\nMobile Money Account Rate:")
display(mm_acc[['observation_date', 'value_numeric']])


Columns: ['record_id', 'record_type', 'category', 'pillar', 'indicator', 'indicator_code', 'indicator_direction', 'value_numeric', 'value_text', 'value_type', 'unit', 'observation_date', 'period_start', 'period_end', 'fiscal_year', 'gender', 'location', 'region', 'source_name', 'source_type', 'source_url', 'confidence', 'related_indicator', 'relationship_type', 'impact_direction', 'impact_magnitude', 'impact_estimate', 'lag_months', 'evidence_basis', 'comparable_country', 'collected_by', 'collection_date', 'original_text', 'notes']

Date range (observations): 2014-12-31 to 2025-12-31
Unique indicators: ['ACC_OWNERSHIP' 'ACC_MM_ACCOUNT' 'ACC_4G_COV' 'ACC_MOBILE_PEN'
 'ACC_FAYDA' 'USG_P2P_COUNT' 'USG_P2P_VALUE' 'USG_ATM_COUNT'
 'USG_ATM_VALUE' 'USG_CROSSOVER' 'USG_TELEBIRR_USERS' 'USG_TELEBIRR_VALUE'
 'USG_MPESA_USERS' 'USG_MPESA_ACTIVE' 'USG_ACTIVE_RATE' 'AFF_DATA_INCOME'
 'GEN_GAP_ACC' 'GEN_MM_SHARE' 'GEN_GAP_MOBILE']

Account Ownership Trajectory:


Unnamed: 0,observation_date,value_numeric,gender,location,source_name
0,2014-12-31,22.0,all,national,Global Findex 2014
1,2017-12-31,35.0,all,national,Global Findex 2017
2,2021-12-31,46.0,all,national,Global Findex 2021
3,2021-12-31,56.0,male,national,Global Findex 2021
4,2021-12-31,36.0,female,national,Global Findex 2021
5,2024-11-29,49.0,all,national,Global Findex 2024



Mobile Money Account Rate:


Unnamed: 0,observation_date,value_numeric
6,2021-12-31,4.7
7,2024-11-29,9.45


In [5]:
# Cell 5: Enrichment - New Observations (2025 data)
# Keys omitted from a record (e.g. gender/location on some rows) become NaN
# in the combined frame.
new_observations = [
    # Findex 2025 confirmation
    # NOTE(review): the starter data already holds a 49.0 ACC_OWNERSHIP row dated
    # 2024-11-29 (Global Findex 2024) -- confirm this record is not a duplicate of it.
    {'record_id': 'REC_ENR_001', 'record_type': 'observation', 'pillar': 'ACCESS', 'indicator': 'Account Ownership Rate', 'indicator_code': 'ACC_OWNERSHIP',
     'value_numeric': 49.0, 'observation_date': '2024-12-31', 'gender': 'all', 'location': 'national',
     'source_name': 'Global Findex 2025', 'source_type': 'survey', 'source_url': 'https://www.worldbank.org/en/publication/globalfindex',
     'confidence': 'high', 'notes': 'Confirmed 49% in 2024 (slow +3pp from 2021)'},

    # NOTE(review): existing male/female rows use indicator_code 'ACC_OWNERSHIP' with the
    # `gender` column (see the Cell 4 output); the two records below introduce separate
    # codes instead -- confirm which convention downstream filters expect.
    {'record_id': 'REC_ENR_002', 'record_type': 'observation', 'pillar': 'ACCESS', 'indicator': 'Account Ownership (Men)', 'indicator_code': 'ACC_OWN_MALE',
     'value_numeric': 57.0, 'observation_date': '2024-12-31', 'gender': 'male', 'location': 'national',
     'source_name': 'Global Findex 2025', 'confidence': 'high'},

    {'record_id': 'REC_ENR_003', 'record_type': 'observation', 'pillar': 'ACCESS', 'indicator': 'Account Ownership (Women)', 'indicator_code': 'ACC_OWN_FEMALE',
     'value_numeric': 42.0, 'observation_date': '2024-12-31', 'gender': 'female', 'location': 'national',
     'source_name': 'Global Findex 2025', 'confidence': 'high', 'notes': '~15pp gender gap'},

    # Mobile money & usage
    {'record_id': 'REC_ENR_004', 'record_type': 'observation', 'pillar': 'ACCESS', 'indicator': 'Mobile Money Account Rate', 'indicator_code': 'ACC_MM_ACCOUNT',
     'value_numeric': 19.4, 'observation_date': '2025-01-01', 'source_name': 'Findex 2025 / NBE', 'confidence': 'medium', 'notes': 'Growth from 9.45%'},

    {'record_id': 'REC_ENR_005', 'record_type': 'observation', 'pillar': 'USAGE', 'indicator': 'Digital Payment Adoption Rate', 'indicator_code': 'USG_DIG_PAY',
     'value_numeric': 21.0, 'observation_date': '2024-12-31', 'source_name': 'Global Findex 2025', 'confidence': 'high', 'notes': 'Made/received digital payment'},

    # Enablers
    {'record_id': 'REC_ENR_006', 'record_type': 'observation', 'pillar': 'ACCESS', 'indicator': 'Mobile Connections Penetration', 'indicator_code': 'ACC_MOBILE_PEN',
     'value_numeric': 68.4, 'observation_date': '2025-12-31', 'source_name': 'DataReportal Digital 2026', 'confidence': 'high', 'notes': '93.2M connections'},

    {'record_id': 'REC_ENR_007', 'record_type': 'observation', 'pillar': 'ACCESS', 'indicator': 'Internet Penetration', 'indicator_code': 'ACC_INTERNET_PEN',
     'value_numeric': 21.7, 'observation_date': '2025-12-31', 'source_name': 'DataReportal Digital 2026', 'confidence': 'high'},
]

new_observations_df = pd.DataFrame(new_observations)

# Upsert on record_id: drop any copies left by an earlier run of this cell so that
# re-running it replaces the records instead of appending duplicates.
df = df[~df['record_id'].isin(new_observations_df['record_id'])]
df = pd.concat([df, new_observations_df], ignore_index=True)
print(f"Added {len(new_observations_df)} new observations. Total rows now:", df.shape[0])

Added 7 new observations. Total rows now: 50


In [6]:
# Cell 6: New Events (2025 real launches)
# Event rows carry no value_numeric or indicator_code; the event name is stored
# in `indicator` and the event date in `observation_date`.
new_events = [
    {'record_id': 'EVT_ENR_001', 'record_type': 'event', 'category': 'policy', 'indicator': 'National Digital Payments Strategy (NDPS) 2026-2030 Launch',
     'observation_date': '2025-12-08', 'source_name': 'National Bank of Ethiopia', 'source_url': 'https://nbe.gov.et/wp-content/uploads/2025/12/Ethiopia_NDPS_Draft_F.pdf',
     'confidence': 'high', 'notes': 'Launched Dec 8, 2025; targets 60% active accounts, narrow gender gap <3pp, boost digital tx'},

    {'record_id': 'EVT_ENR_002', 'record_type': 'event', 'category': 'infrastructure', 'indicator': 'Instant Payment System (IPS / Ethiopay) Launch',
     'observation_date': '2025-12-09', 'source_name': 'NBE / EthSwitch', 'confidence': 'high', 'notes': 'Real-time payments launched Dec 9, 2025'}
]

new_events_df = pd.DataFrame(new_events)

# Upsert on record_id: drop any copies left by an earlier run of this cell so that
# re-running it replaces the events instead of appending duplicates.
df = df[~df['record_id'].isin(new_events_df['record_id'])]
df = pd.concat([df, new_events_df], ignore_index=True)
print(f"Added {len(new_events_df)} new events.")

Added 2 new events.


In [7]:
# Cell 7: New Impact Links (pillar-specific, neutral events)
# Each link points from an event (`parent_id`) to the indicator it is expected to
# move (`related_indicator`). `parent_id` is not among the starter columns printed
# in Cell 4, so concatenating these rows adds it as a new column.
new_links = [
    {'record_id': 'IMP_ENR_001', 'parent_id': 'EVT_ENR_001', 'record_type': 'impact_link', 'pillar': 'USAGE',
     'related_indicator': 'USG_DIG_PAY', 'impact_direction': 'increase', 'impact_magnitude': 'high', 'lag_months': 12,
     'evidence_basis': 'policy', 'notes': 'NDPS targets high usage growth (active accounts, tx volume)'},

    {'record_id': 'IMP_ENR_002', 'parent_id': 'EVT_ENR_001', 'record_type': 'impact_link', 'pillar': 'ACCESS',
     'related_indicator': 'ACC_OWNERSHIP', 'impact_direction': 'increase', 'impact_magnitude': 'medium', 'lag_months': 24,
     'notes': 'Supports overall inclusion via interoperability & trust'},

    {'record_id': 'IMP_ENR_003', 'parent_id': 'EVT_ENR_002', 'record_type': 'impact_link', 'pillar': 'USAGE',
     'related_indicator': 'USG_P2P_COUNT', 'impact_direction': 'increase', 'impact_magnitude': 'medium', 'lag_months': 6,
     'notes': 'Real-time IPS boosts P2P and digital transaction volume'}
]

new_links_df = pd.DataFrame(new_links)

# Upsert on record_id: drop any copies left by an earlier run of this cell so that
# re-running it replaces the links instead of appending duplicates.
df = df[~df['record_id'].isin(new_links_df['record_id'])]
df = pd.concat([df, new_links_df], ignore_index=True)
print(f"Added {len(new_links_df)} new impact links.")

Added 3 new impact links.


In [8]:
# Cell 8: Save enriched data & update log
enriched_path = os.path.join(PROCESSED_DIR, 'enriched_fi_unified_data.csv')
# Encoding is spelled out to match the CSVs written in Cell 2.
df.to_csv(enriched_path, index=False, encoding='utf-8')
print("Enriched dataset saved to:", enriched_path)

# The log heading is labelled "EAT", so take the time in a fixed UTC+3 zone
# rather than the machine's local time, which is only EAT if the machine is set so.
EAT = timezone(timedelta(hours=3), 'EAT')
log_timestamp = datetime.now(EAT).strftime('%Y-%m-%d %H:%M EAT')

# Append mode: each run adds a new dated section and keeps earlier entries.
log_path = os.path.join(PROJECT_ROOT, 'data_enrichment_log.md')
with open(log_path, 'a', encoding='utf-8') as f:
    f.write(f"\n\n### Enrichment - {log_timestamp}\n")
    f.write("- Loaded starter data from CSV (converted from .xlsx)\n")
    f.write("- Explored schema, record types, time coverage, ACC_OWNERSHIP trajectory\n")
    f.write("- Added 7 new observations (Findex 2025: 49% access, 21% digital pay, gender gaps; mobile 68.4%, internet 21.7%)\n")
    f.write("- Added 2 new events: NDPS 2026-2030 (Dec 8) & IPS/Ethiopay (Dec 9)\n")
    f.write("- Added 3 new impact_links (high/medium impact on usage/access)\n")
    f.write("Sources: Global Findex 2025, NBE NDPS PDF, DataReportal Digital 2026\n")
    f.write("Prepared by: Dororo\n")

print("Log updated at:", log_path)

Enriched dataset saved to: c:\Users\JERUSALEM\ethiopia-financial-inclusion-forecast\data\processed\enriched_fi_unified_data.csv
Log updated at: c:\Users\JERUSALEM\ethiopia-financial-inclusion-forecast\data_enrichment_log.md
