In [1]:
import sys
import pandas as pd
sys.path.append('../')
from src.data_loader import load_and_unify_data, add_enriched_record

In [2]:
file_path = '../data/raw/ethiopia_fi_unified_data.xlsx'
df = load_and_unify_data(file_path)

In [3]:
df.head()

Unnamed: 0,record_id,record_type,category,pillar,indicator,indicator_code,indicator_direction,value_numeric,value_text,value_type,...,impact_direction,impact_magnitude,impact_estimate,lag_months,evidence_basis,comparable_country,collected_by,collection_date,original_text,notes
0,REC_0001,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,22.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,Baseline year,
1,REC_0002,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,35.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,,
2,REC_0003,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,46.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,,
3,REC_0004,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,56.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,Gender disaggregated,
4,REC_0005,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,36.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,Gender disaggregated,


In [4]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 34 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   record_id            43 non-null     str           
 1   record_type          43 non-null     str           
 2   category             10 non-null     str           
 3   pillar               33 non-null     str           
 4   indicator            43 non-null     str           
 5   indicator_code       43 non-null     str           
 6   indicator_direction  33 non-null     str           
 7   value_numeric        33 non-null     float64       
 8   value_text           10 non-null     str           
 9   value_type           43 non-null     str           
 10  unit                 33 non-null     str           
 11  observation_date     43 non-null     datetime64[us]
 12  period_start         10 non-null     datetime64[us]
 13  period_end           10 non-null     datetime64[

In [5]:
print("1. Counts by Type/Pillar/Confidence:")
print(df.groupby(['record_type', 'pillar', 'confidence']).size())

print("\n2. Temporal Range:")
print(f"Start: {df['observation_date'].min()} | End: {df['observation_date'].max()}")

print("\n3. Unique Indicators Coverage:")
print(df[df['record_type']=='observation']['indicator_code'].value_counts())

1. Counts by Type/Pillar/Confidence:
record_type  pillar         confidence
observation  ACCESS         high          14
             AFFORDABILITY  medium         1
             GENDER         high           3
                            medium         1
             USAGE          high          11
target       ACCESS         high           2
             GENDER         medium         1
dtype: int64

2. Temporal Range:
Start: 2014-12-31 00:00:00 | End: 2030-12-31 00:00:00

3. Unique Indicators Coverage:
indicator_code
ACC_OWNERSHIP         6
ACC_FAYDA             3
ACC_MM_ACCOUNT        2
ACC_4G_COV            2
USG_P2P_COUNT         2
GEN_GAP_ACC           2
ACC_MOBILE_PEN        1
USG_P2P_VALUE         1
USG_ATM_COUNT         1
USG_ATM_VALUE         1
USG_CROSSOVER         1
USG_TELEBIRR_USERS    1
USG_TELEBIRR_VALUE    1
USG_MPESA_USERS       1
USG_MPESA_ACTIVE      1
USG_ACTIVE_RATE       1
AFF_DATA_INCOME       1
GEN_MM_SHARE          1
GEN_GAP_MOBILE        1
Name: count, dtype:

In [6]:
# --- RECORD 1: Updated 2024 Account Ownership ---
findex_2024 = {
    'record_type': 'observation',
    'pillar': 'Access',
    'indicator': 'Account ownership (% of adults)',
    'indicator_code': 'ACC_OWN_TOT',
    'value_numeric': 49, 
    'observation_date': '2024-12-31',
    'source_name': 'World Bank Global Findex 2025',
    'source_url': 'https://microdata.worldbank.org/catalog/7901',
    'original_text': 'Financial account ownership in Ethiopia increased to 49% in 2024.',
    'confidence': 'high',
    'notes': 'Latest benchmark for 2025/2026 forecasting.'
}
df = add_enriched_record(df, findex_2024)

In [7]:

# --- RECORD 2: Banking Liberalization Event ---
policy_event = {
    'record_type': 'event',
    'category': 'policy',
    'pillar': '', # Events must have empty pillar
    'indicator': 'Banking Business Proclamation No. 1360/2024',
    'observation_date': '2024-12-17',
    'source_url': 'https://nbe.gov.et/files/banking-business-proclamation-2/',
    'original_text': 'Approved Dec 2024, allowing foreign banks to enter Ethiopia.',
    'confidence': 'high',
    'notes': 'Major shift expected to boost Access and Quality pillars in 2026.'
}
df = add_enriched_record(df, policy_event)

In [8]:
# --- RECORD 3: Impact Link ---
link_data = {
    'record_type': 'impact_link',
    'parent_id': 'Banking_Proclamation_2024',
    'pillar': 'Access',
    'related_indicator': 'Commercial Bank Branches',
    'impact_direction': 'positive',
    'impact_magnitude': 'high',
    'lag_months': 12,
    'evidence_basis': 'Liberalization historically increases branch competition and tech investment.',
    'source_url': 'NBE Proclamation Analysis',
    'original_text': 'N/A',
    'confidence': 'medium',
    'notes': 'Linking the 2024 policy to future branch growth.'
}
df = add_enriched_record(df, link_data)

In [None]:
df.to_csv('../data/processed/ethiopia_fi_enriched.csv', index=False)
print("Task 1 officially complete with REAL data!")

Task 1 officially complete with REAL data!
