#### 1: Imports and Setup

In [8]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta

# Reproducibility: seed NumPy's global RNG, the stdlib `random` module and
# Faker with one shared value, in that order.
SEED = 42
for seed_source in (np.random.seed, random.seed, Faker.seed):
    seed_source(SEED)

# Faker instance (en_US locale) used later for dates and UUIDs.
fake = Faker('en_US')

print("Setup complete – Ready to generate realistic Ethiopian MSME loan data")

Setup complete – Ready to generate realistic Ethiopian MSME loan data


#### 2: Authentic Ethiopian Names, Regions, and Sectors

In [9]:
# Male given names. Drawn both as a borrower's personal name and as the
# father's name. Element order is significant: seeded random.choice picks by
# index, so reordering would change the generated dataset.
male_names = [
    'Dawit', 'Abebe', 'Yohannes', 'Tesfaye', 'Getachew', 'Solomon', 'Berhanu',
    'Mulugeta', 'Tadesse', 'Kebede', 'Habtamu', 'Eyob', 'Natnael', 'Eskinder',
    'Hagos', 'Fikre', 'Abera', 'Zewdu', 'Gemechu', 'Kaleb', 'Dejen',
    'Mikael', 'Samuel', 'Yonas', 'Ephrem', 'Hakim', 'Jemal', 'Ermias',
    'Addis', 'Adane', 'Mesfin', 'Birhanu', 'Tsegaye', 'Assefa', 'Worku',
    'Bekele', 'Girma', 'Alemu', 'Haile', 'Yosef', 'Bereket', 'Elias',
    'Lencho', 'Tariku', 'Sisay', 'Fitsum', 'Tewodros', 'Amanuel', 'Robel',
]

# Female given names (personal names only). Order is significant, as above.
female_names = [
    'Selamawit', 'Sara', 'Ruth', 'Helen', 'Feven', 'Eden', 'Mignot',
    'Dagmawit', 'Nardos', 'Tigist', 'Aster', 'Genet', 'Mahlet', 'Rahel',
    'Liya', 'Blen', 'Yordanos', 'Meron', 'Bezawit', 'Kidist', 'Frehiwot',
    'Emebet', 'Zewditu', 'Abeba', 'Hirut', 'Almaz', 'Marta', 'Tsion',
    'Betelhem', 'Samrawit', 'Fikerte', 'Roman', 'Leulit', 'Bruktait', 'Tirhas',
    'Senait', 'Worknesh', 'Askale', 'Lemlem', 'Hiwot', 'Aberash', 'Konjit',
    'Meklit', 'Selome', 'Tsehay', 'Meskerem',
]

# Regions a loan can be assigned to (list described by the author as current
# as of 2026).
ethiopian_regions = [
    'Addis Ababa',
    'Dire Dawa',
    'Afar',
    'Amhara',
    'Benishangul-Gumuz',
    'Central Ethiopia',
    'Gambela',
    'Harari',
    'Oromia',
    'Sidama',
    'Somali',
    'South Ethiopia',
    'South West Ethiopia',
    'Tigray',
]

# MSME business sectors a loan can be assigned to.
ethiopian_sectors = [
    'Agriculture (Smallholder Farming)',
    'Trade/Retail (Market Vendors/Suq)',
    'Services (Transport/Tourism)',
    'Manufacturing (Textiles/Food Processing)',
    'Digital/Tech (Mobile Money Agents)',
    'Construction',
    'Livestock',
]

# Report the size of each lookup list as a quick sanity check.
print("Prepared {} male, {} female names".format(len(male_names), len(female_names)))
print("   {} regions, {} sectors".format(len(ethiopian_regions), len(ethiopian_sectors)))

Prepared 49 male, 46 female names
   14 regions, 7 sectors


#### 3: Generate 1000 Realistic Loans with Risk-Driven Defaults



In [16]:
# Builds `data`, a list of one dict per synthetic loan. Each record gets a
# borrower identity, basic loan terms, three risk features whose sampling
# ranges depend on the borrower's segment, and a default flag drawn from a
# logistic function of a composite risk score.
#
# The order of random draws is deliberate: with the seeds set earlier,
# reordering any `random.*` / `np.random.*` call changes the generated data.
data = []
n_records = 1000

# Regions treated as rural by the risk model. Hoisted out of the loop because
# the set never changes between iterations.
RURAL_REGIONS = frozenset({
    'South West Ethiopia', 'South Ethiopia', 'Afar', 'Somali', 'Gambela', 'Benishangul-Gumuz'
})

# Sampling parameters per borrower segment:
#   (income_variability low, high), (mobile_transactions low, high), credit-score shift
RISK_PROFILES = {
    'agriculture': ((0.30, 0.50), (40, 180), -90),
    'rural':       ((0.22, 0.38), (80, 250), -50),
    'standard':    ((0.10, 0.28), (180, 500), 20),
}

# Message is derived from n_records so it stays correct if the count changes.
print(f"Generating {n_records} realistic Ethiopian MSME loans with ~10–12% default rate...")

for i in range(n_records):
    # --- Borrower Identity ---
    gender = random.choice(['male', 'female'])
    personal_name = random.choice(male_names if gender == 'male' else female_names)
    fathers_name = random.choice(male_names)  # Patronymic: always male name
    borrower_name = f"{personal_name} {fathers_name}"

    # --- Basic Loan Details ---
    # NOTE(review): '-3y' and 'today' are relative to the run date, so loan
    # dates presumably shift between runs on different days even with fixed
    # seeds - pin an explicit date range if exact reproducibility is needed.
    loan_date = fake.date_between(start_date='-3y', end_date='today')
    region = random.choice(ethiopian_regions)
    sector = random.choice(ethiopian_sectors)
    loan_amount_etb = round(np.random.uniform(10000, 400000), 2)
    interest_rate = round(np.random.uniform(12, 28), 2)
    term_months = random.choice([6, 12, 18, 24, 36])

    # --- Risk Flags ---
    is_agriculture = 'Agriculture' in sector
    is_rural = region in RURAL_REGIONS

    # --- Feature Engineering: Risk-Aware Distributions ---
    # Agriculture takes precedence over rural when both flags are set; both
    # flags still contribute their own additive terms to the risk score below.
    if is_agriculture:
        segment = 'agriculture'
    elif is_rural:
        segment = 'rural'
    else:
        segment = 'standard'
    (iv_low, iv_high), (tx_low, tx_high), score_shift = RISK_PROFILES[segment]

    income_variability = round(np.random.uniform(iv_low, iv_high), 3)
    mobile_transactions = random.randint(tx_low, tx_high)
    # Normal(650, 100) shifted per segment, then clamped to the 300-850 range.
    credit_score = int(max(300, min(850, np.random.normal(650, 100) + score_shift)))

    # --- Composite Risk Score ---
    # Higher income variability, fewer mobile transactions, lower credit score
    # and larger loans all push the score up; the flags add fixed penalties.
    risk_score = (
        income_variability * 10.0 +
        (500 - mobile_transactions) / 500 * 5.0 +
        (850 - credit_score) / 550 * 4.0 +
        (loan_amount_etb / 400000) * 1.5 +
        (2.0 if is_agriculture else 0) +
        (0.8 if is_rural else 0)
    )

    # --- Logistic Probability (target: ~10–12% default rate) ---
    # The logistic is centered at 12.2 with the aim of a 10–12% default rate.
    # NOTE(review): the output saved with this notebook reports 14.6%, above
    # that target - confirm the calibration before relying on it.
    default_prob = 1 / (1 + np.exp(-(risk_score - 12.2)))

    # Add very small natural noise for realism, keeping the value in [0, 1]
    default_prob = np.clip(default_prob + np.random.normal(0, 0.01), 0, 1)

    default = 1 if random.random() < default_prob else 0

    # --- Append Record ---
    data.append({
        'loan_id': str(fake.uuid4()),
        'borrower_name': borrower_name,
        'region': region,
        'sector': sector,
        'loan_date': loan_date,
        'loan_amount_etb': loan_amount_etb,
        'interest_rate': interest_rate,
        'term_months': term_months,
        'mobile_transactions': mobile_transactions,
        'income_variability': income_variability,
        'credit_score': credit_score,
        'default': default
    })

# --- Create DataFrame and Save ---
# One row per generated loan record; written to CSV without the index column.
df = pd.DataFrame(data)
df.to_csv('ethiopian_msme_loans_realistic.csv', index=False)

# --- Summary ---
total_defaults = df['default'].sum()
default_rate = df['default'].mean() * 100

# The generation step advertises a ~10–12% default rate. Report whether the
# achieved rate actually falls in that band instead of labelling every result
# as ideal.
target_low_pct, target_high_pct = 10.0, 12.0
if target_low_pct <= default_rate <= target_high_pct:
    rate_note = "within the 10–12% target"
else:
    rate_note = "outside the 10–12% target – review the logistic calibration"

print("Generation complete!")
print(f"   Total loans: {len(df):,}")
print(f"   Total defaults: {total_defaults}")
print(f"   Overall default rate: {default_rate:.1f}% ({rate_note})")
print(f"   Agriculture loans: {df['sector'].str.contains('Agriculture').sum()}")

# Quick preview
display(df.head(10))

Generating 1000 realistic Ethiopian MSME loans with ~10–12% default rate...
Generation complete!
   Total loans: 1,000
   Total defaults: 146
   Overall default rate: 14.6% (Ideal for digital MSME lending)
   Agriculture loans: 154


Unnamed: 0,loan_id,borrower_name,region,sector,loan_date,loan_amount_etb,interest_rate,term_months,mobile_transactions,income_variability,credit_score,default
0,ee892190-394b-4d35-9f37-7ff855327432,Habtamu Samuel,Sidama,Trade/Retail (Market Vendors/Suq),2025-12-15,383074.22,18.26,6,394,0.176,718,0
1,108e04ac-99c6-407f-ac81-4510c42eafc8,Zewditu Worku,Dire Dawa,Trade/Retail (Market Vendors/Suq),2025-06-18,390097.59,23.02,36,248,0.261,776,0
2,b7aa53b6-e900-466f-a083-bf8857dda51e,Worknesh Tewodros,Afar,Trade/Retail (Market Vendors/Suq),2025-10-02,123260.26,16.86,36,153,0.345,654,0
3,8fd8acb4-c8fb-4050-a460-cf44a7674726,Marta Eskinder,Addis Ababa,Trade/Retail (Market Vendors/Suq),2024-08-20,107247.4,25.07,36,342,0.12,706,0
4,4253fb27-393a-40a9-bbbd-f80ccef66a14,Liya Ermias,Amhara,Manufacturing (Textiles/Food Processing),2024-11-14,82894.72,20.67,12,389,0.151,556,0
5,16d1d768-1c83-40ba-9672-380e4a7d6fa1,Meklit Robel,Afar,Agriculture (Smallholder Farming),2023-07-04,34694.51,27.78,24,136,0.306,443,1
6,302b0401-efed-498b-979a-7368bd96bb0a,Hiwot Tsegaye,Addis Ababa,Construction,2024-06-17,261944.01,17.7,36,353,0.221,761,0
7,da73dae0-209e-4e48-9e24-038e92a593c8,Tewodros Berhanu,Amhara,Construction,2023-07-05,54445.17,22.51,36,200,0.15,662,0
8,bca6a6b0-93b0-427e-a154-7769f43dd740,Fitsum Habtamu,Dire Dawa,Livestock,2024-05-06,179081.81,19.47,6,270,0.227,559,0
9,891d47fe-970e-487a-8fd7-b0f24f908490,Eyob Getachew,Central Ethiopia,Construction,2023-01-08,33615.38,22.99,18,441,0.202,802,0


#### 4: Quick Validation of Correlations

In [17]:
# Sanity-check that the generated risk features differ between performing
# (default == 0) and defaulted (default == 1) loans, and show the default
# rate per sector.
print("Risk Factor Comparison: Defaulted vs Performing Loans\n")

# Mean of each risk feature per default class, rounded to 2 decimals.
comparison = df.groupby('default').agg({
    'income_variability': 'mean',
    'mobile_transactions': 'mean',
    'credit_score': 'mean',
    'loan_amount_etb': 'mean'
}).round(2)

# value_counts() is indexed by the default flag, so it aligns with the
# grouped frame's index on assignment.
comparison['count'] = df['default'].value_counts()
display(comparison)

print("\nDefault Rate by Sector:")
# Sort while 'mean' is still numeric. Sorting after converting to percent
# strings would order lexicographically (e.g. "5.8%" above "10.0%").
sector_defaults = (
    df.groupby('sector')['default']
    .agg(['mean', 'count'])
    .round(3)
    .sort_values('mean', ascending=False)
)
# Format for display only, after the ordering is fixed.
sector_defaults['mean'] = (sector_defaults['mean'] * 100).round(1).astype(str) + '%'
display(sector_defaults)

Risk Factor Comparison: Defaulted vs Performing Loans



Unnamed: 0_level_0,income_variability,mobile_transactions,credit_score,loan_amount_etb,count
default,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.24,261.38,641.59,202602.3,854
1,0.38,121.9,554.09,231036.93,146



Default Rate by Sector:


Unnamed: 0_level_0,mean,count
sector,Unnamed: 1_level_1,Unnamed: 2_level_1
Agriculture (Smallholder Farming),67.5%,154
Construction,5.8%,139
Digital/Tech (Mobile Money Agents),5.6%,144
Manufacturing (Textiles/Food Processing),5.1%,137
Services (Transport/Tourism),4.8%,125
Trade/Retail (Market Vendors/Suq),4.6%,152
Livestock,4.0%,149
