In [3]:
import pandas as pd
import numpy as np
import random
from datetime import date
from itertools import product

# Set a seed for reproducibility
random.seed(42)
np.random.seed(42)

# 1. Define Realistic Context and Kebele Data

# Confirmed Urban Kebeles for Wolaita Sodo City Administration (7)
URBAN_KEBELES = [
    'Arada', 'Dil Begerera', 'Fana Womba', 'Larena Amba', 'Mehal Amba', 
    'Merkato Yushuwa', 'Wadu Amba'
]

# Plausible Kebeles representing the broader District/Periphery (8)
PERIPHERY_KEBELES = [
    'Sodo-Kore', 'Bale-Wogene', 'Hembecho', 'Shanto', 'Dubbo', 
    'Ocholo', 'Gesuba', 'Humbo-Aba'
]

KEBELE_NAMES = URBAN_KEBELES + PERIPHERY_KEBELES

# Simulate populations with a high bias for urban areas
kebele_populations = {}
kebele_growth_potential = {}

for kebele in KEBELE_NAMES:
    # Urban areas generally have higher density
    is_urban = kebele in URBAN_KEBELES
    
    # Population simulation: Urban ~20,000 | Periphery ~12,000
    mean_pop = 22000 if is_urban else 13000
    population = int(random.gauss(mean_pop, 4000))
    kebele_populations[kebele] = max(5000, population) # Ensure minimum population
    
    # Growth potential simulation: Urban areas often have higher acquisition ceiling
    kebele_growth_potential[kebele] = random.uniform(1.0, 1.4) if is_urban else random.uniform(0.7, 1.1)

# --- Export Kebele Population Dataset ---
population_df = pd.DataFrame(
    list(kebele_populations.items()), 
    columns=['Kebele', 'Population']
)
population_df['Growth_Potential_Factor'] = population_df['Kebele'].map(kebele_growth_potential)
population_df['Type'] = population_df['Kebele'].apply(lambda x: 'Urban' if x in URBAN_KEBELES else 'Periphery')

population_df.to_csv('kebele_population.csv', index=False)
print("Generated kebele_population.csv with urban/periphery bias.")


# --- 2. Generate Time Series Subscriber Data with Trends and Seasonality ---

start_date = date(2023, 1, 1)
end_date = date(2024, 12, 1) # 24 months of data
date_range = pd.date_range(start=start_date, end=end_date, freq='MS')

SEGMENTS = ['Individual User', 'Business', 'Telebirr Adopter']

data = []

# Base Acquisition Rates (per 1000 people per month)
BASE_ACQ_RATE = {
    'Individual User': 1.6, # Urban focus
    'Business': 0.4, 
    'Telebirr Adopter': 0.5 
}

for dt in date_range:
    month = dt.month
    year = dt.year
    
    # 1. Overall Trend: Simulate a general market growth YoY (25% better in 2024)
    year_growth_multiplier = 1.0 if year == 2023 else 1.25 
    
    # 2. Seasonality: Simulate a slight dip around July/August (month 7/8)
    seasonal_factor = 1.0 + 0.2 * np.sin(2 * np.pi * (month - 3) / 12)
    
    for kebele in KEBELE_NAMES:
        population = kebele_populations[kebele]
        kebele_factor = kebele_growth_potential[kebele]
        
        for segment in SEGMENTS:
            
            # --- Segment-Specific Trend: Telebirr Campaign Success in Year 2 ---
            segment_multiplier = 1.0
            if segment == 'Telebirr Adopter':
                if year == 2024:
                    segment_multiplier = 1.8 
                elif year == 2023 and month > 6:
                    segment_multiplier = 1.2
            
            # Calculate mean acquisitions
            acq_mean = (BASE_ACQ_RATE[segment] * (population / 1000)) * \
                       year_growth_multiplier * seasonal_factor * \
                       kebele_factor * segment_multiplier
                       
            # Add random noise
            new_acquisitions = max(0, int(np.random.normal(acq_mean, acq_mean * 0.15)))
            
            # Churn (Higher churn in kebeles with lower inherent growth potential, suggesting issues)
            base_churn_rate = 0.05 
            churn_factor = 1.0 + (1.0 - kebele_factor) * 0.5 
            
            monthly_churn = int(new_acquisitions * (base_churn_rate * churn_factor) + random.randint(0, 5))
            
            data.append({
                'Date': dt.strftime('%Y-%m-%d'),
                'Kebele': kebele,
                'Segment': segment,
                'New_Subscribers': new_acquisitions,
                'Monthly_Churn': monthly_churn,
            })

subscriber_df = pd.DataFrame(data)

# Export Subscriber Data Dataset
subscriber_df.to_csv('monthly_subscriber_data.csv', index=False)
print("Generated monthly_subscriber_data.csv")
print("\n--- Kebele Population Preview ---")
print(population_df.head().to_markdown(index=False))
print("\n--- Subscriber Data Preview ---")
print(subscriber_df.head().to_markdown(index=False))


Generated kebele_population.csv with urban/periphery bias.
Generated monthly_subscriber_data.csv

--- Kebele Population Preview ---
| Kebele       |   Population |   Growth_Potential_Factor | Type   |
|:-------------|-------------:|--------------------------:|:-------|
| Arada        |        21423 |                   1.11001 | Urban  |
| Dil Begerera |        21308 |                   1.08928 | Urban  |
| Fana Womba   |        21489 |                   1.35687 | Urban  |
| Larena Amba  |        16010 |                   1.03478 | Urban  |
| Mehal Amba   |        21132 |                   1.08746 | Urban  |

--- Subscriber Data Preview ---
| Date       | Kebele       | Segment          |   New_Subscribers |   Monthly_Churn |
|:-----------|:-------------|:-----------------|------------------:|----------------:|
| 2023-01-01 | Arada        | Individual User  |                33 |               6 |
| 2023-01-01 | Arada        | Business         |                 7 |               3 |
| 20