In [3]:
import pandas as pd
from faker import Faker
import random
import numpy as np
import uuid

# Initialize Faker
fake = Faker()

# Seed BOTH random sources for reproducibility. The record generator draws
# from the stdlib `random` module (choice/randint/random) as well as from
# NumPy (normal/poisson), so seeding NumPy alone leaves most fields changing
# from run to run.
np.random.seed(42)
random.seed(42)

# --- Configuration ---
NUM_CUSTOMERS = 15000
CSV_FILE = "ethio_telecom_high_precision_churn_data.csv"

# Ethio Telecom specific data categories:
SERVICE_LEVELS = ['Basic Mobile Voice', 'Data/Internet + Voice', 'Premium Bundle (Data, telebirr, VAS)']
CONTRACT_TYPES = ['Month-to-month', '6-Month', '12-Month', '24-Month']
NETWORK_TECHNOLOGIES_ALL = ['4G/LTE', '3G', '2G Only']
NETWORK_TECHNOLOGIES_URBAN = ['4G/LTE', '5G']

# Churn probability used whenever no archetype-specific rule applies.
_BASE_CHURN_RATE = 0.10


# --- Helpers for the record generator ---
def _pick_network_tech(region):
    """Return a network technology drawn from the pool matching *region*."""
    if 'Rural' in region:
        # Rural areas lean toward older tech
        return random.choice(['2G Only', '3G'])
    if 'Addis Ababa' in region:
        # High density areas have better tech
        return random.choice(NETWORK_TECHNOLOGIES_URBAN)
    # Mid density areas have a mix
    return random.choice(NETWORK_TECHNOLOGIES_ALL)


def _draw_monthly_charge(service_plan):
    """Return a monthly charge in ETB (2 decimals) for *service_plan*."""
    base_charge = np.random.normal(500, 150)  # Base in ETB
    # The normal distribution has an unbounded left tail, so a small share of
    # draws is negative; redraw instead of emitting a negative bill.
    while base_charge <= 0:
        base_charge = np.random.normal(500, 150)

    # Scale the base charge by service plan only.
    if 'Premium' in service_plan:
        multiplier = 1.5
    elif 'Data/Internet' in service_plan:
        multiplier = 1.1
    else:  # Basic Mobile Voice
        multiplier = 0.8

    # Convert to a native float after rounding so the record holds plain
    # Python types.
    return float(round(base_charge * multiplier, 2))


def _churn_probability(contract, tenure, monthly_charges, support_calls,
                       outage_score, network_tech):
    """Return the churn probability of the first archetype the customer matches."""
    # Archetype 1: High-Risk (The "Dissatisfied Newcomer")
    if contract == 'Month-to-month' and tenure <= 6:
        # New customer + high cost + frequent outages -> 90%
        if monthly_charges > 700 and outage_score >= 3:
            return 0.90
        # New customer + high cost + many support calls -> 80%
        if monthly_charges > 700 and support_calls >= 4:
            return 0.80
        # Newcomers who hit neither trigger previously could never churn
        # (0%), which made this "high-risk" segment the safest one; they now
        # get the base rate.
        return _BASE_CHURN_RATE

    # Archetype 2: Mid-Risk (The "Network Frustrated"): old tech + severe outages
    if network_tech == '2G Only' and outage_score >= 4:
        # Churn more likely if they've endured it for a while
        return 0.65 if tenure > 12 else 0.35

    # Archetype 3: Low-Risk (The "Loyal Subscriber")
    if contract in ('12-Month', '24-Month') and tenure > 36:
        # Loyal customer only churns if the situation is extreme
        if monthly_charges > 1200 and support_calls >= 5 and outage_score >= 4:
            return 0.50
        return 0.05

    # Default/Other cases: low base churn probability
    return _BASE_CHURN_RATE


# --- Function to Generate a Single Customer Record ---
def create_high_precision_customer():
    """Generate one synthetic customer record with a rule-based churn label.

    All randomness comes from the stdlib ``random`` module and NumPy's global
    RNG, so seeding both makes the whole record (including the ID) repeatable.

    Returns:
        dict with the keys ``CustomerID``, ``Region``, ``Contract_Type``,
        ``Service_Plan``, ``Tenure_Months``, ``Monthly_Charges_ETB``,
        ``Support_Calls_3Months``, ``Network_Outage_Score_0_5``,
        ``Network_Technology`` and ``Churn`` (bool target variable).
    """
    # 1. Base Demographic/Subscription Data
    # Build the version-4 UUID from the `random` module's bits: uuid.uuid4()
    # reads OS entropy and therefore ignores any seed.
    customer_id = str(uuid.UUID(int=random.getrandbits(128), version=4))
    region = random.choice(['Addis Ababa (High Density)', 'Regional City (Mid Density)', 'Rural Area (Low Density)'])
    contract = random.choice(CONTRACT_TYPES)
    service_plan = random.choice(SERVICE_LEVELS)
    tenure = random.randint(1, 84)
    network_tech = _pick_network_tech(region)

    # 2. Derived Charges, Customer Service & Network Quality
    monthly_charges = _draw_monthly_charge(service_plan)
    support_calls = int(np.random.poisson(lam=2))
    outage_score = random.randint(0, 5)  # 0=None, 5=Frequent severe outages

    # 3. CHURN LOGIC: one Bernoulli draw against the archetype probability
    churn = random.random() < _churn_probability(
        contract, tenure, monthly_charges, support_calls, outage_score,
        network_tech,
    )

    # Final data assembly
    return {
        'CustomerID': customer_id,
        'Region': region,
        'Contract_Type': contract,
        'Service_Plan': service_plan,
        'Tenure_Months': tenure,
        'Monthly_Charges_ETB': monthly_charges,
        'Support_Calls_3Months': support_calls,
        'Network_Outage_Score_0_5': outage_score,
        'Network_Technology': network_tech,
        'Churn': churn,  # TARGET VARIABLE
    }

# --- Main Script to Generate and Save Data ---
def generate_synthetic_data(num_records, output_path=None):
    """Generate the full synthetic dataset, save it as CSV and return it.

    Args:
        num_records: Number of customer records to create; must be >= 1.
        output_path: Destination CSV path. When omitted, the module-level
            ``CSV_FILE`` is used.

    Returns:
        pandas.DataFrame with one row per generated customer.

    Raises:
        ValueError: If ``num_records`` is smaller than 1. An empty frame has
            no ``Churn`` column, so the summary below would otherwise fail
            with a ``KeyError`` after an empty CSV had already been written.
    """
    # Validate before any side effect (printing, writing the file).
    if num_records < 1:
        raise ValueError(f"num_records must be at least 1, got {num_records}")
    if output_path is None:
        output_path = CSV_FILE

    print(f"Generating {num_records} high-precision synthetic Ethio Telecom customer records...")

    data = [create_high_precision_customer() for _ in range(num_records)]
    df = pd.DataFrame(data)

    # Save to CSV
    df.to_csv(output_path, index=False)
    print(f"✅ Data generation complete. Saved to {output_path}")
    print("\nDataFrame Head:")
    print(df.head())
    print("\nChurn Distribution (Designed to be Imbalanced, approx 15-20% churn):")
    # normalize=True reports class shares instead of raw counts.
    print(df['Churn'].value_counts(normalize=True))

    return df

# Run the data generation: build NUM_CUSTOMERS records, write them to CSV_FILE
# (done inside generate_synthetic_data) and keep the returned DataFrame in `df`.
df = generate_synthetic_data(NUM_CUSTOMERS)

Generating 15000 high-precision synthetic Ethio Telecom customer records...
✅ Data generation complete. Saved to ethio_telecom_high_precision_churn_data.csv

DataFrame Head:
                             CustomerID                       Region  \
0  8fbb945c-8e57-4d32-87bb-57cc7f38c47d   Addis Ababa (High Density)   
1  582f68aa-c94c-4548-9e06-27c2cf3539ee  Regional City (Mid Density)   
2  f10db765-7e5a-4718-9059-6ee5f66449f2  Regional City (Mid Density)   
3  b96ffe6a-2555-4816-968e-f9322b440a82     Rural Area (Low Density)   
4  b04bb9f9-d01c-4fab-be7e-093cd297540b  Regional City (Mid Density)   

    Contract_Type           Service_Plan  Tenure_Months  Monthly_Charges_ETB  \
0         6-Month  Data/Internet + Voice             46               631.96   
1  Month-to-month  Data/Internet + Voice             72               527.19   
2        24-Month  Data/Internet + Voice             32               596.04   
3        12-Month     Basic Mobile Voice             54               521