In [1]:
!pip install pandas numpy Faker




[notice] A new release of pip is available: 23.1.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta

# Initialize Faker for realistic data generation
fake = Faker()

# --- Configuration ---
RANDOM_SEED = 42     # Seed shared by every random source used in this notebook
NUM_RECORDS = 50000  # Total number of records to generate
START_DATE = datetime(2024, 8, 1)

# Seed both random sources up front so that "Restart Kernel -> Run All"
# regenerates exactly the same dataset. NumPy's global generator drives the
# np.random.* calls; Faker keeps its own generator, seeded via the class method.
np.random.seed(RANDOM_SEED)
Faker.seed(RANDOM_SEED)

# Safaricom-specific/Ethiopian/Kenyan MSISDN Prefixes (Simulated)
# Ethio Telecom uses 9 and Safaricom in Kenya uses 7. Let's mix realistic formats.
# We'll stick to a simple 10-digit format for consistency in the database schema:
# a 2-digit prefix from this list followed by an 8-digit subscriber number.
MSISDN_PREFIXES = ['91', '92', '93', '70', '71']

# Service/Charging Elements (Realistic Telco Components)
# Maps a service type to the network elements that can emit its records.
CHARGING_ELEMENTS = {
    'Voice': ['MSC_01', 'MSC_02'],
    'SMS': ['SMSC_03', 'SMSC_04'],
    'Data': ['GGSN_01', 'GGSN_02'],
    'Value_Added': ['USSD_GATE', 'M_PESA_G2']
}

In [9]:
# --- 1. Tariff Reference Data (Static Lookup) ---
# Column-oriented price table. The lists are positionally aligned: entry i of
# every rate list is the price for the plan named at rate_plan_id[i].
TARIFF_DATA = {
    'rate_plan_id': ['POST_01', 'PRE_02', 'PROMO_NIGHT', 'CORP_PLAN'],
    # Birr per minute of voice
    'Voice_Rate': [0.05, 0.07, 0.02, 0.04],
    # Birr per MB of data
    'Data_Rate': [0.01, 0.015, 0.005, 0.008],
    # Birr per SMS
    'SMS_Rate': [0.10, 0.12, 0.08, 0.09],
}

# Key the frame by plan id so one plan's rates can be read with
# df_tariff.loc[<rate_plan_id>].
df_tariff = pd.DataFrame(data=TARIFF_DATA)
df_tariff = df_tariff.set_index(keys='rate_plan_id')


In [10]:
# --- 2. CDR and Billing Data Generation ---
# Builds two parallel record sets that share the same identifying fields:
#   * CDR records carry `true_charge`       - usage priced at the tariff rate
#   * billing records carry `billed_charge` - what the billing side reports
# A fraction of the billing records is deliberately under-charged so that a
# later reconciliation has rating leakage to find. No record is removed in
# this cell; both frames end up with NUM_RECORDS rows.

# Simulation parameters for this cell
WINDOW_SECONDS = 48 * 3600                 # timestamps fall within 2 days of START_DATE
SERVICE_TYPES = ['Voice', 'SMS', 'Data']
SERVICE_WEIGHTS = [0.3, 0.1, 0.6]          # distribution biased towards Data
LEAKAGE_PROBABILITY = 0.10                 # share of records billed below the true charge
UNDERSELLING_RANGE = (0.75, 0.90)          # billed charge as a fraction of the true charge

COMMON_COLUMNS = [
    'cdr_id', 'timestamp', 'msisdn_a', 'service_type',
    'charging_element_id', 'rate_plan_id', 'duration_volume',
]

# Loop-invariant lookups, built once instead of querying df_tariff per record.
plan_ids = list(df_tariff.index)
rates_by_plan = df_tariff.to_dict(orient='index')  # {plan_id: {rate_column: rate}}

# fake.unique remembers every value it has already returned; reset it so that
# re-running this cell starts from a clean pool.
fake.unique.clear()

# Lists to store record data
cdr_records = []
billing_records = []

for i in range(NUM_RECORDS):
    cdr_id = f"CDR_{i:06d}"

    # Random timestamp inside the simulation window. int() guarantees a builtin
    # integer, which is what timedelta accepts.
    timestamp = START_DATE + timedelta(seconds=int(np.random.randint(0, WINDOW_SECONDS)))

    # MSISDN_A (source number): 2-digit prefix + unique 8-digit number
    msisdn_a = str(np.random.choice(MSISDN_PREFIXES)) + str(
        fake.unique.random_int(min=10000000, max=99999999)
    )

    # Service type, then a charging element that can emit that service
    service_type = str(np.random.choice(SERVICE_TYPES, p=SERVICE_WEIGHTS))
    charging_element_id = str(
        np.random.choice(CHARGING_ELEMENTS.get(service_type, ['MISC_00']))
    )

    # Rate plan (randomly assigned) and its price row
    rate_plan_id = str(np.random.choice(plan_ids))
    plan_rates = rates_by_plan[rate_plan_id]

    # Usage volume and the true charge it produces (CDR side - true revenue)
    if service_type == 'Voice':
        duration_volume = np.random.randint(10, 100)   # seconds
        # Voice is priced per minute, usage is recorded in seconds
        true_charge = (duration_volume / 60) * plan_rates['Voice_Rate']
    elif service_type == 'Data':
        duration_volume = np.random.randint(1, 100)    # MB
        true_charge = duration_volume * plan_rates['Data_Rate']
    else:
        # SMS, and any other service type, is priced per message/transaction
        duration_volume = np.random.randint(1, 5)      # count
        true_charge = duration_volume * plan_rates['SMS_Rate']

    # Rating leakage (RA scenario): most records are billed correctly, while
    # LEAKAGE_PROBABILITY of them are billed at only 75-90% of the true charge.
    billed_charge = true_charge
    if np.random.rand() < LEAKAGE_PROBABILITY:
        billed_charge = true_charge * np.random.uniform(*UNDERSELLING_RANGE)

    # Fields shared by the CDR row and the billing row
    common_data = {
        'cdr_id': cdr_id,
        'timestamp': timestamp,
        'msisdn_a': msisdn_a,
        'service_type': service_type,
        'charging_element_id': charging_element_id,
        'rate_plan_id': rate_plan_id,
        'duration_volume': duration_volume
    }

    # CDR (true revenue) and billing (reported revenue) rows
    cdr_records.append({**common_data, 'true_charge': round(true_charge, 4)})
    billing_records.append({**common_data, 'billed_charge': round(billed_charge, 4)})


# Create DataFrames. Columns are given explicitly so that an empty run
# (NUM_RECORDS == 0) still yields frames with the expected schema.
df_cdr = pd.DataFrame(cdr_records, columns=COMMON_COLUMNS + ['true_charge'])
df_billing = pd.DataFrame(billing_records, columns=COMMON_COLUMNS + ['billed_charge'])

# Sanity checks on the generated data
assert len(df_cdr) == len(df_billing) == NUM_RECORDS
assert df_cdr['cdr_id'].is_unique
assert (df_billing['billed_charge'] <= df_cdr['true_charge']).all()

# NOTE: df_billing is still complete at this point. Dropping a share of the
# billing records (CDR loss) is a separate step performed after this cell, so
# that the complete CDR file can be compared against a smaller billing file.

In [12]:
# Materialise the generated record lists as DataFrames
df_cdr = pd.DataFrame(cdr_records)
df_billing = pd.DataFrame(billing_records)

# ==========================================================================
# CDR LOSS: remove a share of records from the billing side only
# ==========================================================================

# Share of billing records to drop, and the resulting (truncated) row count
LOSS_PERCENTAGE = 0.015
NUM_TO_LOSE = int(len(df_billing) * LOSS_PERCENTAGE)

# Fixed seed so the same CDR ids are picked for a given billing frame;
# replace=False means no id is picked twice.
np.random.seed(42)
cdr_ids_to_lose = np.random.choice(df_billing['cdr_id'], size=NUM_TO_LOSE, replace=False)

# Keep only the billing rows whose id was not picked. The copy makes the
# result an independent frame rather than a slice of df_billing.
is_lost = df_billing['cdr_id'].isin(cdr_ids_to_lose)
df_billing_lost = df_billing[~is_lost].copy()

print(f"CDR Loss Simulation: {NUM_TO_LOSE:,} records ({LOSS_PERCENTAGE*100}%) were intentionally 'lost' from the billing data.")

# --- 3. Save to CSV Files ---
# The CDR file is written complete and the billing file is written from the
# reduced frame. The tariff file keeps its index, which holds rate_plan_id.
df_cdr.to_csv('telecom_cdr_data.csv', index=False)
df_billing_lost.to_csv('telecom_billing_data.csv', index=False)
df_tariff.to_csv('telecom_tariff_reference.csv')

print(f"Data Generation Complete. CDRs: {len(df_cdr):,}, Billed Records: {len(df_billing_lost):,}.")

CDR Loss Simulation: 750 records (1.5%) were intentionally 'lost' from the billing data.
Data Generation Complete. CDRs: 50,000, Billed Records: 49,250.
