In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

#  1. Setup Parameters and Data Generation
np.random.seed(42)
NUM_TRANSACTIONS = 100000
START_DATE = datetime(2025, 1, 1)

# Define regional weights
REGIONAL_WEIGHTS = {
    'Addis Ababa': 0.50,
    'Oromia': 0.35,
    'South Ethiopia': 0.15,
}
REGIONS = list(REGIONAL_WEIGHTS.keys())
REGION_PROBABILITIES = list(REGIONAL_WEIGHTS.values())

# Generate unique IDs
Customer_IDs = ['CUST' + str(i).zfill(6) for i in range(15000)]
Agent_IDs = ['AGENT' + str(i).zfill(4) for i in range(500)]

# Initialize DataFrame
df = pd.DataFrame()
df['Transaction_ID'] = ['TXN' + str(i).zfill(8) for i in range(NUM_TRANSACTIONS)]
df['Transaction_Time'] = [START_DATE + timedelta(minutes=np.random.randint(0, 43200)) for _ in range(NUM_TRANSACTIONS)]

#  2. Core Transaction Features 
transaction_types = ['P2P_Transfer', 'Agent_Cash_In', 'Agent_Cash_Out', 'B2C_Payment', 'Airtime_Purchase', 'Bill_Payment']
df['Transaction_Type'] = np.random.choice(transaction_types, size=NUM_TRANSACTIONS, p=[0.3, 0.25, 0.2, 0.1, 0.1, 0.05])
df['Sender_ID'] = np.random.choice(Customer_IDs, size=NUM_TRANSACTIONS)

# Generate transaction amounts
def generate_amount(txn_type):
    if txn_type == 'P2P_Transfer':
        return np.random.uniform(10, 5000)
    elif txn_type in ['Agent_Cash_In', 'Agent_Cash_Out']:
        return np.random.uniform(100, 15000)
    elif txn_type == 'Airtime_Purchase':
        return np.random.choice([10, 20, 50, 100, 500])
    else:
        return np.random.uniform(50, 8000)

df['Amount_ETB'] = df['Transaction_Type'].apply(generate_amount).round(2)
df['System_Fee_ETB'] = (df['Amount_ETB'] * np.random.uniform(0.005, 0.02)).round(2)

# --- 3. Region and Assurance Features ---
df['Sender_Region'] = np.random.choice(REGIONS, size=NUM_TRANSACTIONS, p=REGION_PROBABILITIES)
df['Receiver_Region'] = np.random.choice(REGIONS, size=NUM_TRANSACTIONS, p=REGION_PROBABILITIES)
df['Is_Cross_Region'] = (df['Sender_Region'] != df['Receiver_Region']).astype(int)

# Assign Receiver ID (Customer or Agent)
agent_txn_mask = df['Transaction_Type'].isin(['Agent_Cash_In', 'Agent_Cash_Out'])
df.loc[agent_txn_mask, 'Receiver_ID'] = np.random.choice(Agent_IDs, size=agent_txn_mask.sum())
df.loc[~agent_txn_mask, 'Receiver_ID'] = np.random.choice(Customer_IDs, size=(~agent_txn_mask).sum())

# Simulate Billing System Status with Regional Risk
def get_billing_status(region):
    if region == 'Addis Ababa':
        return np.random.choice(['SUCCESS', 'RATING_ERROR', 'TIMEOUT'], p=[0.99, 0.005, 0.005])
    elif region == 'Oromia':
        return np.random.choice(['SUCCESS', 'RATING_ERROR', 'TIMEOUT'], p=[0.96, 0.02, 0.02])
    elif region == 'South Ethiopia':
        return np.random.choice(['SUCCESS', 'RATING_ERROR', 'TIMEOUT'], p=[0.975, 0.015, 0.010])
    return 'SUCCESS'

df['Billing_System_Status'] = df['Sender_Region'].apply(get_billing_status)

# Initial Fraud Flag
df['Is_Fraud'] = 0
df['Fraud_Vector'] = 'None'


# --- 4. Inject Realistic Fraud/Leakage Patterns (ROBUST SAMPLING) ---

# **Pattern A: SIM Swap/Account Takeover Fraud**
desired_rate_a = 0.003
mask_a = df[
    (df['Transaction_Type'] == 'P2P_Transfer') &
    (df['Amount_ETB'] > 4500) &
    (df['Sender_Region'] == 'Addis Ababa')
]
actual_samples_a = min(int(NUM_TRANSACTIONS * desired_rate_a), len(mask_a))
sim_swap_indices = mask_a.sample(actual_samples_a, replace=False).index
df.loc[sim_swap_indices, 'Is_Fraud'] = 1
df.loc[sim_swap_indices, 'Fraud_Vector'] = 'SIM_SWAP_P2P'

# **Pattern B: Agent Collusion/Cash-Out Fraud**
desired_rate_b = 0.002
mask_b = df[
    (df['Transaction_Type'] == 'Agent_Cash_Out') &
    (df['Amount_ETB'] % 100 == 0) &
    (df['Amount_ETB'] > 5000) &
    (df['Sender_Region'].isin(['Oromia', 'South Ethiopia']))
]
actual_samples_b = min(int(NUM_TRANSACTIONS * desired_rate_b), len(mask_b))
agent_fraud_indices = mask_b.sample(actual_samples_b, replace=False).index
df.loc[agent_fraud_indices, 'Is_Fraud'] = 1
df.loc[agent_fraud_indices, 'Fraud_Vector'] = 'AGENT_COLLUSION_CASH_OUT'

# **Pattern C: Rating/Billing Leakage (System/Network Errors)**
desired_rate_c = 0.005
mask_c = df[
    (df['Billing_System_Status'] != 'SUCCESS') &
    (df['Transaction_Type'].isin(['Agent_Cash_In', 'Agent_Cash_Out'])) &
    (df['Is_Fraud'] == 0) # Only apply leakage to transactions not already marked as fraud
]
actual_samples_c = min(int(NUM_TRANSACTIONS * desired_rate_c), len(mask_c))
leakage_indices = mask_c.sample(actual_samples_c, replace=False).index
df.loc[leakage_indices, 'Is_Fraud'] = 1
df.loc[leakage_indices, 'Fraud_Vector'] = 'RATING_BILLING_LEAKAGE'

# Re-assign 'None' to all remaining non-fraudulent transactions
df.loc[df['Fraud_Vector'] == 'None', 'Fraud_Vector'] = 'None'


# --- 5. Final Output and Save ---
print(f"Total Transactions Generated: {len(df)}")
print(f"Total Fraud/Leakage Cases: {df['Is_Fraud'].sum()} ({round(df['Is_Fraud'].mean()*100, 2)}%)")
print("\nFraud Vector Distribution:")
print(df['Fraud_Vector'].value_counts())

# Save the dataset
FILE_NAME = 'safaricom_ethiopia_dfs_regional_assurance_data_V3.csv'
df.to_csv(FILE_NAME, index=False)
print(f"\n--- Dataset successfully saved as {FILE_NAME} ---")

Total Transactions Generated: 100000
Total Fraud/Leakage Cases: 803 (0.8%)

Fraud Vector Distribution:
Fraud_Vector
None                        99197
RATING_BILLING_LEAKAGE        500
SIM_SWAP_P2P                  300
AGENT_COLLUSION_CASH_OUT        3
Name: count, dtype: int64

--- Dataset successfully saved as safaricom_ethiopia_dfs_regional_assurance_data_V3.csv ---
