<a href="https://colab.research.google.com/github/Attabeezy/sequential-crm-for-dce/blob/main/syn-data-gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SYNTHETIC DATA GENERATOR

Generates synthetic mobile money (MoMo) transaction data through the following steps:

- **Load example data:** Load and prepare real transaction data to infer distributions and transaction types.
- **Generate synthetic users:** Create a list of unique user IDs.
- **Generate user transactions:** Simulate transactions for each user, including timestamps, amounts, types, balances, and fraud flags.
- **Synthesize full dataset:** Combine individual user transactions into a single dataset and sort chronologically.
- **Export:** Save the synthetic dataset to a CSV file.

In [2]:
!pip install faker
!pip install openpyxl

Collecting faker
  Downloading faker-37.12.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.12.0-py3-none-any.whl (2.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━[0m [32m1.8/2.0 MB[0m [31m54.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.12.0


In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from faker import Faker

fake = Faker()

## CONFIGURATION

In [4]:

SYNTH_CONFIG = {
    "months": 6,
    "num_users": 2000,
    "avg_tx_per_day": 1.5,
    "fraud_rate": 0.005,  # 0.5% transactions marked as fraudulent
    "seed": 42
}
np.random.seed(SYNTH_CONFIG["seed"])

## STEP 1: LOAD EXAMPLE DATA

In [5]:
path = "/content/MomoStatementReport.xlsx"
df = pd.read_excel(path)

# Rename and map relevant columns
colmap = {
    "timestamp": "TRANSACTION DATE",
    "tx_type": "TRANS. TYPE",
    "amount": "AMOUNT",
    "balance": "BAL AFTER"
}

# Clean up and prepare sample
df = df.rename(columns={v: k for k, v in colmap.items()})
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
df = df.dropna(subset=["timestamp", "amount"])

# Fit distribution of transaction amounts
amounts = df["amount"].astype(float)
mu, sigma = np.log(amounts.mean()), np.log(amounts.std() + 1)

tx_types = df["tx_type"].dropna().unique().tolist()
if len(tx_types) == 0:
    tx_types = ["CASH IN", "CASH OUT", "TRANSFER", "PAYMENT"]

# Infer date range
start_date = datetime.now() - timedelta(days=30 * SYNTH_CONFIG["months"])
end_date = datetime.now()


## STEP 2: GENERATE SYNTHETIC USERS

In [6]:
user_ids = [f"U{str(i).zfill(5)}" for i in range(1, SYNTH_CONFIG["num_users"] + 1)]

def generate_user_transactions(user_id):
    """Simulate transactions for one user."""
    # Randomize how many transactions user performs per day
    tx_per_day = max(1, int(np.random.normal(SYNTH_CONFIG["avg_tx_per_day"], 0.5)))
    num_days = (end_date - start_date).days
    total_tx = tx_per_day * num_days

    # Generate timestamps
    timestamps = [
        start_date + timedelta(days=int(i / tx_per_day)) +
        timedelta(hours=np.random.randint(6, 22), minutes=np.random.randint(0, 60))
        for i in range(total_tx)
    ]

    # Generate amounts
    amounts = np.random.lognormal(mu, sigma, total_tx).round(2)

    # Randomly assign transaction types
    tx_type = np.random.choice(tx_types, total_tx, p=[1/len(tx_types)]*len(tx_types))

    # Simulate balance evolution
    balance = np.cumsum(np.where(np.isin(tx_type, ["CASH IN", "SALARY", "DEPOSIT"]), amounts, -amounts))
    balance = balance - balance.min() + np.random.uniform(10, 100)  # shift to positive balances

    # Inject fraud pattern
    fraud_flags = np.random.choice(
        [0, 1],
        size=total_tx,
        p=[1 - SYNTH_CONFIG["fraud_rate"], SYNTH_CONFIG["fraud_rate"]]
    )

    # Assemble dataframe
    return pd.DataFrame({
        "user_id": user_id,
        "timestamp": timestamps,
        "tx_type": tx_type,
        "amount": amounts,
        "balance": balance,
        "fraud_flag": fraud_flags,
    })


## STEP 3: SYNTHESIZE FULL DATASET

In [7]:
synthetic_df = pd.concat(
    [generate_user_transactions(uid) for uid in user_ids],
    ignore_index=True
)

# Sort chronologically per user
synthetic_df = synthetic_df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)


## STEP 4: EXPORT

In [8]:
synthetic_df.to_csv("synthetic_momo_transactions.csv", index=False)
print("✅ Synthetic dataset generated and saved as 'synthetic_momo_transactions.csv'")
print(f"Total transactions: {len(synthetic_df):,}")
print(synthetic_df.head(10))

✅ Synthetic dataset generated and saved as 'synthetic_momo_transactions.csv'
Total transactions: 416,700
  user_id                  timestamp   tx_type  amount       balance  \
0  U00001 2025-05-05 01:41:58.455366     DEBIT   16.48  1.406548e+07   
1  U00001 2025-05-06 03:54:58.455366     DEBIT   64.11  1.406541e+07   
2  U00001 2025-05-06 22:31:58.455366   CASH_IN  380.44  1.406503e+07   
3  U00001 2025-05-07 17:56:58.455366   CASH_IN    0.39  1.406503e+07   
4  U00001 2025-05-09 01:44:58.455366   CASH_IN    0.29  1.406503e+07   
5  U00001 2025-05-09 23:26:58.455366   CASH_IN  156.76  1.406487e+07   
6  U00001 2025-05-10 19:13:58.455366     DEBIT   72.76  1.406480e+07   
7  U00001 2025-05-11 22:36:58.455366  TRANSFER   62.09  1.406474e+07   
8  U00001 2025-05-12 21:26:58.455366     DEBIT   86.14  1.406465e+07   
9  U00001 2025-05-13 16:57:58.455366  TRANSFER    2.60  1.406465e+07   

   fraud_flag  
0           0  
1           0  
2           0  
3           0  
4           0  
5     