In [1]:
# Synthetic India-Specific Fraud Detection Dataset Generator (250,000 rows)

import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Pin both random sources (stdlib `random` and NumPy's global generator) so
# that a rerun of this cell reproduces the same dataset.
random.seed(42)
np.random.seed(42)

n_rows = 250_000

# Vocabularies the categorical columns are sampled from
transaction_types = ['UPI', 'IMPS', 'NEFT', 'RTGS', 'Wallet', 'ATM']
devices = ['Mobile', 'Web', 'POS', 'ATM']
cities = [
    'Delhi', 'Mumbai', 'Bangalore', 'Hyderabad', 'Chennai',
    'Kolkata', 'Pune', 'Ahmedabad', 'Jaipur', 'Lucknow',
]

# First instant of the sampled window
start_date = datetime(2023, 1, 1)


def _random_timestamp():
    """Return start_date shifted by a random whole number of minutes.

    The offset is drawn from [0, 525600] (525600 = 365 * 24 * 60). Both
    endpoints are included by randint, so the latest possible value is
    exactly 365 days after start_date.
    """
    return start_date + timedelta(minutes=random.randint(0, 525600))


def _random_uid():
    """Return 'UID' followed by a random integer in [100000, 999999]."""
    return "UID" + str(random.randint(100000, 999999))


# Stdlib-random columns. The draw order (all timestamps, then all senders,
# then all receivers) determines which seeded values each column receives.
timestamps = [_random_timestamp() for _ in range(n_rows)]
sender_ids = [_random_uid() for _ in range(n_rows)]
receiver_ids = [_random_uid() for _ in range(n_rows)]

# Hour of day (0-23) derived from each timestamp; no randomness involved
transaction_hour = [stamp.hour for stamp in timestamps]

# NumPy-random columns; the call order again fixes the seeded values.
# Amounts are exponentially distributed with mean 2000, rounded to 2 decimals.
amounts = np.random.exponential(scale=2000, size=n_rows).round(2)
transaction_type = np.random.choice(transaction_types, n_rows)
location_sender = np.random.choice(cities, n_rows)
location_receiver = np.random.choice(cities, n_rows)
device_type = np.random.choice(devices, n_rows)
# Roughly 3% of transactions are flagged as international
is_international = np.random.choice([0, 1], size=n_rows, p=[0.97, 0.03])

# Generate fraud labels based on additive risk rules.
#
# Every transaction starts at a 1% base fraud probability; each rule that
# matches adds a fixed uplift. The per-row probability is computed with
# vectorized NumPy masks instead of a row-by-row Python loop. Uplifts are
# added in the same order as before and adding 0.0 is exact, so the resulting
# probabilities are bit-identical to the sequential version.
night_hours = np.isin(transaction_hour, [0, 1, 2, 3])        # 00:00-03:59
risky_channel = np.isin(transaction_type, ['Wallet', 'ATM'])
cross_border = np.asarray(is_international) == 1
cross_city = np.asarray(location_sender) != np.asarray(location_receiver)
# NOTE(review): amounts are drawn from an exponential distribution with
# scale=2000, so P(amount > 100000) = exp(-50) (about 2e-22) and this rule
# practically never fires. The threshold is kept unchanged so the seeded
# labels stay the same; revisit the threshold or the amount scale if
# high-value transactions are meant to carry signal.
high_value = np.asarray(amounts) > 100000

fraud_risk = np.full(n_rows, 0.01)  # base fraud rate
fraud_risk += np.where(night_hours, 0.02, 0.0)
fraud_risk += np.where(risky_channel, 0.015, 0.0)
fraud_risk += np.where(cross_border, 0.05, 0.0)
fraud_risk += np.where(cross_city, 0.01, 0.0)
fraud_risk += np.where(high_value, 0.03, 0.0)

# One stdlib `random.random()` draw per row, in row order, so the labels
# produced under the fixed seed match the original loop exactly.
is_fraud = [1 if random.random() < risk else 0 for risk in fraud_risk.tolist()]

# Assemble the generated columns into a single table; the keyword order
# below is the column order of the frame and of the CSV.
df = pd.DataFrame(dict(
    timestamp=timestamps,
    transaction_type=transaction_type,
    sender_id=sender_ids,
    receiver_id=receiver_ids,
    amount=amounts,
    location_sender=location_sender,
    location_receiver=location_receiver,
    device_type=device_type,
    is_international=is_international,
    transaction_hour=transaction_hour,
    is_fraud=is_fraud,
))

# Write the table to disk, leaving out the positional index
df.to_csv('synthetic_india_fraud_dataset_250k.csv', index=False)

# Report success, then the share of non-fraud (0) vs fraud (1) rows
print("\nSynthetic dataset generated successfully!\n")
print(df.is_fraud.value_counts(normalize=True))

# Final expression of the cell: preview the first five rows
df.head(n=5)




Synthetic dataset generated successfully!

is_fraud
0    0.971552
1    0.028448
Name: proportion, dtype: float64


Unnamed: 0,timestamp,transaction_type,sender_id,receiver_id,amount,location_sender,location_receiver,device_type,is_international,transaction_hour,is_fraud
0,2023-03-23 01:39:00,ATM,UID252169,UID165312,938.54,Jaipur,Pune,Mobile,0,1,0
1,2023-01-19 05:05:00,RTGS,UID462320,UID209404,6020.24,Ahmedabad,Pune,POS,0,5,0
2,2023-07-20 06:29:00,Wallet,UID936861,UID350052,2633.49,Jaipur,Kolkata,POS,0,6,1
3,2023-06-28 07:47:00,ATM,UID147211,UID992957,1825.89,Pune,Delhi,Mobile,0,7,0
4,2023-06-12 12:53:00,RTGS,UID137407,UID306605,339.25,Kolkata,Bangalore,Web,0,12,0


In [2]:
# Offer the generated CSV as a browser download when running on Google Colab.
# The `google.colab` package is only importable inside a Colab runtime, so the
# import is guarded: on any other Jupyter runtime this cell prints a notice
# instead of aborting a "Run All" with an ImportError.
try:
    from google.colab import files
except ImportError:
    files = None  # not running on Colab

if files is not None:
    files.download('synthetic_india_fraud_dataset_250k.csv')
else:
    print("google.colab is not available; skipping browser download of "
          "'synthetic_india_fraud_dataset_250k.csv'.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>