In [2]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta, time

In [3]:
# Size of the synthetic dataset: every user gets the same number of transactions.
num_users = 50
transactions_per_user = 20
total_transactions = num_users * transactions_per_user

# Seed the stdlib RNG once, up front, so that every later random draw in this
# notebook (home locations, hours, amounts, transaction locations) is the same
# on each Restart & Run All.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

In [4]:
# One zero-padded identifier per user, e.g. "User_000", "User_001", ...
user_ids = ["User_{:03d}".format(idx) for idx in range(num_users)]

# Pool of cities a transaction (or a user's home) can be drawn from.
locations = [
    'gwalior',
    'bhopal',
    'mumbai',
    'delhi',
    'jaipur',
    'patna',
    'kolkata',
    'noida',
    'deharadhun',
    'new york',
]

Assign Home Location to Each User

In [5]:

# Map each user id to one randomly chosen home city from `locations`.
user_home_location = {}
for user in user_ids:
    user_home_location[user] = random.choice(locations)

Dataset Generation

In [6]:
# Rows are collected as [transaction_id, user, timestamp, amount, location].
data = []
transaction_id = 1

# Every user's clock starts one year before the moment this cell is run.
base_time = datetime.now() - timedelta(days=365)

for user in user_ids:
    current_time = base_time

    for _ in range(transactions_per_user):
        # With probability 0.15 the hour is one of 2, 3 or 4;
        # otherwise it is drawn from 6..22 inclusive.
        hour = random.choice([2, 3, 4]) if random.random() < 0.15 else random.randint(6, 22)
        minute = random.randint(0, 59)
        second = random.randint(0, 59)

        # Advance the running clock by 30-180 minutes, then overwrite its
        # hour/minute/second with the values drawn above. Only the date part
        # of the running clock (and its microseconds) survive into `timestamp`.
        current_time = current_time + timedelta(minutes=random.randint(30, 180))
        timestamp = current_time.replace(hour=hour, minute=minute, second=second)

        # With probability 0.10 the amount exceeds 100000; otherwise it stays below.
        amount = random.randint(100001, 500000) if random.random() < 0.10 else random.randint(1000, 99999)

        # With probability 0.80 use the user's home city; otherwise pick any
        # city from the pool (which may again be the home city).
        location = user_home_location[user] if random.random() < 0.80 else random.choice(locations)

        data.append([transaction_id, user, timestamp, amount, location])
        transaction_id += 1

Create DataFrame

In [7]:
# Column order matches the order of values in each generated row.
column_names = ["transaction_id", "user_id", "timestamp", "amount", "location"]
transactions_df = pd.DataFrame(data, columns=column_names)

In [8]:
# Preview the first five generated rows.
transactions_df.head(n=5)

Unnamed: 0,transaction_id,user_id,timestamp,amount,location
0,1,User_000,2025-01-16 02:39:07.447885,5938,new york
1,2,User_000,2025-01-16 19:53:03.447885,46714,gwalior
2,3,User_000,2025-01-16 16:24:31.447885,93023,new york
3,4,User_000,2025-01-16 06:31:30.447885,90631,new york
4,5,User_000,2025-01-16 16:51:53.447885,283869,new york


In [9]:
# Order rows chronologically within each user and renumber the index 0..n-1,
# so that "previous transactions" below means the preceding rows of that user.
sort_keys = ["user_id", "timestamp"]
transactions_sorted = transactions_df.sort_values(by=sort_keys)
transactions_df = transactions_sorted.reset_index(drop=True)

In [10]:
# Rule 1: flag transactions whose amount is strictly greater than 100000.
amount_threshold = 100000
transactions_df["rule1_fraud"] = transactions_df["amount"] > amount_threshold

Rule 2 — Location Different from Last 3 Transactions

In [11]:
def detect_location_change(user_df):
    """Flag rows whose location differs from all three preceding rows.

    Parameters
    ----------
    user_df : pandas.DataFrame
        Rows to scan, in the order they should be compared; must contain a
        "location" column.

    Returns
    -------
    pandas.Series
        Boolean flags carrying ``user_df``'s index. The first three rows are
        always False because they do not have three predecessors.
    """
    history = user_df["location"].tolist()
    flags = [
        # Short-circuit keeps the first three positions False.
        pos >= 3 and history[pos] not in history[pos - 3:pos]
        for pos in range(len(history))
    ]
    return pd.Series(flags, index=user_df.index)

In [12]:
# Rule 2: evaluate the location-change check separately for each user.
# Only the "location" column is handed to apply(): the helper reads nothing
# else, and this keeps the grouping column "user_id" out of the applied frame,
# which pandas >= 2.2 otherwise reports as deprecated behaviour.
# group_keys=False keeps the original row index so the result aligns with
# transactions_df on assignment.
transactions_df["rule2_fraud"] = (
    transactions_df.groupby("user_id", group_keys=False)[["location"]]
    .apply(detect_location_change)
)

  .apply(detect_location_change)


Rule 3 — Transaction Between 2 AM and 4 AM


In [13]:
# Rule 3: flag transactions whose hour of day is 2, 3 or 4 (both ends inclusive).
transactions_df["hour"] = transactions_df["timestamp"].dt.hour
transactions_df["rule3_fraud"] = transactions_df["hour"].between(2, 4)

In [14]:
# A transaction is fraudulent when at least two of the three rules fire.
high_amount = transactions_df["rule1_fraud"]
new_location = transactions_df["rule2_fraud"]
odd_hour = transactions_df["rule3_fraud"]

# (r1 & r2) | (r1 & r3) | (r2 & r3), with r1 factored out of the first two terms.
transactions_df["is_fraudulent"] = (
    (high_amount & (new_location | odd_hour)) |
    (new_location & odd_hour)
)

In [15]:
# Human-readable label derived from the boolean fraud flag.
label_by_flag = {True: "Fraudulent", False: "Legitimate"}
transactions_df["fraud_label"] = transactions_df["is_fraudulent"].map(label_by_flag)

In [16]:
# Show how many transactions fall under each label.
label_counts = transactions_df["fraud_label"].value_counts()
print(label_counts)

fraud_label
Legitimate    955
Fraudulent     45
Name: count, dtype: int64


In [17]:
# Snapshot the labelled frame and write it to CSV without the index column.
# NOTE(review): the snapshot variable is named `..._before` but the file is
# "dataset_after.csv" — confirm which of the two names is intended.
transactions_df_before = transactions_df.copy()
transactions_df_before.to_csv("dataset_after.csv", index=False)