In [57]:
import numpy as np
import pandas as pd
import random 

In [None]:
# Size of the synthetic dataset: number of distinct users and the number of
# transactions budgeted per user; the product is the total row count.
num_users, transactions_per_user = 50, 20
total_transactions = transactions_per_user * num_users

In [59]:
# Build zero-padded user identifiers: User_000, User_001, ...
user_ids = []
for i in range(num_users):
    user_ids.append(f'User_{i:03d}')
print(user_ids)

['User_000', 'User_001', 'User_002', 'User_003', 'User_004', 'User_005', 'User_006', 'User_007', 'User_008', 'User_009', 'User_010', 'User_011', 'User_012', 'User_013', 'User_014', 'User_015', 'User_016', 'User_017', 'User_018', 'User_019', 'User_020', 'User_021', 'User_022', 'User_023', 'User_024', 'User_025', 'User_026', 'User_027', 'User_028', 'User_029', 'User_030', 'User_031', 'User_032', 'User_033', 'User_034', 'User_035', 'User_036', 'User_037', 'User_038', 'User_039', 'User_040', 'User_041', 'User_042', 'User_043', 'User_044', 'User_045', 'User_046', 'User_047', 'User_048', 'User_049']


In [60]:
# Pool of locations that a transaction is randomly assigned to.
locations = [
    'gwalior',
    'bhopal',
    'mumbai',
    'delhi',
    'jaipur',
    'patna',
    'kolkata',
    'noida',
    'deharadhun',
    'new york',
]

In [61]:
# Accumulator for the generated transaction rows (one list per transaction).
data = list()

Starting timestamp: exactly one year (365 days) before the current time.

In [62]:
from datetime import datetime, timedelta

# Running clock for the generator; it starts 365 days before the moment this
# cell is executed.
one_year = timedelta(days=365)
current_time = datetime.now() - one_year

#### Dataset generation

In [63]:
# Generate `total_transactions` synthetic rows and collect them in `data`.
# Each row is [transaction_id, user_id, timestamp, amount, location].
# Clear the list first so re-running this cell does not append a second batch
# of rows with duplicate transaction_ids.
data.clear()

# The loop bound is `total_transactions` (defined with the other constants);
# the previous name `num_transactions` is never defined and raised NameError
# on a fresh kernel.
for i in range(total_transactions):
    user_id = random.choice(user_ids)

    # Timestamp: about 15% of rows are forced into hour 2 or 3 so that the
    # suspicious-time rule has something to match.
    if random.random() < 0.15:
        transaction_hour = random.choice([2, 3])
        minute = random.randint(0, 59)
        second = random.randint(0, 59)
        dt = current_time.replace(hour=transaction_hour, minute=minute, second=second)
        # Nudge the running clock forward by 1-10 minutes.
        current_time += timedelta(minutes=random.randint(1, 10))
    else:
        # Advance the running clock by 1-120 minutes and keep that as the new base.
        dt = current_time + timedelta(minutes=random.randint(1, 120))
        current_time = dt
        # Overwrite only the hour with a random one. This can still land on
        # hour 2 or 3, so this branch also yields some in-window timestamps.
        dt = dt.replace(hour=random.randint(0, 23))

    # Amount: about 10% of rows exceed 100,000; the rest stay below it.
    if random.random() < 0.10:
        amount = random.randint(100001, 500000)
    else:
        amount = random.randint(1000, 99999)

    location = random.choice(locations)
    # transaction_id is 1-based.
    data.append([i + 1, user_id, dt, amount, location])

In [64]:
# Turn the collected rows into a DataFrame with named columns and display it.
column_names = ['transaction_id', 'user_id', 'timestamp', 'amount', 'location']
transactions_df = pd.DataFrame(data, columns=column_names)
transactions_df

Unnamed: 0,transaction_id,user_id,timestamp,amount,location
0,1,User_015,2025-01-16 08:32:23.284390,90615,noida
1,2,User_005,2025-01-16 21:12:23.284390,387966,jaipur
2,3,User_020,2025-01-16 02:59:08.284390,6406,deharadhun
3,4,User_033,2025-01-16 18:22:23.284390,48295,gwalior
4,5,User_028,2025-01-16 17:22:23.284390,58038,mumbai
...,...,...,...,...,...
995,996,User_046,2025-02-22 04:45:23.284390,95969,delhi
996,997,User_039,2025-02-22 08:01:23.284390,31085,gwalior
997,998,User_000,2025-02-22 03:42:23.284390,98287,mumbai
998,999,User_032,2025-02-22 09:24:23.284390,74663,deharadhun


In [65]:
# Coerce the timestamp column to pandas datetimes, then preview the first rows.
parsed_timestamps = pd.to_datetime(transactions_df["timestamp"])
transactions_df["timestamp"] = parsed_timestamps
transactions_df.head()

Unnamed: 0,transaction_id,user_id,timestamp,amount,location
0,1,User_015,2025-01-16 08:32:23.284390,90615,noida
1,2,User_005,2025-01-16 21:12:23.284390,387966,jaipur
2,3,User_020,2025-01-16 02:59:08.284390,6406,deharadhun
3,4,User_033,2025-01-16 18:22:23.284390,48295,gwalior
4,5,User_028,2025-01-16 17:22:23.284390,58038,mumbai


In [66]:
# Order rows by user and then by time, and renumber the index from 0 so that
# each user's transactions are contiguous and chronological.
sort_keys = ["user_id", "timestamp"]
transactions_df = transactions_df.sort_values(by=sort_keys)
transactions_df = transactions_df.reset_index(drop=True)

In [67]:
# Amount rule: True where the amount is strictly greater than 100000.
transactions_df["rule_amount"] = transactions_df["amount"].gt(100000)

In [68]:
def location_diff_last_3(user_df):
    """Flag rows whose location is absent from the three preceding rows.

    Parameters
    ----------
    user_df : DataFrame
        Rows to evaluate, read in their existing order from the
        ``"location"`` column.

    Returns
    -------
    list of bool
        One flag per row. The first three rows are always ``False`` because
        they do not have three predecessors; every later row is ``True`` when
        its location does not appear among the three rows just before it.
    """
    visited = user_df["location"].tolist()
    return [
        idx >= 3 and place not in visited[idx - 3:idx]
        for idx, place in enumerate(visited)
    ]

In [69]:
# Location rule: evaluate location_diff_last_3 per user and store one flag per row.
#
# groupby(...).apply with a list-returning function produces one list per
# user_id, indexed by user_id. Assigning that result to a column aligns on the
# row index of transactions_df, matches nothing, and leaves the column entirely
# NaN, so the rule never fires. Build a row-indexed boolean Series instead and
# fill it one user at a time.
rule_location_flags = pd.Series(False, index=transactions_df.index, dtype=bool)
for _, user_df in transactions_df.groupby("user_id"):
    # The function returns one flag per row of user_df, in row order, so the
    # flags line up with user_df.index.
    rule_location_flags.loc[user_df.index] = location_diff_last_3(user_df)
transactions_df["rule_location"] = rule_location_flags

  .apply(location_diff_last_3)


In [70]:
from datetime import time
def is_suspicious_time(ts):
    """Return True when the time of day of ``ts`` is between 02:00 and 04:00.

    Both bounds are inclusive. ``ts`` must provide a ``.time()`` method that
    returns a ``datetime.time``.
    """
    clock = ts.time()
    return clock >= time(2, 0) and clock <= time(4, 0)

# Time rule: apply is_suspicious_time to every transaction timestamp.
timestamps = transactions_df["timestamp"]
transactions_df["rule_time"] = timestamps.apply(is_suspicious_time)

In [71]:
# A transaction is marked as fraud only when all three rule columns are true.
# The columns are combined left to right with element-wise AND.
is_fraud = transactions_df["rule_amount"]
for rule_column in ("rule_location", "rule_time"):
    is_fraud = is_fraud & transactions_df[rule_column]
transactions_df["is_fraud"] = is_fraud


In [72]:
def _fraud_label(flag):
    """Map a truthy fraud flag to "Fraudulent" and a falsy one to "Legitimate"."""
    if flag:
        return "Fraudulent"
    return "Legitimate"


# Human-readable label derived from the boolean is_fraud column.
transactions_df["fraud_label"] = transactions_df["is_fraud"].apply(_fraud_label)

In [73]:
# Show the original transaction fields next to the final label, leaving out
# the intermediate rule columns.
report_columns = [
    "transaction_id",
    "user_id",
    "timestamp",
    "amount",
    "location",
    "fraud_label",
]
transactions_df[report_columns]

Unnamed: 0,transaction_id,user_id,timestamp,amount,location,fraud_label
0,16,User_000,2025-01-17 02:32:25.284390,29158,noida,Legitimate
1,66,User_000,2025-01-19 20:42:23.284390,3147,patna,Legitimate
2,147,User_000,2025-01-22 12:41:23.284390,38411,bhopal,Legitimate
3,202,User_000,2025-01-24 00:03:23.284390,33090,delhi,Legitimate
4,216,User_000,2025-01-25 09:59:23.284390,30929,gwalior,Legitimate
...,...,...,...,...,...,...
995,745,User_049,2025-02-13 15:35:23.284390,36355,deharadhun,Legitimate
996,830,User_049,2025-02-16 22:43:23.284390,56074,new york,Legitimate
997,925,User_049,2025-02-20 00:28:23.284390,42391,deharadhun,Legitimate
998,927,User_049,2025-02-20 02:37:23.284390,41459,deharadhun,Legitimate
