<a href="https://colab.research.google.com/github/asyuuuuu/Machine_Learning/blob/main/scam_detection_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta
import json

def generate_scam_dataset(num_samples=1000):
    """
    Generate a synthetic dataset for scam detection training
    """
    # Common scam keywords and patterns
    scam_keywords = [
        "urgent", "winner", "congratulations", "inheritance", "investment opportunity",
        "bank transfer", "verify account", "cryptocurrency", "bitcoin", "profit guaranteed",
        "quick return", "limited time", "claim prize", "lottery winner", "overseas transaction"
    ]

    # Communication channels
    channels = ["SMS", "WhatsApp", "Phone Call", "Email", "Social Media"]

    # Transaction patterns
    transaction_types = ["Bank Transfer", "E-wallet", "Cryptocurrency", "Cash Deposit", "Wire Transfer"]

    # Scam categories
    scam_categories = {
        "Macau Scam": ["police", "court", "tax", "government", "authority", "arrest warrant"],
        "Investment Scam": ["crypto", "forex", "stock", "high returns", "investment scheme"],
        "Love Scam": ["dating", "overseas", "emergency money", "medical emergency", "travel expenses"],
        "E-commerce Scam": ["cheap deal", "limited stock", "pre-order", "huge discount", "exclusive offer"],
        "Loan Scam": ["quick approval", "no documentation", "instant cash", "low interest", "guaranteed approval"]
    }

    data = []

    # Generate synthetic data
    for _ in range(num_samples):
        # Randomly decide if this is a scam or legitimate case
        is_scam = random.choice([True, False])

        # Select scam category if it's a scam
        category = random.choice(list(scam_categories.keys())) if is_scam else "Legitimate"

        # Generate message content
        message_content = ""
        if is_scam:
            # Add category-specific keywords
            keywords = scam_categories[category]
            message_content += random.choice(keywords) + " "
            # Add general scam keywords
            message_content += random.choice(scam_keywords) + " "
        else:
            message_content = "Regular transaction notification"

        # Generate timestamp within last 6 months
        timestamp = datetime.now() - timedelta(
            days=random.randint(0, 180),
            hours=random.randint(0, 23),
            minutes=random.randint(0, 59)
        )

        # Generate transaction amount (higher for scams)
        amount = random.randint(5000, 50000) if is_scam else random.randint(100, 5000)

        # Create data entry
        entry = {
            "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
            "transaction_id": f"TRX{random.randint(10000, 99999)}",
            "amount_myr": amount,
            "channel": random.choice(channels),
            "transaction_type": random.choice(transaction_types),
            "message_content": message_content,
            "category": category,
            "is_scam": is_scam,
            "risk_indicators": {
                "unusual_amount": amount > 5000,
                "suspicious_keywords": len([k for k in scam_keywords if k in message_content.lower()]),
                "time_pressure": "urgent" in message_content.lower(),
                "overseas_connection": "overseas" in message_content.lower(),
                "requesting_personal_info": "verify" in message_content.lower()
            },
            "victim_demographics": {
                "age_group": random.choice(["18-25", "26-35", "36-45", "46-55", "56+"]),
                "location": random.choice([
                    "Kuala Lumpur", "Selangor", "Penang", "Johor", "Sabah",
                    "Sarawak", "Perak", "Melaka", "Pahang", "Terengganu"
                ])
            }
        }

        data.append(entry)

    # Convert to DataFrame
    df = pd.DataFrame(data)
    return df

# Generate example dataset
df = generate_scam_dataset(1000)

# Save to different formats
df.to_csv('malaysia_scam_dataset.csv', index=False)
df.to_json('malaysia_scam_dataset.json', orient='records')

# Print sample statistics
print("\nDataset Statistics:")
print(f"Total samples: {len(df)}")
print(f"Scam cases: {len(df[df['is_scam']])}")
print(f"Legitimate cases: {len(df[~df['is_scam']])}")
print("\nCategory distribution:")
print(df['category'].value_counts())


Dataset Statistics:
Total samples: 1000
Scam cases: 538
Legitimate cases: 462

Category distribution:
category
Legitimate         462
Love Scam          120
Loan Scam          117
E-commerce Scam    106
Macau Scam          98
Investment Scam     97
Name: count, dtype: int64


In [4]:
df.head()

Unnamed: 0,transaction_id,timestamp,amount,scam_type,hour_of_day,day_of_week,sender_location,receiver_location,transaction_method,new_recipient,multiple_transactions,overseas_transaction,sender_account_age,receiver_account_age,scam_probability,is_scam,transaction_description
0,1,2024-08-12 03:34:34.274295,152109.449626,Impersonation,3,0,Johor,Kuala Lumpur,ATM,0,0,0,2469,3040,0.5,0,Personal Transfer
1,2,2023-12-10 03:34:34.274383,10642.930291,Investment,3,6,Kuala Lumpur,Kuala Lumpur,Mobile Wallet,1,0,0,2541,1438,0.7,1,Crypto Investment Return
2,3,2024-02-26 03:34:34.274396,82076.720276,Love,3,0,Johor,Johor,ATM,1,0,0,633,429,0.7,0,Help for sick family
3,4,2024-08-08 03:34:34.274405,28869.865366,Online Purchase,3,3,Penang,Penang,Mobile Wallet,1,1,0,993,2683,0.85,1,General Transfer
4,5,2024-09-12 03:34:34.274412,13094.451755,Investment,3,3,Sarawak,Kuala Lumpur,Counter,0,1,0,2459,2778,0.65,0,High Yield Investment


In [None]:
# prompt: download datasetas csv

import pandas as pd
import random
from datetime import datetime, timedelta
import json

def generate_scam_dataset(num_samples=1000):
    """
    Generate a synthetic dataset for scam detection training
    """
    # ... (rest of the code remains the same)

# Generate example dataset
df = generate_scam_dataset(1000)

# Save to CSV (only)
df.to_csv('malaysia_scam_dataset.csv', index=False)

# Print sample statistics (optional)
print("\nDataset Statistics:")
print(f"Total samples: {len(df)}")
print(f"Scam cases: {len(df[df['is_scam']])}")
print(f"Legitimate cases: {len(df[~df['is_scam']])}")
print("\nCategory distribution:")
print(df['category'].value_counts())

from google.colab import files
files.download('malaysia_scam_dataset.csv')

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def generate_dummy_scam_dataset(num_records=1000):
    """
    Generate realistic dummy dataset for scam detection
    """
    np.random.seed(42)  # For reproducibility

    # Common scam types in Malaysia
    scam_types = [
        'Macau', 'Love', 'Investment', 'Online Purchase',
        'Phone Call', 'Job Offer', 'Loan', 'Impersonation'
    ]

    # Generate base data
    data = {
        'transaction_id': range(1, num_records + 1),
        'timestamp': [(datetime.now() - timedelta(days=np.random.randint(0, 365))) for _ in range(num_records)],
        'amount': np.random.exponential(scale=2000, size=num_records) * 100,  # Transaction amounts
        'scam_type': np.random.choice(scam_types, size=num_records, p=[0.2, 0.15, 0.15, 0.1, 0.1, 0.1, 0.1, 0.1])
    }

    # Create DataFrame
    df = pd.DataFrame(data)

    # Add more realistic features
    df['hour_of_day'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek

    # Generate location data (Malaysian states)
    states = ['Selangor', 'Kuala Lumpur', 'Penang', 'Johor', 'Sabah', 'Sarawak']
    df['sender_location'] = np.random.choice(states, size=num_records)
    df['receiver_location'] = np.random.choice(states, size=num_records)

    # Add transaction characteristics
    df['transaction_method'] = np.random.choice(
        ['Online Banking', 'Mobile Wallet', 'ATM', 'Counter'],
        size=num_records
    )

    # Add risk factors
    df['new_recipient'] = np.random.choice([0, 1], size=num_records, p=[0.7, 0.3])
    df['multiple_transactions'] = np.random.choice([0, 1], size=num_records, p=[0.8, 0.2])
    df['overseas_transaction'] = np.random.choice([0, 1], size=num_records, p=[0.9, 0.1])

    # Generate account ages (in days)
    df['sender_account_age'] = np.random.randint(1, 3650, size=num_records)  # Up to 10 years
    df['receiver_account_age'] = np.random.randint(1, 3650, size=num_records)

    # Add fraud label (is_scam)
    # Higher probability of scam for certain conditions
    def calculate_scam_probability(row):
        prob = 0.1  # Base probability

        # Increase probability based on risk factors
        if row['new_recipient']: prob += 0.2
        if row['multiple_transactions']: prob += 0.15
        if row['overseas_transaction']: prob += 0.25
        if row['hour_of_day'] in [23, 0, 1, 2, 3, 4]: prob += 0.2
        if row['receiver_account_age'] < 30: prob += 0.3
        if row['amount'] > 10000: prob += 0.2

        return min(prob, 0.9)  # Cap at 90% probability

    df['scam_probability'] = df.apply(calculate_scam_probability, axis=1)
    df['is_scam'] = df['scam_probability'].apply(lambda x: np.random.choice([0, 1], p=[1-x, x]))

    # Add transaction details
    df['transaction_description'] = df.apply(
        lambda row: generate_transaction_description(row),
        axis=1
    )

    return df

def generate_transaction_description(row):
    """Generate realistic transaction descriptions"""
    if row['scam_type'] == 'Macau':
        return np.random.choice([
            "URGENT: Legal Fee Payment",
            "Court Case Settlement",
            "Document Processing Fee"
        ])
    elif row['scam_type'] == 'Love':
        return np.random.choice([
            "Help for sick family",
            "Emergency medical fees",
            "Travel expense support"
        ])
    elif row['scam_type'] == 'Investment':
        return np.random.choice([
            "Crypto Investment Return",
            "High Yield Investment",
            "Quick Profit Scheme"
        ])
    else:
        return np.random.choice([
            "General Transfer",
            "Payment",
            "Personal Transfer"
        ])

# Generate example dataset
df = generate_dummy_scam_dataset(1000)

# Add some analysis functions
def analyze_scam_patterns(df):
    """Analyze patterns in the scam data"""
    analysis = {
        'scam_by_type': df[df['is_scam'] == 1]['scam_type'].value_counts(),
        'avg_amount_by_type': df.groupby('scam_type')['amount'].mean(),
        'scam_by_hour': df[df['is_scam'] == 1]['hour_of_day'].value_counts().sort_index(),
        'scam_by_location': df[df['is_scam'] == 1]['receiver_location'].value_counts()
    }
    return analysis

# Example usage
print("Sample of generated data:")
print(df.head())
print("\nScam patterns analysis:")
print(analyze_scam_patterns(df))

Sample of generated data:
   transaction_id                  timestamp         amount        scam_type  \
0               1 2024-08-12 03:34:34.274295  152109.449626    Impersonation   
1               2 2023-12-10 03:34:34.274383   10642.930291       Investment   
2               3 2024-02-26 03:34:34.274396   82076.720276             Love   
3               4 2024-08-08 03:34:34.274405   28869.865366  Online Purchase   
4               5 2024-09-12 03:34:34.274412   13094.451755       Investment   

   hour_of_day  day_of_week sender_location receiver_location  \
0            3            0           Johor      Kuala Lumpur   
1            3            6    Kuala Lumpur      Kuala Lumpur   
2            3            0           Johor             Johor   
3            3            3          Penang            Penang   
4            3            3         Sarawak      Kuala Lumpur   

  transaction_method  new_recipient  multiple_transactions  \
0                ATM              0     

In [3]:
import pandas as pd
# Assuming the code to generate the DataFrame 'df' is already executed.

# Save the DataFrame to a CSV file in your current working directory
df.to_csv('malaysia_scam_dataset.csv', index=False)

# Download the CSV file (Google Colab specific)
from google.colab import files
files.download('malaysia_scam_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>