In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

In [None]:
# Main generation parameters
SEED = 42  # fixed seed so Restart Kernel -> Run All reproduces the same dataset
random.seed(SEED)     # stdlib RNG (random.choice / random.uniform / random.randint)
np.random.seed(SEED)  # NumPy legacy global RNG (np.random.choice)

num_users = 25  # number of synthetic users
num_transactions_per_user = 250  # base transactions generated per user (before outliers)
start_date = datetime(2022, 1, 1)  # first possible transaction date
end_date = datetime(2024, 5, 1)  # last possible base transaction date (inclusive)

In [None]:
# Company / venue names used when building transaction descriptions
companies = [
    "Warung Padang", "Indomaret", "Alfamart", "Warkop 86", "GoFood",
    "GrabFood", "Shopee", "Tokopedia", "Lazada", "McD", "KFC",
    "Starbucks", "Chatime", "PLN", "PDAM", "Telkomsel", "XL Axiata",
    "Cinema XXI", "Uniqlo", "Zara", "Nike Store", "Adidas", "IKEA",
    "Ace Hardware", "Guardian", "Century", "Kimia Farma", "ATM BCA",
    "ATM BNI", "ATM Mandiri", "ATM BRI", "Traveloka", "Pegipegi",
    "Tiket.com", "Gojek", "Grab", "Uber", "Bluebird", "Apple Store",
    "Samsung Store", "ASUS Store", "Lenovo Store", "HP Store", "Dell Store"
]

In [None]:
# User profile definitions (a more varied set of profiles).
# Each profile carries an income range as (min, max), the expense categories
# it favours, and the name of its spending pattern.
user_profiles = {
    'student': {
        'income_range': (1000000, 3000000),
        'top_expenses': ['Makanan', 'Transportasi', 'Hiburan'],
        'spending_pattern': 'volatile',  # unstable spending pattern
    },
    'young_professional': {
        'income_range': (5000000, 12000000),
        'top_expenses': ['Makanan', 'Transportasi', 'Hiburan', 'Belanja'],
        'spending_pattern': 'weekend_spike',  # spending rises on weekends
    },
    'family': {
        'income_range': (10000000, 20000000),
        'top_expenses': ['Makanan', 'Belanja', 'Tagihan', 'Transportasi'],
        'spending_pattern': 'month_end',  # spending rises at the end of the month
    },
    'business_owner': {
        'income_range': (15000000, 50000000),
        'top_expenses': ['Belanja', 'Makanan', 'Hiburan', 'Lain-lain'],
        'spending_pattern': 'consistent',  # consistent spending pattern
    }
}

In [None]:
# Expense categories with generic descriptions and an (expanded) price range.
# Each entry holds sample descriptions, a min/max amount, and a 'frequency'
# weight; the weights of the eight categories below add up to 1.0.
expense_categories = {
    'Makanan': {
        'descriptions': [
            'Makan siang', 'Belanja bahan makanan', 'Makan malam', 'Sarapan', 'Kafe',
            'Restoran', 'Jajan', 'Makanan ringan', 'Minuman', 'Dessert',
            'Makanan cepat saji', 'Katering', 'Kue dan roti', 'Buah-buahan'
        ],
        'min_amount': 10000,
        'max_amount': 250000,
        'frequency': 0.3  # selection weight: nominally 30% of all transactions
    },
    'Transportasi': {
        'descriptions': [
            'Bensin', 'Ojek online', 'Angkot', 'Bus', 'Taksi',
            'Tiket kereta', 'Tiket pesawat', 'Service kendaraan', 'Parkir', 'Tol',
            'Rental mobil', 'Cuci kendaraan', 'Asuransi kendaraan'
        ],
        'min_amount': 5000,
        'max_amount': 150000,
        'frequency': 0.15
    },
    'Belanja': {
        'descriptions': [
            'Baju', 'Sepatu', 'Elektronik', 'Aksesoris', 'Perabotan',
            'Alat rumah tangga', 'Kosmetik', 'Kebutuhan bayi', 'Alat tulis',
            'Buku', 'Peralatan olahraga', 'Gadget', 'Furniture', 'Perhiasan'
        ],
        'min_amount': 50000,
        'max_amount': 2000000,
        'frequency': 0.15
    },
    'Hiburan': {
        'descriptions': [
            'Nonton film', 'Game', 'Konser', 'Streaming', 'Hobi',
            'Liburan', 'Wisata', 'Karaoke', 'Gym', 'Olahraga',
            'Langganan digital', 'Beli buku', 'Festival', 'Museum', 'Theme park'
        ],
        'min_amount': 20000,
        'max_amount': 500000,
        'frequency': 0.1
    },
    'Tagihan': {
        'descriptions': [
            'Listrik', 'Air', 'Internet', 'Telepon', 'BPJS',
            'Sewa rumah/kos', 'Cicilan KPR', 'Asuransi', 'TV kabel', 'Langganan',
            'Pajak', 'Iuran bulanan', 'Service apartemen', 'Tagihan sekolah'
        ],
        'min_amount': 50000,
        'max_amount': 2000000,
        'frequency': 0.15
    },
    'Kesehatan': {
        'descriptions': [
            'Dokter', 'Obat', 'Vitamin', 'Rumah sakit', 'Konsultasi medis',
            'Terapi', 'Kacamata', 'Skincare', 'Check-up rutin', 'Vaksinasi',
            'Alat kesehatan', 'Perawatan gigi'
        ],
        'min_amount': 30000,
        'max_amount': 1000000,
        'frequency': 0.05
    },
    'Pendidikan': {
        'descriptions': [
            'Biaya kuliah', 'Kursus', 'Seminar', 'Buku pelajaran', 'Alat tulis',
            'Workshop', 'Sertifikasi', 'Les privat', 'Studi tour', 'Biaya ujian',
            'Proyek penelitian', 'Pelatihan online'
        ],
        'min_amount': 100000,
        'max_amount': 5000000,
        'frequency': 0.05
    },
    'Lain-lain': {
        'descriptions': [
            'Donasi', 'Hadiah', 'Biaya administrasi', 'Pajak', 'Denda',
            'Pinjaman', 'Sumbangan', 'Kebutuhan hewan peliharaan', 'Perbaikan rumah',
            'Jasa layanan', 'Biaya tak terduga', 'Isi ulang e-wallet'
        ],
        'min_amount': 10000,
        'max_amount': 1000000,
        'frequency': 0.05
    }
}

In [None]:
# Income categories with descriptions and amount ranges.
# Same layout as the expense table: sample descriptions, min/max amount and a
# 'frequency' weight (the four weights below add up to 1.0).
income_categories = {
    'Gaji': {
        'descriptions': ['Gaji bulanan', 'Bonus', 'THR', 'Insentif', 'Lembur'],
        'min_amount': 3000000,
        'max_amount': 20000000,
        'frequency': 0.5
    },
    'Freelance': {
        'descriptions': [
            'Proyek freelance', 'Konsultasi', 'Jasa desain', 'Mengajar', 'Terjemahan',
            'Content creation', 'Programming', 'Editing', 'Jasa foto', 'Writing'
        ],
        'min_amount': 500000,
        'max_amount': 5000000,
        'frequency': 0.25
    },
    'Investasi': {
        'descriptions': [
            'Dividen', 'Bunga deposito', 'Keuntungan saham', 'P2P Lending',
            'Hasil sewa properti', 'Reksa dana', 'Crypto', 'Forex', 'Obligasi'
        ],
        'min_amount': 100000,
        'max_amount': 5000000,
        'frequency': 0.15
    },
    'Bisnis': {
        'descriptions': [
            'Keuntungan bisnis', 'Penjualan produk', 'Komisi afiliasi', 'Royalti',
            'Pendapatan dari cabang', 'Hasil franchise', 'Penjualan online'
        ],
        'min_amount': 1000000,
        'max_amount': 10000000,
        'frequency': 0.1
    }
}

In [None]:
# Payment methods
payment_methods = ['Tunai', 'Kartu Kredit', 'Kartu Debit', 'QRIS', 'e-Wallet', 'Transfer Bank', 'Paylater', 'Cicilan']

# Cities used as transaction locations
cities = [
    "Jakarta", "Surabaya", "Bandung", "Yogyakarta", "Medan", "Makassar",
    "Semarang", "Palembang", "Balikpapan", "Malang", "Denpasar", "Pontianak",
    "Solo", "Manado", "Padang", "Pekanbaru", "Banjarmasin", "Batam",
    "Cirebon", "Tasikmalaya", "Bogor", "Depok", "Tangerang", "Bekasi"
]

In [None]:
# Lebaran dates, used to add a seasonal pattern.
# Two consecutive days are listed for each of 2022, 2023 and 2024.
lebaran_dates = [
    pd.Timestamp('2022-05-02'), pd.Timestamp('2022-05-03'),
    pd.Timestamp('2023-04-22'), pd.Timestamp('2023-04-23'),
    pd.Timestamp('2024-04-10'), pd.Timestamp('2024-04-11')
]

In [None]:
# Other important dates, keyed by event name
important_dates = {
    # One pd.Timestamp per year: 2022, 2023, 2024
    'Tahun Baru': [pd.Timestamp(f'{year}-01-01') for year in range(2022, 2025)],
    # One pd.Timestamp per year: 2021, 2022, 2023
    'Natal': [pd.Timestamp(f'{year}-12-25') for year in range(2021, 2024)],
    'Black Friday': [
        pd.Timestamp('2021-11-26'), pd.Timestamp('2022-11-25'),
        pd.Timestamp('2023-11-24')
    ],
    'Harbolnas': [
        pd.Timestamp('2021-12-12'), pd.Timestamp('2022-12-12'),
        pd.Timestamp('2023-12-12')
    ],
    # Unlike the entries above, each element here is a whole date range
    # (the result of pd.date_range), not a single Timestamp.
    'Liburan Sekolah': [
        pd.date_range(start='2022-06-01', end='2022-06-30'),
        pd.date_range(start='2022-12-15', end='2023-01-05'),
        pd.date_range(start='2023-06-01', end='2023-06-30'),
        pd.date_range(start='2023-12-15', end='2024-01-05')
    ]
}

In [None]:
def _pick_weighted_category(category_table):
    """Pick one category name, using each entry's 'frequency' as its weight.

    The stdlib RNG is used so the result is a plain ``str`` dictionary key
    (``np.random.choice`` would return ``np.str_``) and only one RNG drives
    the generator. ``random.choices`` accepts unnormalised weights.
    """
    names = list(category_table)
    weights = [category_table[name]['frequency'] for name in names]
    return random.choices(names, weights=weights, k=1)[0]


def _expense_amount_range(profile, cat_details):
    """Return the (min, max) expense amount for a profile within one category."""
    low = cat_details['min_amount']
    high = cat_details['max_amount']
    span = high - low
    if profile == 'student':
        return low, low + span * 0.4          # lower 40% of the range
    if profile == 'business_owner':
        return low + span * 0.5, high * 1.2   # upper half, up to 20% above the maximum
    if profile == 'family':
        return low + span * 0.3, high         # upper 70% of the range
    return low, high                          # young_professional: full range


def _apply_seasonal_events(transaction_date, category, description, multiplier):
    """Apply special-date boosts and return the updated (description, multiplier).

    Each event is applied at most once per transaction, even when several of
    its listed dates fall inside the window (Lebaran lists two consecutive
    days per year). Different events can still stack.
    """
    # (event dates, window in days, affected categories, multiplier range, description suffix)
    seasonal_events = (
        (lebaran_dates, 7, ('Belanja', 'Makanan'), (1.5, 3.0), " (Belanja Lebaran)"),
        (important_dates['Tahun Baru'], 5, ('Hiburan', 'Makanan'), (1.5, 2.5), " (Tahun Baru)"),
        (important_dates['Natal'], 7, ('Belanja', 'Hiburan'), (1.5, 2.5), " (Natal)"),
        (important_dates['Harbolnas'], 1, ('Belanja',), (1.5, 3.0), " (Harbolnas 12.12)"),
        (important_dates['Black Friday'], 1, ('Belanja',), (1.5, 3.0), " (Black Friday)"),
    )
    for event_dates, window_days, affected, (low, high), suffix in seasonal_events:
        if category not in affected:
            continue
        if any(abs((transaction_date - event_date).days) <= window_days for event_date in event_dates):
            multiplier *= random.uniform(low, high)
            description = description + suffix
    return description, multiplier


def _fixed_bill_amount(description, profile):
    """Return an amount for a utility/rent bill, or None for any other description.

    Every call draws a fresh base amount and then varies it by +/- 10%.
    """
    rounding = -3
    if description == 'Listrik':
        low, high = 200000, 700000
    elif description == 'Air':
        low, high = 50000, 150000
    elif description == 'Internet':
        low, high = 300000, 500000
    elif description == 'Sewa rumah/kos':
        rounding = -4
        if profile == 'student':
            low, high = 800000, 1500000
        elif profile == 'young_professional':
            low, high = 1500000, 3000000
        else:
            low, high = 3000000, 7000000
    else:
        return None

    base_amount = round(random.uniform(low, high), rounding)
    return round(base_amount * random.uniform(0.9, 1.1), -3)


def _pick_payment_method(transaction_type, profile):
    """Pick a payment method suited to the transaction type and user profile."""
    if transaction_type == 'masuk':
        return random.choice(['Transfer Bank', 'Tunai'])
    if profile == 'student':
        # Students use e-Wallet more often (listed twice to double its weight)
        return random.choice(['Tunai', 'e-Wallet', 'e-Wallet', 'Transfer Bank', 'QRIS'])
    if profile == 'business_owner':
        # Business owners use credit cards more often
        return random.choice(['Kartu Kredit', 'Kartu Kredit', 'Transfer Bank', 'QRIS', 'e-Wallet', 'Tunai'])
    if profile == 'young_professional':
        # Young professionals use the full variety of methods
        return random.choice(payment_methods)
    # Family: mostly debit card and transfer
    return random.choice(['Kartu Debit', 'Transfer Bank', 'Tunai', 'e-Wallet', 'QRIS'])


def generate_enhanced_transactions(num_users, num_transactions_per_user, start_date, end_date):
    """Generate a synthetic transaction dataset.

    Every user is assigned a random profile and receives
    ``num_transactions_per_user`` transactions dated uniformly between
    ``start_date`` and ``end_date`` (both inclusive). Outliers are then
    injected through ``add_enhanced_outliers`` (8% of the base rows).

    Args:
        num_users: Number of users; ids run from 1 to ``num_users``.
        num_transactions_per_user: Base transactions generated per user.
        start_date: First possible transaction date (``datetime``).
        end_date: Last possible base transaction date (``datetime``).

    Returns:
        pandas.DataFrame with one row per transaction and the columns
        ``id_transaksi``, ``tanggal``, ``waktu``, ``nominal``, ``tipe``,
        ``kategori``, ``deskripsi``, ``metode_pembayaran``, ``lokasi``,
        ``user_id``, ``profil`` and ``rating`` (``rating`` is only set for
        expenses outside the 'Tagihan' category).
    """
    transactions = []
    transaction_id = 1

    # Weighted pool of profile names: 6 student, 8 young professional, 6 family, 5 business owner
    profile_distribution = ['student'] * 6 + ['young_professional'] * 8 + ['family'] * 6 + ['business_owner'] * 5

    # Assign a profile to every user id
    user_profile_mapping = {
        user_id: random.choice(profile_distribution)
        for user_id in range(1, num_users + 1)
    }

    # Loop-invariant values
    days_range = (end_date - start_date).days
    # Probability that a transaction is income: student 15%, business owner 25%, others 20%
    income_share_by_profile = {'student': 0.15, 'business_owner': 0.25}

    for user_id, profile in user_profile_mapping.items():
        profile_data = user_profiles[profile]
        spending_pattern = profile_data['spending_pattern']
        p_income = income_share_by_profile.get(profile, 0.2)

        # Monthly salary comes from the profile table (single source of truth)
        # and is rounded to the nearest 1,000 like the other income amounts.
        income_low, income_high = profile_data['income_range']
        monthly_income = round(random.uniform(income_low, income_high), -3)

        for _ in range(num_transactions_per_user):
            # Random transaction date inside the requested window
            transaction_date = start_date + timedelta(days=random.randint(0, days_range))

            # Transaction type: 'masuk' (income) or 'keluar' (expense)
            transaction_type = 'masuk' if random.random() < p_income else 'keluar'

            if transaction_type == 'keluar':
                # 70% of expenses come from the profile's favourite categories,
                # the rest are drawn by category frequency.
                if random.random() < 0.7:
                    category = random.choice(profile_data['top_expenses'])
                else:
                    category = _pick_weighted_category(expense_categories)

                cat_details = expense_categories[category]
                description = random.choice(cat_details['descriptions'])
                min_amt, max_amt = _expense_amount_range(profile, cat_details)

                # Profile-specific spending pattern
                multiplier = 1.0
                if spending_pattern == 'weekend_spike' and transaction_date.weekday() >= 5:  # Sat-Sun
                    multiplier *= random.uniform(1.3, 2.0)
                if spending_pattern == 'month_end' and transaction_date.day >= 25:
                    multiplier *= random.uniform(1.3, 1.8)
                if spending_pattern == 'volatile':
                    multiplier *= random.uniform(0.5, 2.2)

                # Special dates (Lebaran, New Year, Christmas, Harbolnas, Black Friday)
                description, multiplier = _apply_seasonal_events(
                    transaction_date, category, description, multiplier
                )

                # Final amount, rounded to the nearest 100
                amount = round(random.uniform(min_amt, max_amt) * multiplier, -2)

                # Utility and rent bills use their own amount ranges instead
                if category == 'Tagihan':
                    bill_amount = _fixed_bill_amount(description, profile)
                    if bill_amount is not None:
                        amount = bill_amount

                # Append the company/provider name to the description
                description = f"{description} di {random.choice(companies)}"

                # Location: any city for food/shopping/entertainment,
                # otherwise only the first eight cities of the list
                if category in ['Makanan', 'Belanja', 'Hiburan']:
                    location = random.choice(cities)
                else:
                    location = random.choice(cities[:8])
            else:
                # Income in the first five days of the month is most likely (80%) the salary
                if 1 <= transaction_date.day <= 5 and random.random() < 0.8:
                    category = 'Gaji'
                    amount = monthly_income
                    description = f"Gaji bulanan - {profile}"
                    location = random.choice(cities[:8])
                else:
                    category = _pick_weighted_category(income_categories)
                    cat_details = income_categories[category]
                    description = random.choice(cat_details['descriptions'])

                    # Business owners earn 1.5x on non-salary income
                    if profile == 'business_owner' and category in ['Freelance', 'Investasi', 'Bisnis']:
                        min_amt = cat_details['min_amount'] * 1.5
                        max_amt = cat_details['max_amount'] * 1.5
                    else:
                        min_amt = cat_details['min_amount']
                        max_amt = cat_details['max_amount']

                    amount = round(random.uniform(min_amt, max_amt), -3)
                    location = random.choice(cities)

            transaction = {
                'id_transaksi': f'TRX{transaction_id:06}',
                'tanggal': transaction_date.strftime('%Y-%m-%d'),
                'waktu': f"{random.randint(6, 22):02d}:{random.randint(0, 59):02d}",
                'nominal': amount,
                'tipe': transaction_type,
                'kategori': category,
                'deskripsi': description,
                'metode_pembayaran': _pick_payment_method(transaction_type, profile),
                'lokasi': location,
                'user_id': f'U{user_id:03}',
                'profil': profile
            }

            # Rating for expenses (except bills): expensive purchases are rated lower
            if transaction_type == 'keluar' and category != 'Tagihan':
                if amount > cat_details['max_amount'] * 0.8:
                    transaction['rating'] = random.randint(1, 3)
                else:
                    transaction['rating'] = random.randint(3, 5)

            transactions.append(transaction)
            transaction_id += 1

    # Inject outliers for anomaly detection
    transactions = add_enhanced_outliers(transactions, percentage=0.08)

    return pd.DataFrame(transactions)


In [None]:
def add_enhanced_outliers(transactions, percentage=0.08):
    """Inject synthetic outliers into a list of transaction dicts.

    Three kinds of rows are appended, all copied from the transactions that
    were in the list when the function was called (never from rows injected
    here, so markers and amounts cannot compound):

    * 40% high-value expenses: 3-8x the category maximum, id ``OUT…``,
      description suffixed with " (Unusual expense)".
    * 30% exact duplicates with a new id ``DUP…``.
    * 30% suspicious rows: 5-10x the template amount, id ``SUS…``,
      description prefixed with "Transaksi tidak dikenal ", a different
      city, a time between 00:00 and 05:59, dated 1-3 days after the
      template (which may fall after the generation end date).

    Args:
        transactions: List of transaction dicts. The list is extended in
            place and also returned.
        percentage: Outlier budget as a fraction of ``len(transactions)``.

    Returns:
        The same list object, with the outlier rows appended.
    """
    num_outliers = int(len(transactions) * percentage)

    # Snapshot of the genuine rows; every outlier is derived from these
    originals = list(transactions)
    injected = []

    # 1. High-value outliers, drawn from expense rows only so the requested
    #    number is actually produced (income rows cannot become one).
    high_value_count = int(num_outliers * 0.4)
    if high_value_count > 0:
        expense_pool = [
            t for t in originals
            if t['tipe'] == 'keluar' and t['kategori'] in expense_categories
        ]
        if expense_pool:
            for seq in range(high_value_count):
                outlier = random.choice(expense_pool).copy()
                category_max = expense_categories[outlier['kategori']]['max_amount']
                outlier['nominal'] = round(category_max * random.uniform(3.0, 8.0), -2)
                outlier['deskripsi'] += " (Unusual expense)"
                # Sequential ids cannot collide, unlike random 5-digit ids
                outlier['id_transaksi'] = f"OUT{10000 + seq}"
                injected.append(outlier)

    # 2. Duplicate transactions (identical apart from 'id_transaksi')
    for seq in range(int(num_outliers * 0.3)):
        duplicate = random.choice(originals).copy()
        duplicate['id_transaksi'] = f"DUP{10000 + seq}"
        injected.append(duplicate)

    # 3. Suspicious transactions (different location, unusual time)
    suspicious_count = int(num_outliers * 0.3)
    if suspicious_count > 0:
        # Group once. A dict keeps insertion order, so the user list is
        # deterministic under a fixed seed (set iteration order is not).
        transactions_by_user = {}
        cities_by_user = {}
        for t in originals:
            transactions_by_user.setdefault(t['user_id'], []).append(t)
            if 'lokasi' in t:
                cities_by_user.setdefault(t['user_id'], set()).add(t['lokasi'])
        all_user_ids = list(transactions_by_user)

        for seq in range(suspicious_count):
            user_id = random.choice(all_user_ids)
            template = random.choice(transactions_by_user[user_id])
            suspicious = template.copy()

            suspicious['id_transaksi'] = f"SUS{10000 + seq}"
            suspicious['nominal'] = round(template['nominal'] * random.uniform(5, 10), -2)
            suspicious['deskripsi'] = "Transaksi tidak dikenal " + template['deskripsi']

            # Prefer a city the user has never transacted in. If the user has
            # been everywhere, fall back to any city other than the template's.
            visited = cities_by_user.get(user_id, set())
            unusual_cities = [city for city in cities if city not in visited]
            if not unusual_cities:
                unusual_cities = [city for city in cities if city != template.get('lokasi')]
            if unusual_cities:
                suspicious['lokasi'] = random.choice(unusual_cities)

            # Unusual time of day (00:00-05:59)
            suspicious['waktu'] = f"{random.randint(0, 5):02d}:{random.randint(0, 59):02d}"

            # Dated 1-3 days after the template transaction
            template_date = datetime.strptime(template['tanggal'], '%Y-%m-%d')
            shifted_date = template_date + timedelta(days=random.randint(1, 3))
            suspicious['tanggal'] = shifted_date.strftime('%Y-%m-%d')

            injected.append(suspicious)

    transactions.extend(injected)
    return transactions

In [None]:
# Generate the dataset from the parameters defined above; the returned
# DataFrame already contains the outlier rows added by the generator.
enhanced_transactions_df = generate_enhanced_transactions(num_users, num_transactions_per_user, start_date, end_date)

In [None]:
# Add extra patterns and variation on top of the generated transactions.

# Guard against running this cell twice on the same frame: every step below
# multiplies `nominal` in place, so a second run would silently compound the
# factors. 'anomaly_flag' only exists once this cell has completed.
if 'anomaly_flag' in enhanced_transactions_df.columns:
    raise RuntimeError(
        "Post-processing was already applied to enhanced_transactions_df; "
        "re-run the dataset generation cell before running this cell again."
    )

# Parsed dates live in a local Series, so no helper columns are added to
# (and later dropped from) the frame.
transaction_dates = pd.to_datetime(enhanced_transactions_df['tanggal'])
is_expense = enhanced_transactions_df['tipe'] == 'keluar'

# 1. Inflation trend (prices rise gradually): cumulative factor per year.
#    Years not listed keep a factor of 1.0.
inflation_by_year = {2022: 1.0, 2023: 1.05, 2024: 1.08}
inflation_factor = transaction_dates.dt.year.map(inflation_by_year).fillna(1.0)
enhanced_transactions_df['nominal'] = enhanced_transactions_df['nominal'] * inflation_factor

# 2. Payday pattern: discretionary spending is 30% higher on days 25-30
payday_mask = (
    transaction_dates.dt.day.between(25, 30)
    & is_expense
    & enhanced_transactions_df['kategori'].isin(['Makanan', 'Hiburan', 'Belanja'])
)
enhanced_transactions_df.loc[payday_mask, 'nominal'] *= 1.3

# 3. Weekend pattern: food and entertainment are 20% higher on Sat/Sun
weekend_mask = (
    (transaction_dates.dt.dayofweek >= 5)
    & is_expense
    & enhanced_transactions_df['kategori'].isin(['Makanan', 'Hiburan'])
)
enhanced_transactions_df.loc[weekend_mask, 'nominal'] *= 1.2

# Store dates as 'YYYY-MM-DD' strings
enhanced_transactions_df['tanggal'] = transaction_dates.dt.strftime('%Y-%m-%d')


# Flag rows that may need attention (label for anomaly-detection models)
def flag_potential_anomaly(row):
    """Return 1 when a transaction row looks anomalous, otherwise 0.

    A row is flagged when:
    * its id carries an injected-outlier prefix ('OUT', 'DUP' or 'SUS'), so
      injected duplicates are labelled too;
    * its description carries an injected-outlier marker; or
    * it is an expense above twice the maximum amount of its category.
    """
    if str(row['id_transaksi']).startswith(('OUT', 'DUP', 'SUS')):
        return 1

    if 'Unusual expense' in row['deskripsi'] or 'Transaksi tidak dikenal' in row['deskripsi']:
        return 1

    # Expense with a very high amount for its category
    if row['tipe'] == 'keluar' and row['kategori'] in expense_categories:
        cat_details = expense_categories[row['kategori']]
        if row['nominal'] > cat_details['max_amount'] * 2:
            return 1

    return 0

enhanced_transactions_df['anomaly_flag'] = enhanced_transactions_df.apply(flag_potential_anomaly, axis=1)


In [None]:
# Write to CSV (without the DataFrame index) and report the row count
enhanced_transactions_df.to_csv('data_transaksi_advanced.csv', index=False)

print(f"Dataset sintetis dengan {len(enhanced_transactions_df)} transaksi berhasil dibuat!")


Dataset sintetis dengan 6703 transaksi berhasil dibuat!


In [None]:
# Show some basic statistics
print("\nStatistik dataset:")
print(f"Jumlah transaksi: {len(enhanced_transactions_df)}")
print(f"Jumlah pengguna unik: {enhanced_transactions_df['user_id'].nunique()}")
print(f"Rentang tanggal: {enhanced_transactions_df['tanggal'].min()} sampai {enhanced_transactions_df['tanggal'].max()}")
print(f"Distribusi profil pengguna: {enhanced_transactions_df['profil'].value_counts().to_dict()}")
print(f"Distribusi kategori: {enhanced_transactions_df['kategori'].value_counts().to_dict()}")
print(f"Distribusi tipe transaksi: {enhanced_transactions_df['tipe'].value_counts().to_dict()}")
print(f"Rata-rata nominal: {enhanced_transactions_df['nominal'].mean():.2f}")
print(f"Jumlah transaksi anomali: {enhanced_transactions_df['anomaly_flag'].sum()}")

# Show a few sample rows
print("\nContoh data transaksi:")
print(enhanced_transactions_df.head())


Statistik dataset:
Jumlah transaksi: 6703
Jumlah pengguna unik: 25
Rentang tanggal: 2022-01-01 sampai 2024-05-03
Distribusi profil pengguna: {'family': 2146, 'young_professional': 1872, 'business_owner': 1617, 'student': 1068}
Distribusi kategori: {'Makanan': 1416, 'Transportasi': 1029, np.str_('Belanja'): 1026, 'Hiburan': 848, 'Gaji': 778, np.str_('Tagihan'): 549, np.str_('Freelance'): 305, np.str_('Lain-lain'): 300, np.str_('Investasi'): 185, np.str_('Bisnis'): 120, np.str_('Pendidikan'): 80, np.str_('Kesehatan'): 67}
Distribusi tipe transaksi: {np.str_('keluar'): 5315, np.str_('masuk'): 1388}
Rata-rata nominal: 3062448.87
Jumlah transaksi anomali: 531

Contoh data transaksi:
  id_transaksi     tanggal  waktu       nominal    tipe   kategori  \
0    TRX000001  2022-04-10  15:38  2.067600e+05  keluar    Makanan   
1    TRX000002  2023-12-02  11:42  1.261342e+06   masuk       Gaji   
2    TRX000003  2023-10-10  16:41  3.035182e+06   masuk  Freelance   
3    TRX000004  2023-10-06  18:4