In [87]:
import pandas as pd
import numpy as np
import json
from collections import defaultdict
import pickle

In [None]:
# 讀取數據
account_info = pd.read_csv("./train/train_account_info.csv")
customer_info = pd.read_csv('./train/train_customer_info.csv')
account_transactions = pd.read_csv('./train//train_account_transactions.csv')
suspicious_accounts = pd.read_csv('./train//train_suspicious_accounts.csv')

In [89]:
account_customer_map = account_info[['account_number', 'customer_id']].drop_duplicates()

# 合併客戶信息
merged_base = pd.merge(
    account_customer_map,
    customer_info.drop('region_code', axis=1),
    on='customer_id',
    how='left'
)
print(merged_base)

# 合併帳戶信息
merged_base = pd.merge(
    merged_base,
    account_info.drop('customer_id', axis=1).drop('account_open_date', axis=1),
    on='account_number',
    how='left'
)

# 添加可疑帳戶標記
suspicious_accounts_set = set(suspicious_accounts['account_number'].unique())
merged_base['is_suspicious'] = merged_base['account_number'].apply(
    lambda x: 1 if x in suspicious_accounts_set else 0
)

print(f"基礎合併數據形狀: {merged_base.shape}")
print(merged_base.head())

      account_number customer_id  aut_amt  age  income_level
0           ACCT6068      ID5684   256930   61          25.0
1          ACCT11459     ID10838       65   57           NaN
2          ACCT15832     ID15012    14438   56         126.0
3          ACCT15612     ID14797    43872   46           NaN
4          ACCT18659     ID17677  2578166   72          25.0
...              ...         ...      ...  ...           ...
24964      ACCT18619     ID17645     2843   15          25.0
24965      ACCT20836     ID19721      142   20          25.0
24966      ACCT24372     ID23132      474   20          25.0
24967        ACCT501       ID463      175   20          25.0
24968      ACCT16300     ID15460    24430   36          25.0

[24969 rows x 5 columns]
基礎合併數據形狀: (24969, 8)
  account_number customer_id  aut_amt  age  income_level  is_unreachable  \
0       ACCT6068      ID5684   256930   61          25.0               0   
1      ACCT11459     ID10838       65   57           NaN             

In [90]:
incomeMapping = {
  "15.0": "0-300K",
  "30.0": "300K-610K",
  "45.0": "310K-600K",
  "70.0": "610K-800K",
  "90.0": "810K-1M",
  "125.0": "1.01M-1.5M",
  "225.0": "1.51M-3M",
  "400.0": "3.01M-5M",
  "750.0": "5.01M-10M",
  "1000.0": "Above 10M",
  "25.0": "0-500K",
  "126.0": "510K-2M",
  "350.0": "2.01M-5M",
  'nan': "missing"
}

In [91]:
# 初始化存儲結構
account_data = {}

# 首先填充帳戶和客戶信息
for _, row in merged_base.iterrows():
    account_number = row['account_number']
    row = row.to_dict()
    if row['income_level']:
        row['income_level'] = incomeMapping[str(row['income_level'])]
    else:
        row['income_level'] = "missing"
    # 創建帳戶數據字典
    account_data[account_number] = {
        'account_info': row,  # 包含所有帳戶和客戶信息
        'transaction_feature' : {},
        'transactions': []  # 初始化空交易列表
    }

In [92]:
channelMapping = {
  1: "Check Batch Processing",
  2: "Batch Salary Transfer",
  3: "AID System Securities Secondary Debit",
  4: "Bancs-Link Counter Transaction/PD-NET/VCCS Online Query Transaction",
  5: "Foreign Exchange Online Transaction",
  6: "Fund Online Transaction",
  7: "WBS",
  8: "Batch Account Opening",
  9: "BIZTALK-CR-CARD-TXN Credit Card Online Transaction including Card APP",
  10: "BATCH-TXN General Batch Processing",
  11: "FEDI-TNX FEDI",
  12: "Voice Online Transaction/VCCS Online Query Transaction",
  13: "Trickle Feed Transaction (API Securities Broker Batch Processing)",
  14: "Inward Remittance",
  15: "(Internet Banking",
  16: "Trickle Feed Transaction/API Batch Processing",
  17: "FEP-ATM-TXN (ATM)",
  18: "DIGITAL-BANK-TXN (Digital Banking)",
  19: "STORED-VALUE-PAYMENT (Value Storage Platform)"
}

codeMapping = {
  1: "Cash Deposit",
  2: "Transfer Deposit",
  3: "Transfer Withdrawal",
  4: "Cash Withdrawal",
  5: "Transfer Out Correction",
  6: "Deposit Interest",
  7: "Cash Deposit",
  8: "Check Deposit",
  9: "Check Transfer",
  10: "Transfer Deposit",
  11: "Media Transfer",
  12: "Transfer Deposit (Passbook-less Transfer)",
  13: "Authorized Transfer In",
  14: "Transfer Withdrawal",
  15: "Transfer Withdrawal",
  16: "Transfer Interest Withdrawal",
  17: "Authorized Transfer Out",
  18: "Cash Withdrawal",
  19: "Transfer to Issue Bank Check",
  20: "Media Transfer",
  21: "Cash Deposit Correction",
  22: "Transfer Out Correction",
  23: "Transfer Out Correction",
  24: "CD Deposit (ATM)",
  25: "CD Transfer In (ATM Transfer In)",
  26: "CD Transfer In",
  27: "CD Transfer Out (ATM Interbank Transfer Out)",
  28: "CD Transfer Out",
  29: "CD Withdrawal (ATM Cash Withdrawal)",
  30: "CD Withdrawal (Interbank Withdrawal)",
  31: "Payment Transfer Out",
  32: "Cross-border Purchase",
  33: "Consumption Debit",
  34: "Cross-border Return",
  35: "Payment Transfer Out",
  36: "CD Transfer Reversal",
  37: "CD Correction",
  38: "CD Correction",
  39: "SP Consumption Reversal",
  40: "Payment Transfer Reversal",
  41: "Media Transfer In",
  42: "Transfer Deposit",
  43: "Media Transfer Out",
  44: "Transfer Withdrawal",
  45: "Card Refund",
  46: "Late Fee",
  47: "Card Consumption",
  48: "Supplementary Health Insurance Premium",
  49: "Income Tax",
  50: "Income Tax",
  51: "Overdraft Interest",
  52: "Check Payment",
  53: "Check Cash Withdrawal",
  54: "Check Transfer"
}

In [None]:
# 統計需要處理的交易總數
total_transactions = len(account_transactions)
print(f"開始處理 {total_transactions} 筆交易...")

# 批次處理交易數據以避免內存問題
batch_size = 50000
num_batches = (total_transactions + batch_size - 1) // batch_size

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, total_transactions)
    
    print(f"處理交易批次 {i+1}/{num_batches} (行 {start_idx} 到 {end_idx})...")
    
    # 獲取當前批次的交易
    batch_transactions = account_transactions.iloc[start_idx:end_idx]
    
    # 遍歷此批次的交易並添加到相應的帳戶
    for _, txn in batch_transactions.iterrows():
        account_number = txn['account_number']
        
        # 確保帳戶存在於我們的數據結構中
        if account_number in account_data:
            # 將交易轉換為字典並添加到帳戶的交易列表
            txn = txn.to_dict()
            txn["transaction_channel"] = channelMapping[txn["transaction_channel"]]
            txn["transaction_code"] = codeMapping[txn["transaction_code"]]
            account_data[account_number]['transactions'].append(txn)
            if len(account_data[account_number]['transactions']) > 70:
                account_data[account_number]['transactions'] = sorted(
                    account_data[account_number]['transactions'],
                    key=lambda x: (x.get('transaction_date', 0), x.get('transaction_hour', 0))
                )[-70:] 


開始處理 206333 筆交易...
處理交易批次 1/5 (行 0 到 50000)...
處理交易批次 2/5 (行 50000 到 100000)...
處理交易批次 3/5 (行 100000 到 150000)...
處理交易批次 4/5 (行 150000 到 200000)...
處理交易批次 5/5 (行 200000 到 206333)...

分析字典式數據結構...
總帳戶數: 24969
有交易記錄的帳戶數: 24969
無交易記錄的帳戶數: 0
每個帳戶的平均交易數: 7.84
交易數量最多的帳戶: ACCT27189 有 70 筆交易

示例帳戶數據:

帳戶 1: ACCT6068
  客戶ID: ID5684
  是否為可疑帳戶: 否
  交易數量: 2
  交易示例:
    交易 1: 日期=18264, 金額=8531, 方向=出帳
    交易 2: 日期=18278, 金額=752, 方向=出帳

帳戶 2: ACCT11459
  客戶ID: ID10838
  是否為可疑帳戶: 否
  交易數量: 12
  交易示例:
    交易 1: 日期=18260, 金額=1554, 方向=入帳
    交易 2: 日期=18260, 金額=201, 方向=出帳
    交易 3: 日期=18260, 金額=605, 方向=出帳

帳戶 3: ACCT15832
  客戶ID: ID15012
  是否為可疑帳戶: 否
  交易數量: 6
  交易示例:
    交易 1: 日期=18263, 金額=13995, 方向=出帳
    交易 2: 日期=18270, 金額=10870, 方向=入帳
    交易 3: 日期=18277, 金額=34995, 方向=出帳

帳戶 4: ACCT15612
  客戶ID: ID14797
  是否為可疑帳戶: 否
  交易數量: 2
  交易示例:
    交易 1: 日期=18268, 金額=2018, 方向=出帳
    交易 2: 日期=18278, 金額=114, 方向=出帳

帳戶 5: ACCT18659
  客戶ID: ID17677
  是否為可疑帳戶: 否
  交易數量: 9
  交易示例:
    交易 1: 日期=18278, 金額=1919, 方向=出帳
   

In [94]:
# 方法2: 將結構轉換為JSON格式（更通用，但可能會很大）
# 注意: 這可能會產生非常大的文件，取決於數據量
try:
    with open('account_data_sample.json', 'w') as f:
        sample_data = {k: account_data[k] for k in list(account_data.keys())}
        json.dump(sample_data, f, indent=2)
except Exception as e:
    print(f"保存JSON時出錯: {e}")

# 7. 示例：如何使用這個數據結構進行分析
print("\n示例分析:")

# 示例1: 計算每個帳戶的交易統計資訊
print("示例1: 計算交易統計資訊")
account_stats = {}
sus_many_txn = []
non_many_txn = []

for account, data in account_data.items():
    txns = data['transactions']
    if not txns:
        continue
    
    # 交易金額統計
    amounts = [t.get('transaction_amount', 0) for t in txns]
    
    # 入帳和出帳交易
    incoming = [t for t in txns if t.get('transaction_direction') == 2]
    outgoing = [t for t in txns if t.get('transaction_direction') == 1]

    # 計算交易日期範圍
    transaction_dates = [t.get('transaction_date') for t in txns]
    min_date = min(transaction_dates) if transaction_dates else 0
    max_date = max(transaction_dates) if transaction_dates else 0
    date_range = max_date - min_date + 1 if min_date != max_date else 1  # 至少為1天
    
    # 計算日均交易筆數
    daily_avg_txn = len(txns) / date_range if date_range > 0 else 0
    if len(txns) > 80:
        if data['account_info'].get('is_suspicious'): sus_many_txn.append(account)
        else: non_many_txn.append(account)
    # 收集統計資訊
    account_stats[account] = {
        'txn_count': len(txns),
        'avg_amount': np.mean(amounts) if amounts else 0,
        'max_amount': max(amounts) if amounts else 0,
        'incoming_count': len(incoming),
        'outgoing_count': len(outgoing),
        'incoming_ratio': len(incoming) / len(txns) if txns else 0,
        'is_suspicious': data['account_info'].get('is_suspicious', 0)
    }
    
    data['transaction_feature'] = {
        'txn_count': len(txns),
        'total_txn_amount' : sum(amounts),
        'avg_txn_amount': np.mean(amounts) if amounts else 0,
        'max_txn_amount': max(amounts) if amounts else 0,
        'incoming_count': len(incoming),
        'outgoing_count': len(outgoing),
        'incoming_ratio': len(incoming) / len(txns) if txns else 0,
        'daily_avg_txn': daily_avg_txn,
        'txn_date_range': date_range
    }

# 將統計資訊轉換為DataFrame以便分析
stats_df = pd.DataFrame.from_dict(account_stats, orient='index')
print("交易統計資訊數據框頭部:")
print(stats_df.head())

# 比較可疑帳戶和非可疑帳戶的統計差異
suspicious_stats = stats_df[stats_df['is_suspicious'] == 1].mean()
non_suspicious_stats = stats_df[stats_df['is_suspicious'] == 0].mean()

print("\n可疑帳戶 vs 非可疑帳戶平均統計:")
comparison = pd.DataFrame({
    '可疑帳戶': suspicious_stats,
    '非可疑帳戶': non_suspicious_stats,
    '差異比例': suspicious_stats / non_suspicious_stats
})
print(comparison)

print("\n數據結構創建和分析完成!")


示例分析:
示例1: 計算交易統計資訊
交易統計資訊數據框頭部:
           txn_count    avg_amount  max_amount  incoming_count  \
ACCT6068           2   4641.500000        8531               2   
ACCT11459         12   1402.916667        5018               5   
ACCT15832          6  19318.166667       34995               3   
ACCT15612          2   1066.000000        2018               2   
ACCT18659          9  71438.555556       99995               8   

           outgoing_count  incoming_ratio  is_suspicious  
ACCT6068                0        1.000000              0  
ACCT11459               7        0.416667              0  
ACCT15832               3        0.500000              0  
ACCT15612               0        1.000000              0  
ACCT18659               1        0.888889              0  

可疑帳戶 vs 非可疑帳戶平均統計:
                         可疑帳戶         非可疑帳戶      差異比例
txn_count           20.890000      7.627702  2.738702
avg_amount       44773.082300  16825.735446  2.660988
max_amount      161210.060000  55

In [96]:
import json
import os
from datetime import datetime

# 2. 分離可疑帳戶和非可疑帳戶
print("\n分離可疑帳戶和非可疑帳戶...")

suspicious_accounts = {}
non_suspicious_accounts = {}
output_dir_sus = "split_accounts/suspicious"
output_dir_non = "split_accounts/non_suspicious"
os.makedirs(output_dir_sus, exist_ok=True)
os.makedirs(output_dir_non, exist_ok=True)

for account_number, data in account_data.items():
    if data['account_info'].get('is_suspicious', 0) == 1:
        suspicious_accounts[account_number] = data
        suspicious_file = os.path.join(output_dir_sus, f"suspicious_accounts_{account_number}.json")
        with open(suspicious_file, 'w', encoding='utf-8') as f:
            json.dump(account_data[account_number], f, indent=2, ensure_ascii=False)
    else:
        non_suspicious_accounts[account_number] = data
        non_suspicious_file = os.path.join(output_dir_non, f"non_suspicious_accounts_{account_number}.json")
        with open(non_suspicious_file, 'w', encoding='utf-8') as f:
            json.dump(account_data[account_number], f, indent=2, ensure_ascii=False)

print(f"可疑帳戶數量: {len(suspicious_accounts)}")
print(f"非可疑帳戶數量: {len(non_suspicious_accounts)}")

# 4. 生成時間戳（用於文件名）
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = "split_accounts"
# 5. 保存可疑帳戶JSON
suspicious_file = os.path.join(output_dir, f"suspicious_accounts.json")
with open(suspicious_file, 'w', encoding='utf-8') as f:
    json.dump(suspicious_accounts, f, indent=2, ensure_ascii=False)
print(f"可疑帳戶已保存為: {suspicious_file}")

# 6. 保存非可疑帳戶JSON
non_suspicious_file = os.path.join(output_dir, f"non_suspicious_accounts.json")
with open(non_suspicious_file, 'w', encoding='utf-8') as f:
    json.dump(non_suspicious_accounts, f, indent=2, ensure_ascii=False)
print(f"非可疑帳戶已保存為: {non_suspicious_file}")

# 7. 輸出兩種帳戶的基本統計比較
print("\n帳戶統計比較：")

# 計算交易數量
suspicious_txn_count = sum(len(data['transactions']) for data in suspicious_accounts.values())
non_suspicious_txn_count = sum(len(data['transactions']) for data in non_suspicious_accounts.values())

# 計算平均每個帳戶的交易數
suspicious_avg_txn = suspicious_txn_count / len(suspicious_accounts) if suspicious_accounts else 0
non_suspicious_avg_txn = non_suspicious_txn_count / len(non_suspicious_accounts) if non_suspicious_accounts else 0

print(f"可疑帳戶總交易數: {suspicious_txn_count}")
print(f"非可疑帳戶總交易數: {non_suspicious_txn_count}")
print(f"可疑帳戶平均交易數: {suspicious_avg_txn:.2f}")
print(f"非可疑帳戶平均交易數: {non_suspicious_avg_txn:.2f}")

print("\n分離完成!")


分離可疑帳戶和非可疑帳戶...
可疑帳戶數量: 400
非可疑帳戶數量: 24569
可疑帳戶已保存為: split_accounts/suspicious_accounts.json
非可疑帳戶已保存為: split_accounts/non_suspicious_accounts.json

帳戶統計比較：
可疑帳戶總交易數: 8356
非可疑帳戶總交易數: 187405
可疑帳戶平均交易數: 20.89
非可疑帳戶平均交易數: 7.63

分離完成!


In [1]:
import os
import random
import shutil

# 設定資料夾路徑
folder_path = './split_accounts/non_suspicious/'  # 例如：'non_suspicious'
output_folder = './split_accounts/non_suspicious_400/'   # 想存放隨機選出來的檔案，可以自訂

# 確保結果資料夾存在
os.makedirs(output_folder, exist_ok=True)

# 取得資料夾中所有 .json 檔案
all_files = [f for f in os.listdir(folder_path) if f.endswith('.json')]

# 隨機挑選 400 個
selected_files = random.sample(all_files, 400)

# 複製到新資料夾
for file_name in selected_files:
    src_path = os.path.join(folder_path, file_name)
    dst_path = os.path.join(output_folder, file_name)
    shutil.copy(src_path, dst_path)

print(f"已隨機挑選並複製 {len(selected_files)} 個檔案到 {output_folder} 資料夾中！")


已隨機挑選並複製 400 個檔案到 ./split_accounts/non_suspicious_400/ 資料夾中！
