In [1]:
import os
import numpy as np
import pandas as pd

# Change this to your own location. Only dataDir needs editing; the other
# paths are derived from it so they cannot drift apart.
#   dataDir    : folder holding the input CSV files
#   originPath : training data set (account transactions)
#   alertPath  : alert data set (alerted accounts)
#   outputPath : folder the preprocessed output is written to
dataDir = 'C:\\school\\SchoolProgram\\NTUST_CSIE_DS\\DataSet'
originPath = os.path.join(dataDir, 'acct_transaction.csv')
alertPath = os.path.join(dataDir, 'acct_alert.csv')
outputPath = dataDir

Load csv

In [2]:
# Read the transaction table and the alert-label table from disk.
df_origin = pd.read_csv(originPath)
df_alert = pd.read_csv(alertPath)

# Report the (rows, columns) shape of each loaded table.
for caption, frame in (("交易資料維度:", df_origin), ("警示標籤維度:", df_alert)):
    print(caption, frame.shape)
print("(Finish) Load Dataset.")

# For quick experiments, uncomment the next line to keep only the first 100000 rows.
#df_origin = df_origin.head(100000)

交易資料維度: (4435890, 10)
警示標籤維度: (1004, 2)
(Finish) Load Dataset.


轉出、轉入金額資訊

In [3]:
# Group the transaction amounts once per direction and reuse the grouped
# object for every statistic, instead of rebuilding the groupby each time.
amt_by_sender = df_origin.groupby('from_acct')['txn_amt']
amt_by_receiver = df_origin.groupby('to_acct')['txn_amt']

# 1. Total amount sent / received by each account.
send = amt_by_sender.sum().rename('total_send_amt')
recv = amt_by_receiver.sum().rename('total_recv_amt')

# 2. Max, min, mean, variance and standard deviation of the amounts each
#    account sent.
max_send = amt_by_sender.max().rename('max_send_amt')
min_send = amt_by_sender.min().rename('min_send_amt')
avg_send = amt_by_sender.mean().rename('avg_send_amt')
var_send = amt_by_sender.var().rename('var_send_amt')
std_send = amt_by_sender.std().rename('std_send_amt')

#    Same statistics for the amounts each account received.
max_recv = amt_by_receiver.max().rename('max_recv_amt')
min_recv = amt_by_receiver.min().rename('min_recv_amt')
avg_recv = amt_by_receiver.mean().rename('avg_recv_amt')
var_recv = amt_by_receiver.var().rename('var_recv_amt')
std_recv = amt_by_receiver.std().rename('std_recv_amt')

互相轉帳的交易占比

In [4]:
def getRepeatInfo(acct):
    """
    Summarise the repeat-counterparty behaviour of a single account.

    A counterparty X is treated as a repeat partner of ``acct`` when at
    least one of the following holds:
      1) acct -> X occurred two or more times;
      2) X -> acct occurred two or more times;
      3) at least one transaction exists in each direction.

    The function reads three module-level lookups:
      - ``send_map``:  account -> iterable of accounts it sent to
      - ``recv_map``:  account -> iterable of accounts it received from
      - ``pair_dict``: (from_acct, to_acct) -> number of transactions

    Returns a ``pd.Series`` with two entries:
      - 'repeatAccount': list of the repeat partners of ``acct``
      - 'repeatRatio':   transactions with repeat partners divided by all
                         transactions of ``acct`` (sent plus received);
                         0 when the account has no transactions.
    """
    # Counterparties in each direction.
    outgoing = set(send_map.get(acct, []))  # acct -> others
    incoming = set(recv_map.get(acct, []))  # others -> acct

    # Every transaction of acct: everything it sent plus everything it received.
    sent_total = sum(pair_dict.get((acct, peer), 0) for peer in outgoing)
    received_total = sum(pair_dict.get((peer, acct), 0) for peer in incoming)
    all_txn = sent_total + received_total

    repeat_peers = []
    repeat_txn = 0  # transactions exchanged with repeat partners
    for peer in outgoing | incoming:
        n_out = pair_dict.get((acct, peer), 0)  # acct -> peer
        n_in = pair_dict.get((peer, acct), 0)   # peer -> acct

        # Rule 1, rule 2, or rule 3 (traffic in both directions).
        if n_out >= 2 or n_in >= 2 or (n_out >= 1 and n_in >= 1):
            repeat_peers.append(peer)
            repeat_txn += n_out + n_in

    ratio = repeat_txn / all_txn if all_txn > 0 else 0

    return pd.Series({
        'repeatAccount': repeat_peers,
        'repeatRatio': ratio
    })
    
# Number of transactions for every ordered (from_acct, to_acct) pair, kept
# both as a table (pair_counts) and as a dict for constant-time lookups.
pair_key_cols = ['from_acct', 'to_acct']
pair_counts = df_origin.groupby(pair_key_cols).size().reset_index(name='cnt')
pair_dict = pair_counts.set_index(pair_key_cols)['cnt'].to_dict()

# For each account, the unique counterparties it sent to / received from.
send_map = df_origin.groupby('from_acct')['to_acct'].unique().to_dict()
recv_map = df_origin.groupby('to_acct')['from_acct'].unique().to_dict()

# Every account that appears on either side of a transaction.
unique_accts = pd.DataFrame({
    'acct': pd.concat([df_origin['from_acct'], df_origin['to_acct']]).unique()
})

# Evaluate the repeat-partner summary per account and index the result by
# the account id.
result = unique_accts['acct'].apply(getRepeatInfo)
result.index = unique_accts['acct']

repeatAccount = result['repeatAccount']
repeatRatio = result['repeatRatio'].rename('repeatRatio')

使用交易通路種類

In [5]:
def _channels_per_account(acct_col, series_name):
    """Sorted list of the distinct non-null channel types seen per account in ``acct_col``."""
    return (
        df_origin.groupby(acct_col)['channel_type']
        .apply(lambda channels: sorted(channels.dropna().unique()))
        .rename(series_name)
    )


def _count_distinct_channels(row):
    """Size of the union of the send-side and receive-side channel lists of one account."""
    distinct = set()
    for side in ('usedChannel_send', 'usedChannel_recv'):
        channels = row[side]
        # An account missing on one side holds NaN there instead of a list.
        if isinstance(channels, (list, np.ndarray)):
            distinct.update(channels)
    return len(distinct)


# Distinct channel types used by each account as sender and as receiver.
usedChannel_send = _channels_per_account('from_acct', 'usedChannel_send')
usedChannel_recv = _channels_per_account('to_acct', 'usedChannel_recv')

# Number of distinct channel types per account across both directions.
usedChannel = (
    pd.concat([usedChannel_send, usedChannel_recv], axis=1)
    .apply(_count_distinct_channels, axis=1)
    .rename('usedChannelTypes')
)

首次交易是否高轉帳金額

In [6]:
# A transaction counts as "high amount" when it exceeds the 80th percentile
# of all transaction amounts.
highAmountThreshold = df_origin['txn_amt'].quantile(0.8)

# Chronological sort keys used to decide which transaction is the "first".
# NOTE(review): the original sorted by 'txn_time' only. If the table also has
# a separate 'txn_date' column (presumably the transaction day -- confirm
# against the dataset schema), sorting by time alone picks the earliest
# time of day rather than the earliest transaction. The date column is
# therefore used as the leading key whenever it exists; when it does not,
# the ordering is exactly the original one.
timeSortKeys = (['txn_date'] if 'txn_date' in df_origin.columns else []) + ['txn_time']

# First sent / received transaction of every account (sort chronologically,
# then take the first entry per account). GroupBy.first() returns the first
# non-null value of each column within the group.
first_send = df_origin.sort_values(by=['from_acct'] + timeSortKeys).groupby('from_acct', as_index=False).first()
first_recv = df_origin.sort_values(by=['to_acct'] + timeSortKeys).groupby('to_acct', as_index=False).first()

# Index by account and flag whether the first amount exceeds the threshold.
isFirstHighAmount_send = (first_send.set_index('from_acct')['txn_amt'] > highAmountThreshold).rename('isFirstHighAmount_send')
isFirstHighAmount_recv = (first_recv.set_index('to_acct')['txn_amt'] > highAmountThreshold).rename('isFirstHighAmount_recv')

# Extend both flags to every account; an account with no transaction in the
# given direction gets False.
all_accts = pd.Index(pd.concat([df_origin['from_acct'], df_origin['to_acct']]).unique())
isFirstHighAmount_send = isFirstHighAmount_send.reindex(all_accts, fill_value=False)
isFirstHighAmount_recv = isFirstHighAmount_recv.reindex(all_accts, fill_value=False)

Merge attributes

In [7]:
# All per-account feature Series, in the column order of the output table.
feature_series = [
    max_send, min_send, avg_send, var_send, std_send,
    max_recv, min_recv, avg_recv, var_recv, std_recv,
    send, recv, usedChannel,
    isFirstHighAmount_send, isFirstHighAmount_recv, repeatRatio,
]

# Align the features on the account index, fill features an account lacks
# with 0, and expose the account id as a regular 'acct' column.
df_result = (
    pd.concat(feature_series, axis=1)
    .fillna(0)
    .reset_index()
    .rename(columns={'index': 'acct'})
)

export csv

In [8]:
# Label every account: 1 if it appears in the alert table, otherwise 0.
df_result['label'] = df_result['acct'].isin(df_alert['acct']).astype(int)

# Build the output file path with os.path.join instead of a literal "\\".
# On non-Windows systems the literal backslash would become part of the
# file name; on Windows the resulting path is the same as before.
outputFile = os.path.join(outputPath, "preprocessing_T3.csv")
df_result.to_csv(outputFile, index=False)
print(f"(Finish) Save preprocessed data to {outputFile}.")

(Finish) Save preprocessed data to C:\school\SchoolProgram\NTUST_CSIE_DS\DataSet\preprocessing_T3.csv.
