In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

## Load data

In [20]:
def load_data():
    """載入原始數據"""
    print("載入原始數據...")
    # 讀取數據
    account_info = pd.read_csv('./Train/train_account_info.csv')
    customer_info = pd.read_csv('./Train/train_customer_info.csv')
    account_transactions = pd.read_csv('./Train/train_account_transactions.csv')
    suspicious_accounts = pd.read_csv('./Train/train_suspicious_accounts.csv')
    
    print(f"Account Info 形狀: {account_info.shape}")
    print(f"Account Transactions 形狀: {account_transactions.shape}")
    print(f"Customer Info 形狀: {customer_info.shape}")
    print(f"Suspicious Accounts 形狀: {suspicious_accounts.shape}")
    
    return account_info, customer_info, account_transactions, suspicious_accounts

In [21]:
def merge_basic_data(account_info, customer_info):
    """合併基本帳戶和客戶信息"""
    print("合併基本數據...")
    # 創建客戶-帳戶的對應關係
    account_customer_map = account_info[['account_number', 'customer_id']].drop_duplicates()
    print(f"客戶-帳戶映射表形狀: {account_customer_map.shape}")
    
    # 合併客戶資訊
    merged_data = pd.merge(
        account_customer_map, 
        customer_info, 
        on='customer_id', 
        how='left'
    )
    print(f"合併客戶資訊後的形狀: {merged_data.shape}")
    
    # 合併帳戶資訊
    merged_data = pd.merge(
        merged_data,
        account_info.drop('customer_id', axis=1),
        on='account_number',
        how='left'
    )
    print(f"合併帳戶資訊後的形狀: {merged_data.shape}")
    
    return merged_data

## Feature Engineering

In [22]:
def create_transaction_features(account_transactions):
    """從交易數據中提取特徵"""
    print("從交易數據提取特徵...")
    
    # 1. 基本交易統計特徵
    transaction_count = account_transactions.groupby('account_number').size().reset_index(name='transaction_count')
    transaction_amount_mean = account_transactions.groupby('account_number')['transaction_amount'].mean().reset_index(name='avg_transaction_amount')
    transaction_amount_sum = account_transactions.groupby('account_number')['transaction_amount'].sum().reset_index(name='total_transaction_amount')
    transaction_amount_std = account_transactions.groupby('account_number')['transaction_amount'].std().reset_index(name='std_transaction_amount')
    transaction_amount_max = account_transactions.groupby('account_number')['transaction_amount'].max().reset_index(name='max_transaction_amount')
    
    # 2. 交易方向特徵
    transaction_direction = account_transactions.groupby('account_number')['transaction_direction'].mean().reset_index(name='income_ratio')
    
    # 3. 銀行使用行為特徵
    mobile_banking_usage = account_transactions.groupby('account_number')['mobile_banking_check_count'].sum().reset_index(name='mobile_banking_usage')
    ebanking_usage = account_transactions.groupby('account_number')['ebanking_check_count'].sum().reset_index(name='ebanking_usage')
    same_ip_ratio = account_transactions.groupby('account_number')['is_same_ip'].mean().reset_index(name='same_ip_ratio')
    same_device_ratio = account_transactions.groupby('account_number')['is_same_device'].mean().reset_index(name='same_device_ratio')
    
    # 4. 特殊交易特徵
    # 與特定帳號的交易
    account_bank_txn = account_transactions[account_transactions['counterparty_account'] == 'ACCT31429']
    bank_txn_count = account_bank_txn.groupby('account_number').size().reset_index(name='bank_txn_count')
    bank_txn_avg = account_bank_txn.groupby('account_number')['transaction_amount'].mean().reset_index(name='avg_bank_txn_amount')
    
    # 與特定客戶ID的交易
    account_virtual_txn = account_transactions[account_transactions['counterparty_customer_id'] == 'ID99999']
    virtual_txn_count = account_virtual_txn.groupby('account_number').size().reset_index(name='virtual_txn_count')
    virtual_txn_avg = account_virtual_txn.groupby('account_number')['transaction_amount'].mean().reset_index(name='avg_virtual_txn_amount')
    
    # 5. 交易對方多樣性
    counterparty_diversity = account_transactions.groupby('account_number')['counterparty_account'].nunique().reset_index(name='counterparty_diversity')
    
    # 6. 交易渠道和代碼使用次數
    channel_usage = pd.get_dummies(account_transactions['transaction_channel']).groupby(account_transactions['account_number']).sum()
    channel_usage.columns = [f'channel_{col}_count' for col in channel_usage.columns]
    
    code_usage = pd.get_dummies(account_transactions['transaction_code']).groupby(account_transactions['account_number']).sum()
    code_usage.columns = [f'code_{col}_count' for col in code_usage.columns]
    
    # 合併所有交易特徵
    transaction_features = transaction_count
    features_to_merge = [
        transaction_amount_mean, transaction_amount_sum, transaction_amount_std,
        transaction_amount_max, transaction_direction, mobile_banking_usage,
        ebanking_usage, same_ip_ratio, same_device_ratio, bank_txn_count,
        bank_txn_avg, virtual_txn_count, virtual_txn_avg, counterparty_diversity
    ]
    
    for feature in features_to_merge:
        transaction_features = pd.merge(
            transaction_features,
            feature,
            on='account_number',
            how='left'
        )
    
    # 合併渠道和代碼使用特徵
    transaction_features = pd.merge(
        transaction_features,
        channel_usage.reset_index(),
        on='account_number',
        how='left'
    )
    
    transaction_features = pd.merge(
        transaction_features,
        code_usage.reset_index(),
        on='account_number',
        how='left'
    )
    
    print(f"交易特徵的形狀: {transaction_features.shape}")
    return transaction_features

In [23]:
def merge_transaction_features(merged_data, transaction_features):
    """將交易特徵合併到主數據集"""
    print("合併交易特徵到主數據集...")
    merged_data_with_txn = pd.merge(
        merged_data,
        transaction_features,
        on='account_number',
        how='left'
    )
    
    # 填充缺失值
    numeric_txn_cols = transaction_features.columns.drop('account_number')
    for col in numeric_txn_cols:
        merged_data_with_txn[col] = merged_data_with_txn[col].fillna(0)
    
    print(f"合併後的數據形狀: {merged_data_with_txn.shape}")
    return merged_data_with_txn


In [24]:
def add_suspicious_flag(merged_data_with_txn, suspicious_accounts):
    """添加可疑帳戶標記"""
    print("添加可疑帳戶標記...")
    suspicious_accounts_list = suspicious_accounts['account_number'].tolist()
    
    # 添加is_suspicious標記
    merged_data_with_txn['is_suspicious'] = merged_data_with_txn['account_number'].apply(
        lambda x: 1 if x in suspicious_accounts_list else 0
    )
    
    # 顯示可疑帳戶數量統計
    suspicious_count = merged_data_with_txn['is_suspicious'].sum()
    total_accounts = len(merged_data_with_txn)
    print(f"可疑帳戶數量: {suspicious_count} ({suspicious_count/total_accounts:.2%})")
    print(f"非可疑帳戶數量: {total_accounts - suspicious_count} ({(total_accounts - suspicious_count)/total_accounts:.2%})")
    
    return merged_data_with_txn

In [25]:
def create_advanced_features(df):
    """創建進階特徵"""
    print("創建進階特徵...")
    
    # 1. 時間相關特徵
    if 'account_open_date' in df.columns:
        # 帳戶存在天數 (假設最新日期為18250)
        df['account_age'] = 18250 - df['account_open_date']
        
        # 帳戶平均每日交易頻率
        if 'transaction_count' in df.columns:
            df['daily_txn_rate'] = df['transaction_count'] / (df['account_age'] + 1)
            # 處理極端值
            q99 = df['daily_txn_rate'].quantile(0.99)
            df['daily_txn_rate'] = df['daily_txn_rate'].clip(upper=q99)
    
    # 2. 交易頻率與金額比率特徵
    if all(col in df.columns for col in ['transaction_count', 'account_age']):
        df['transaction_frequency'] = df['transaction_count'] / (df['account_age'] / 365 + 0.001)
        # 處理極端值
        q99 = df['transaction_frequency'].quantile(0.99)
        df['transaction_frequency'] = df['transaction_frequency'].clip(upper=q99)
    
    if all(col in df.columns for col in ['avg_transaction_amount', 'aum_amt']):
        df['avg_txn_to_aum_ratio'] = df['avg_transaction_amount'] / (df['aum_amt'] + 1)
        # 處理極端值
        q99 = df['avg_txn_to_aum_ratio'].quantile(0.99)
        df['avg_txn_to_aum_ratio'] = df['avg_txn_to_aum_ratio'].clip(upper=q99)
    
    if all(col in df.columns for col in ['max_transaction_amount', 'aum_amt']):
        df['max_txn_to_aum_ratio'] = df['max_transaction_amount'] / (df['aum_amt'] + 1)
        # 處理極端值
        q99 = df['max_txn_to_aum_ratio'].quantile(0.99)
        df['max_txn_to_aum_ratio'] = df['max_txn_to_aum_ratio'].clip(upper=q99)
    
    if all(col in df.columns for col in ['max_transaction_amount', 'avg_transaction_amount']):
        # 最大交易與平均交易金額比率
        df['max_to_avg_ratio'] = df['max_transaction_amount'] / (df['avg_transaction_amount'] + 1)
        q99 = df['max_to_avg_ratio'].quantile(0.99)
        df['max_to_avg_ratio'] = df['max_to_avg_ratio'].clip(upper=q99)
    
    # 3. 交易行為異常指標
    if all(col in df.columns for col in ['std_transaction_amount', 'avg_transaction_amount']):
        df['txn_volatility'] = df['std_transaction_amount'] / (df['avg_transaction_amount'] + 1)
        # 處理極端值
        q99 = df['txn_volatility'].quantile(0.99)
        df['txn_volatility'] = df['txn_volatility'].clip(upper=q99)
    
    # 4. 入帳與出帳的比例
    if 'income_ratio' in df.columns:
        # 避免除以零
        outgoing_ratio = 1 - df['income_ratio']
        outgoing_ratio = outgoing_ratio.replace(0, 0.001)
        df['inout_ratio'] = df['income_ratio'] / outgoing_ratio
        # 處理極端值
        q99 = df['inout_ratio'].quantile(0.99)
        df['inout_ratio'] = df['inout_ratio'].clip(upper=q99)
    
    # 5. 銀行交易比例
    if all(col in df.columns for col in ['bank_txn_count', 'transaction_count']):
        df['bank_txn_ratio'] = df['bank_txn_count'] / (df['transaction_count'] + 0.001)
    
    # 6. 網路行動銀行使用比例
    if all(col in df.columns for col in ['mobile_banking_usage', 'ebanking_usage', 'transaction_count']):
        df['digital_banking_ratio'] = (df['mobile_banking_usage'] + df['ebanking_usage']) / (df['transaction_count'] + 0.001)
    
    # 7. IP和設備行為異常指標
    if all(col in df.columns for col in ['same_ip_ratio', 'same_device_ratio']):
        df['device_change_score'] = (1 - df['same_ip_ratio']) + (1 - df['same_device_ratio'])
        
        # IP變更頻率和設備變更頻率
        if 'transaction_count' in df.columns:
            df['ip_change_freq'] = (1 - df['same_ip_ratio']) * df['transaction_count']
            df['device_change_freq'] = (1 - df['same_device_ratio']) * df['transaction_count']
    
    # 8. 交易對象多樣性相關特徵
    if all(col in df.columns for col in ['counterparty_diversity', 'transaction_count']):
        df['counterparty_txn_ratio'] = df['counterparty_diversity'] / (df['transaction_count'] + 0.001)
    
    # 9. 交易渠道多樣性特徵
    channel_columns = [col for col in df.columns if col.startswith('channel_') and col.endswith('_count')]
    if channel_columns:
        # 計算使用的渠道數量
        df['channel_diversity'] = df[channel_columns].apply(lambda row: np.sum(row > 0), axis=1)
        # 計算最常用渠道的使用比例
        df['main_channel_ratio'] = df[channel_columns].apply(lambda row: np.max(row) / (np.sum(row) + 0.001), axis=1)
    
    # 10. 交易代碼多樣性
    code_columns = [col for col in df.columns if col.startswith('code_') and col.endswith('_count')]
    if code_columns:
        # 計算使用的交易代碼數量
        df['code_diversity'] = df[code_columns].apply(lambda row: np.sum(row > 0), axis=1)
        # 計算不尋常交易代碼的使用次數 (假設code_45至code_54為不尋常代碼)
        unusual_code_columns = [f'code_{i}_count' for i in range(45, 55) if f'code_{i}_count' in df.columns]
        if unusual_code_columns:
            df['unusual_code_usage'] = df[unusual_code_columns].sum(axis=1)
    
    return df

In [26]:
def handle_missing_and_outliers(df):
    """處理缺失值和異常值"""
    print("處理缺失值和異常值...")
    
    # 1. 處理數值型特徵的缺失值
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    print(f"發現 {len(numeric_columns)} 個數值型特徵")
    
    # 使用中位數填充基本特徵的缺失值
    for col in ['age', 'income_level', 'region_code']:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median())
    
    # 2. 檢查並處理無限值和NaN
    for col in numeric_columns:
        # 檢查並替換無限值
        df[col] = df[col].replace([np.inf, -np.inf], np.nan)
        
        # 檢查NaN值並填充
        nan_count = df[col].isna().sum()
        if nan_count > 0:
            print(f"列 {col} 有 {nan_count} 個NaN值")
            df[col] = df[col].fillna(df[col].median() if df[col].notnull().any() else 0)
    
    # 3. 使用SimpleImputer處理剩餘缺失值
    imputer = SimpleImputer(strategy='median')
    df[numeric_columns] = imputer.fit_transform(df[numeric_columns])
    
    # 4. 檢查是否還有NaN
    if df.isna().sum().sum() > 0:
        print(f"警告: 仍有 {df.isna().sum().sum()} 個NaN值，進行填充")
        df = df.fillna(0)
    else:
        print("所有NaN值已處理完成")
    
    return df

In [27]:
def remove_high_correlation_features(X):
    """移除高度相關特徵"""
    print("檢查和移除高度相關特徵...")
    
    # 計算特徵相關性
    corr_matrix = X.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    high_corr_features = [column for column in upper.columns if any(upper[column] > 0.95)]
    
    # 顯示將被移除的高度相關特徵
    if high_corr_features:
        print(f"將移除 {len(high_corr_features)} 個高度相關特徵: {high_corr_features[:5]}{'...' if len(high_corr_features) > 5 else ''}")
        X = X.drop(high_corr_features, axis=1)
    else:
        print("未發現高度相關特徵")
    
    return X

In [28]:
def standardize_features(X_train, X_test):
    """標準化特徵"""
    print("標準化特徵...")
    
    # 使用健壯縮放器，更能處理異常值
    numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns
    robust_scaler = RobustScaler()  # 使用健壯縮放代替標準縮放
    
    # 應用健壯縮放
    X_train_scaled = X_train.copy()
    X_train_scaled[numeric_features] = robust_scaler.fit_transform(X_train[numeric_features])
    
    # 測試集也進行健壯縮放
    X_test_scaled = X_test.copy()
    X_test_scaled[numeric_features] = robust_scaler.transform(X_test[numeric_features])
    
    return X_train_scaled, X_test_scaled, robust_scaler

In [44]:
account_info, customer_info, account_transactions, suspicious_accounts = load_data()
merged_data = merge_basic_data(account_info, customer_info)
transaction_features = create_transaction_features(account_transactions)
merged_data_with_txn = merge_transaction_features(merged_data, transaction_features)


載入原始數據...
Account Info 形狀: (24969, 5)
Account Transactions 形狀: (206333, 18)
Customer Info 形狀: (23655, 5)
Suspicious Accounts 形狀: (400, 3)
合併基本數據...
客戶-帳戶映射表形狀: (24969, 2)
合併客戶資訊後的形狀: (24969, 6)
合併帳戶資訊後的形狀: (24969, 9)
從交易數據提取特徵...
交易特徵的形狀: (24969, 85)
合併交易特徵到主數據集...
合併後的數據形狀: (24969, 93)


In [45]:
# --- 1. 載入交易資料 ---
txn = pd.read_csv('Train\\train_account_transactions.csv')

# --- 2. 找出每一帳號最新交易日，計算與之的差距 ---
latest_date = txn.groupby('account_number')['transaction_date'].max()
txn = txn.merge(latest_date.rename('latest_date'), on='account_number')
txn['days_diff'] = txn['latest_date'] - txn['transaction_date']

# --- 3. 定義「觀測視窗」統計函式 ---
def window_features(df, window):
    dfw = df[df['days_diff'] <= window]                       # 篩選近 7 / 30 日
    agg = {
        'transaction_amount': ['count', 'sum', 'mean', 'max'],
        'transaction_direction': ['mean'],                   # 1=入金,2=出金 → 比值概念
        'transaction_hour':  lambda x: (x.isin(
                             list(range(0,7))+[22,23])).mean(),  # 夜間交易比例
        'is_same_ip':       lambda x: (1 - x).mean(),         # IP 變動率
        'is_same_device':   lambda x: (1 - x).mean()          # 裝置變動率
    }
    grouped = dfw.groupby('account_number').agg(agg)
    # 攤平欄位 + 加上視窗後綴
    grouped.columns = ['_'.join([c[0], c[1] if isinstance(c[1], str) else 'lambda', f'{window}d'])
                       for c in grouped.columns]
    return grouped

feat7  = window_features(txn,  7)   # 近一週
feat30 = window_features(txn, 30)   # 近 30 天

# --- 4. 合併兩個視窗 ---
time_feats = feat7.join(feat30, how='outer').reset_index()    # 保留 account_number

# --- 5. 併回原始資料 ---
merged = (
    merged_data_with_txn            
    .merge(time_feats, on='account_number', how='left')
)

In [46]:
merged_data_with_txn

Unnamed: 0,account_number,customer_id,aum_amt,age,income_level,region_code,is_unreachable,is_digital_account,account_open_date,transaction_count,...,code_45_count,code_46_count,code_47_count,code_48_count,code_49_count,code_50_count,code_51_count,code_52_count,code_53_count,code_54_count
0,ACCT6068,ID5684,256930,61,25.0,12.0,0,0,8400,2,...,0,0,0,0,0,0,0,0,0,0
1,ACCT11459,ID10838,65,57,,12.0,0,0,9569,12,...,0,0,3,0,0,0,0,0,0,0
2,ACCT15832,ID15012,14438,56,126.0,12.0,0,0,8380,6,...,0,0,0,0,0,0,0,0,0,0
3,ACCT15612,ID14797,43872,46,,12.0,0,0,10610,2,...,0,0,0,0,0,0,0,0,0,0
4,ACCT18659,ID17677,2578166,72,25.0,12.0,0,0,9666,9,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24964,ACCT18619,ID17645,2843,15,25.0,12.0,0,0,18159,13,...,0,0,7,0,0,0,0,0,0,0
24965,ACCT20836,ID19721,142,20,25.0,12.0,0,0,17638,71,...,0,0,0,0,0,0,0,0,0,0
24966,ACCT24372,ID23132,474,20,25.0,12.0,0,0,17565,41,...,0,0,16,0,0,0,0,0,0,0
24967,ACCT501,ID463,175,20,25.0,12.0,0,1,18142,6,...,0,0,4,0,0,0,0,0,0,0


In [48]:
final_data = add_suspicious_flag(merged, suspicious_accounts)

添加可疑帳戶標記...
可疑帳戶數量: 400 (1.60%)
非可疑帳戶數量: 24569 (98.40%)


In [49]:
final_data = create_advanced_features(final_data)
final_data = handle_missing_and_outliers(final_data)

創建進階特徵...
處理缺失值和異常值...
發現 126 個數值型特徵
所有NaN值已處理完成


In [50]:
X = final_data.drop(['is_suspicious', 'account_number', 'customer_id'], axis=1)
y = final_data['is_suspicious']

## LightGBM + scale_pos_weight

In [83]:

# === 類別特徵轉型 ===
cat_cols = [c for c in X.columns if
            any(kw in c for kw in ['channel', 'code', 'weekday', 'region'])]  # ⚑ 自行確認
for col in cat_cols:
    X[col] = X[col].astype('category')

# === 切分訓練 / 驗證 / 測試 ===
from sklearn.model_selection import train_test_split
X_train, X_val,  y_train, y_val  = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)


In [89]:
print(cat_cols)

['region_code', 'channel_1_count', 'channel_2_count', 'channel_3_count', 'channel_4_count', 'channel_5_count', 'channel_6_count', 'channel_7_count', 'channel_8_count', 'channel_9_count', 'channel_10_count', 'channel_11_count', 'channel_12_count', 'channel_13_count', 'channel_14_count', 'channel_15_count', 'channel_16_count', 'channel_17_count', 'channel_18_count', 'channel_19_count', 'code_1_count', 'code_2_count', 'code_3_count', 'code_4_count', 'code_5_count', 'code_6_count', 'code_7_count', 'code_8_count', 'code_10_count', 'code_11_count', 'code_12_count', 'code_13_count', 'code_14_count', 'code_15_count', 'code_17_count', 'code_18_count', 'code_19_count', 'code_20_count', 'code_22_count', 'code_24_count', 'code_25_count', 'code_26_count', 'code_27_count', 'code_28_count', 'code_29_count', 'code_30_count', 'code_31_count', 'code_32_count', 'code_33_count', 'code_34_count', 'code_35_count', 'code_36_count', 'code_37_count', 'code_38_count', 'code_39_count', 'code_40_count', 'code_41_

In [84]:
neg, pos = np.bincount(y_train)
scale_pos_weight = neg / pos          # ≈ 60 ~ 65 (依資料而異)
print(f"scale_pos_weight = {scale_pos_weight:.1f}")

scale_pos_weight = 61.4


In [85]:
from sklearn.metrics import f1_score
import lightgbm as lgb
lgbm_params = dict(
    objective='binary',
    # --- 失衡處理 ---
    scale_pos_weight = scale_pos_weight,
    # --- 基本結構 ---
    num_leaves        = 256,          # ⚑ 視資料量‑維度可下修
    max_depth         = -1,
    learning_rate     = 0.03,
    n_estimators      = 4000,         # 搭配 early‑stop
    min_data_in_leaf  = 20,
    subsample         = 0.8,
    colsample_bytree  = 0.8,
    random_state      = 42,
    verbosity         = -1
)

model = lgb.LGBMClassifier(**lgbm_params)

# --- 早停用 F1 / PR‑AUC 作指標 ---
def f1_eval_lgb(y_true, y_pred):
    y_pred_bin = (y_pred > 0.5).astype(int)
    return 'f1', f1_score(y_true, y_pred_bin), True         # True=越高越好

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric=f1_eval_lgb,                                # ⚑ 或 'average_precision'
    categorical_feature=cat_cols,
    callbacks=[lgb.early_stopping(200)]
)


Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[124]	valid_0's binary_logloss: 0.0331788	valid_0's f1: 0.681481


In [86]:
from sklearn.metrics import precision_recall_curve

y_val_prob = model.predict_proba(X_val)[:,1]
precision, recall, thresh = precision_recall_curve(y_val, y_val_prob)
f1 = 2*precision*recall / (precision+recall+1e-12)
best_thr = thresh[f1.argmax()]
print(f"Best threshold = {best_thr:.3f} ,  F1@val = {f1.max():.3f}")

Best threshold = 0.356 ,  F1@val = 0.717


In [80]:
num_cols = [c for c in X.columns if c not in cat_cols]
print(f"Numeric  : {len(num_cols)}\nCategorical: {len(cat_cols)}")


Numeric  : 51
Categorical: 74


In [87]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
from imblearn.over_sampling import SMOTENC
from imblearn.combine        import SMOTETomek
from imblearn.pipeline       import Pipeline   # imblearn 的 Pipeline

# --- 2‑1 數值欄位做縮放 (RobustScaler 對極端值較友善)
numeric_tf   = Pipeline(steps=[('scaler', RobustScaler())])

# --- 2‑2 類別欄位保持 category，不需額外 encoding (LGBM 會吃)
categorical_tf = 'passthrough'

# --- 2‑3 合併前處理 ---
preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_tf , num_cols),
        ('cat', 'passthrough', cat_cols)
    ])

# --- 2‑4 資料增強器：直接計算 categorical 索引 ---
cat_start = len(num_cols)
cat_idx   = list(range(cat_start, cat_start + len(cat_cols)))  # << 唯一修正處

sampler = SMOTENC(
    categorical_features = cat_idx,
    random_state = 42,
    n_jobs = -1
)
# 若要用 SMOTETomek，也把 cat_idx 傳進裡層 SMOTENC
# sampler = SMOTETomek(
#     smote = SMOTENC(categorical_features=cat_idx, random_state=42, n_jobs=-1),
#     random_state = 42, n_jobs=-1
# )

# --- 2‑5 LightGBM ---
lgbm = lgb.LGBMClassifier(**lgbm_params)

# --- 2‑6 Pipeline ---
clf = Pipeline(steps=[
    ('preprocess', preprocess),
    ('augment'   , sampler),
    ('model'     , lgbm)
])

In [88]:
# 3‑1 Fit
# 先算出 LGBM 看得到的「分類特徵索引」
cat_feature_idx = list(range(len(num_cols),
                             len(num_cols) + len(cat_cols)))

# 然後把 pipeline 裝好
clf = Pipeline([
    ('prep', preprocess),
    ('aug' , sampler) ,          # 如果 sampler is None 這行可拿掉
    ('model', lgb.LGBMClassifier(**lgbm_params))
])

# 這裡只改 1 行：用索引而不是欄名
clf.fit(X_train, y_train,
        model__eval_set              = [(X_val, y_val)],
        model__eval_metric           = f1_eval_lgb,
        model__categorical_feature   = cat_feature_idx,   # ← ★
        model__callbacks             = [lgb.early_stopping(200)])

# 3‑2 取得驗證機率
y_val_prob = clf.predict_proba(X_val)[:, 1]

# 3‑3 找最佳 threshold (同前)
precision, recall, thr = precision_recall_curve(y_val, y_val_prob)
f1  = 2*precision*recall/(precision+recall+1e-12)
best_thr = thr[f1.argmax()]
print(f"Val F1 = {f1.max():.3f} @ thr={best_thr:.3f}")


TypeError: Wrong type(str) or unknown name(region_code) in categorical_feature

## Augumentation

In [64]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek, SMOTEENN
from collections import Counter

def augment_data(X, y, method='smote', random_state=42):
    """
    進行資料增強以提高可疑帳戶的 F1 分數
    
    參數:
        X: 特徵資料集
        y: 目標變數 (is_suspicious)
        method: 增強方法
            - 'smote': 合成少數類過採樣技術
            - 'borderline_smote': 邊界 SMOTE (關注決策邊界附近的範例)
            - 'adasyn': 自適應合成採樣
            - 'smote_tomek': SMOTE + Tomek Links (過採樣+選擇性欠採樣)
            - 'smote_enn': SMOTE + ENN (過採樣+編輯最近鄰欠採樣)
            - 'hybrid': 自訂混合策略
        random_state: 隨機種子，用於重現結果
    
    返回:
        X_resampled, y_resampled: 重新採樣後的資料集
    """
    print(f"使用 {method} 進行資料增強...")
    print(f"原始類別分佈: {Counter(y)}")
    
    # 將資料拆分為訓練集和測試集
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state, stratify=y
    )
    
    if method == 'smote':
        # 標準 SMOTE
        sampling_strategy = 0.5  # 使少數類達到多數類的 50%
        sampler = SMOTE(sampling_strategy=sampling_strategy, random_state=random_state)
        X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)
    
    elif method == 'borderline_smote':
        # 邊界 SMOTE (更關注決策邊界附近的樣本)
        sampler = BorderlineSMOTE(
            sampling_strategy=0.5, 
            random_state=random_state,
            k_neighbors=5,
            m_neighbors=10
        )
        X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)
    
    elif method == 'adasyn':
        # ADASYN (根據局部密度自適應合成樣本)
        sampler = ADASYN(sampling_strategy=0.5, random_state=random_state)
        X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)
    
    elif method == 'smote_tomek':
        # SMOTE + Tomek Links (結合過採樣和選擇性欠採樣)
        sampler = SMOTETomek(sampling_strategy=0.5, random_state=random_state)
        X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)
    
    elif method == 'smote_enn':
        # SMOTE + ENN (過採樣後使用 ENN 清理可能的噪聲點)
        sampler = SMOTEENN(sampling_strategy=0.5, random_state=random_state)
        X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)
    
    elif method == 'hybrid':
        # 自訂混合策略: 先進行欠採樣，再進行 SMOTE
        
        # 第一步: 對多數類進行欠採樣到其大小的 200%
        undersampler = RandomUnderSampler(
            sampling_strategy=lambda y: {0: int(Counter(y)[1] * 2), 1: Counter(y)[1]},
            random_state=random_state
        )
        X_intermediate, y_intermediate = undersampler.fit_resample(X_train, y_train)
        
        # 第二步: 使用 BorderlineSMOTE 生成邊界上的合成樣本
        oversampler = BorderlineSMOTE(
            sampling_strategy=0.8,  # 使少數類達到多數類的 80%
            random_state=random_state
        )
        X_train_resampled, y_train_resampled = oversampler.fit_resample(X_intermediate, y_intermediate)
    
    else:
        raise ValueError(f"未知的方法: {method}")
    
    # 合併重採樣後的訓練集和原始測試集
    X_resampled = pd.concat([pd.DataFrame(X_train_resampled), X_test.reset_index(drop=True)])
    y_resampled = pd.concat([pd.Series(y_train_resampled), y_test.reset_index(drop=True)])
    
    print(f"重採樣後的類別分佈: {Counter(y_resampled)}")
    return X_resampled, y_resampled


def apply_full_augmentation_pipeline(X_train, y_train, random_state=42):
    """
    應用完整的資料增強管道，包括多種技術的組合
    
    參數:
        X: 特徵資料集
        y: 目標變數 (is_suspicious)
        random_state: 隨機種子
    
    返回:
        X_augmented, y_augmented: 增強後的資料集
    """
    smote = SMOTE(sampling_strategy=0.25, random_state=random_state)
    X_balanced, y_balanced = smote.fit_resample(X_train, y_train)
    
    # 2. Add very subtle feature enhancement
    suspicious_indices = y_balanced[y_balanced == 1].index
    key_features = ['txn_volatility', 'max_txn_to_aum_ratio', 'device_change_score']
    
    X_enhanced = X_balanced.copy()
    for feature in key_features:
        if feature in X_enhanced.columns:
            std = X_enhanced[feature].std() * 0.2  # Much smaller enhancement
            X_enhanced.loc[suspicious_indices, feature] += np.random.normal(0, std, size=len(suspicious_indices))
    
    return X_enhanced, y_balanced


In [65]:
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [66]:
# 如果要比較不同增強方法的效果，可以使用以下代碼:
def compare_augmentation_methods(X, y, model_class, random_state=42):
    """
    比較不同資料增強方法的效果
    
    參數:
        X: 原始特徵資料集
        y: 原始目標變數
        model_class: 要使用的模型類別 (例如 RandomForestClassifier)
        random_state: 隨機種子
    
    返回:
        results: 包含各方法 F1 分數的字典
    """
    from sklearn.metrics import f1_score, classification_report
    
    # 要比較的方法
    methods = ['original', 'smote', 'borderline_smote', 'adasyn', 'smote_tomek', 'smote_enn', 'hybrid', 'full_pipeline']
    results = {}
    
    # 將資料拆分為訓練集和測試集 (使用相同的測試集進行公平比較)
    X_train_original, X_test, y_train_original, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state, stratify=y
    )
    
    # 對於原始資料
    model = model_class(random_state=random_state)
    #model = RandomForestClassifier(random_state=42, class_weight='balanced')

    model.fit(X_train_original, y_train_original)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    results['original'] = f1
    print(f"原始資料 F1 分數: {f1:.4f}")
    print(classification_report(y_test, y_pred))
    
    # 對於各種增強方法
    for method in methods[1:-1]:  # 跳過 'original' 和 'full_pipeline'
        # 應用增強方法
        X_resampled, y_resampled = augment_data(X_train_original, y_train_original, method=method, random_state=random_state)
        
        # 訓練模型
        model = model_class(random_state=random_state)
        model.fit(X_resampled, y_resampled)
        
        # 評估模型
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        results[method] = f1
        print(f"{method} F1 分數: {f1:.4f}")
        print(classification_report(y_test, y_pred))
    
    # 對於完整管道
    X_full_pipeline, y_full_pipeline = apply_full_augmentation_pipeline(X_train_original, y_train_original, random_state=random_state)
    model = model_class(random_state=random_state)
    model.fit(X_full_pipeline, y_full_pipeline)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    results['full_pipeline'] = f1
    print(f"完整管道 F1 分數: {f1:.4f}")
    print(classification_report(y_test, y_pred))
    
    return results

# 比較不同方法 (可以根據需要取消註釋並使用)


In [None]:
results = compare_augmentation_methods(X, y, RandomForestClassifier, random_state=42)

原始資料 F1 分數: 0.6034
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00      4914
         1.0       0.97      0.44      0.60        80

    accuracy                           0.99      4994
   macro avg       0.98      0.72      0.80      4994
weighted avg       0.99      0.99      0.99      4994

使用 smote 進行資料增強...
原始類別分佈: Counter({0.0: 19655, 1.0: 320})
重採樣後的類別分佈: Counter({0.0: 19655, 1.0: 7926})
smote F1 分數: 0.6769
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00      4914
         1.0       0.88      0.55      0.68        80

    accuracy                           0.99      4994
   macro avg       0.94      0.77      0.84      4994
weighted avg       0.99      0.99      0.99      4994

使用 borderline_smote 進行資料增強...
原始類別分佈: Counter({0.0: 19655, 1.0: 320})
重採樣後的類別分佈: Counter({0.0: 19655, 1.0: 7926})


KeyboardInterrupt: 

In [55]:
def augment_suspicious_features(X, y, suspicious_threshold=0.7):
    """
    針對可疑帳戶特徵進行增強
    
    此函數針對已識別為可疑的帳戶進行特徵增強，使其模式更加明顯
    
    參數:
        X: 特徵資料集
        y: 目標變數 (is_suspicious)
        suspicious_threshold: 增強的強度 (0-1)
    
    返回:
        X_augmented: 增強後的特徵
    """
    print("進行可疑特徵增強...")
    X_augmented = X.copy()
    
    # 取得可疑帳戶的索引
    suspicious_indices = y[y == 1].index
    
    # 找出最具分辨力的特徵 (可基於領域知識增加或修改)
    key_features = [
        'txn_volatility', 'max_txn_to_aum_ratio', 'max_to_avg_ratio',
        'inout_ratio', 'device_change_score', 'ip_change_freq',
        'counterparty_txn_ratio', 'unusual_code_usage'
    ]
    
    # 確保這些特徵存在於資料集中
    valid_features = [f for f in key_features if f in X.columns]
    
    if not valid_features:
        print("警告: 找不到關鍵的可疑特徵，跳過特徵增強")
        return X_augmented
    
    # 計算每個特徵的中位數和 IQR
    medians = X[valid_features].median()
    q75 = X[valid_features].quantile(0.75)
    q25 = X[valid_features].quantile(0.25)
    iqrs = q75 - q25
    
    # 對可疑帳戶的特徵進行增強
    for feature in valid_features:
        # 針對不同類型的特徵選擇增強方向
        if feature in ['txn_volatility', 'max_txn_to_aum_ratio', 'max_to_avg_ratio',
                       'device_change_score', 'ip_change_freq', 'unusual_code_usage']:
            # 這些特徵越高越可疑，向上增強
            enhancement = iqrs[feature] * suspicious_threshold
            X_augmented.loc[suspicious_indices, feature] += np.random.uniform(0, enhancement, size=len(suspicious_indices))
        
        elif feature in ['same_ip_ratio', 'same_device_ratio']:
            # 這些特徵越低越可疑，向下增強
            enhancement = iqrs[feature] * suspicious_threshold
            X_augmented.loc[suspicious_indices, feature] -= np.random.uniform(0, enhancement, size=len(suspicious_indices))
            # 確保不低於0
            X_augmented.loc[suspicious_indices, feature] = X_augmented.loc[suspicious_indices, feature].clip(lower=0)
    
    print(f"已增強 {len(suspicious_indices)} 個可疑帳戶的特徵")
    return X_augmented

def add_noise_to_majority(X, y, noise_level=0.05):
    """
    給多數類添加微小噪聲，幫助模型更好地識別決策邊界
    
    參數:
        X: 特徵資料集
        y: 目標變數 (is_suspicious)
        noise_level: 噪聲等級 (0-1)
    
    返回:
        X_with_noise: 添加噪聲後的特徵
    """
    print("對多數類添加微小噪聲...")
    X_with_noise = X.copy()
    
    # 取得非可疑帳戶的索引
    normal_indices = y[y == 0].index
    
    # 為所有數值型特徵添加噪聲
    numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
    
    for col in numeric_cols:
        # 計算特徵的標準差
        std = X[col].std()
        
        # 根據特徵標準差添加高斯噪聲
        noise = np.random.normal(0, std * noise_level, size=len(normal_indices))
        X_with_noise.loc[normal_indices, col] += noise
    
    print(f"已對 {len(normal_indices)} 個非可疑帳戶添加微小噪聲")
    return X_with_noise

def generate_synthetic_suspicious(X, y, n_samples=100, random_state=42):
    """
    生成合成的可疑帳戶樣本，基於已知可疑帳戶的特徵分佈
    
    參數:
        X: 特徵資料集
        y: 目標變數 (is_suspicious)
        n_samples: 要生成的合成樣本數量
        random_state: 隨機種子
    
    返回:
        X_synthetic, y_synthetic: 合成的樣本及其標籤
    """
    print(f"生成 {n_samples} 個合成可疑帳戶...")
    
    # 提取所有可疑帳戶
    suspicious_samples = X[y == 1]
    
    # 如果可疑樣本太少，則使用 SMOTE 生成更多樣本
    if len(suspicious_samples) < 5:
        print("可疑樣本數量太少，無法直接生成合成樣本，使用 SMOTE 生成中間樣本")
        smote = SMOTE(random_state=random_state)
        X_intermediate, y_intermediate = smote.fit_resample(X, y)
        suspicious_samples = X_intermediate[y_intermediate == 1]
    
    # 計算每個特徵的均值和協方差
    numeric_cols = suspicious_samples.select_dtypes(include=['float64', 'int64']).columns
    
    # 僅使用數值型特徵
    suspicious_numeric = suspicious_samples[numeric_cols]
    
    # 計算均值和協方差
    mean_vector = suspicious_numeric.mean().values
    cov_matrix = suspicious_numeric.cov().values
    
    # 使用多變量高斯分佈生成合成樣本
    np.random.seed(random_state)
    synthetic_data = np.random.multivariate_normal(mean_vector, cov_matrix, n_samples)
    
    # 轉換為 DataFrame 並添加列名
    X_synthetic = pd.DataFrame(synthetic_data, columns=numeric_cols)
    
    # 對於任何可能的非數值型特徵，我們從原始可疑樣本中隨機取樣
    non_numeric_cols = [col for col in X.columns if col not in numeric_cols]
    
    if non_numeric_cols:
        for col in non_numeric_cols:
            random_values = suspicious_samples[col].sample(n=n_samples, replace=True, random_state=random_state).values
            X_synthetic[col] = random_values
    
    # 所有合成樣本都標記為可疑 (1)
    y_synthetic = pd.Series([1] * n_samples)
    
    return X_synthetic, y_synthetic