<a href="https://colab.research.google.com/github/YannnLing/CHN_REC/blob/main/LLM%E7%B0%A1%E6%98%93%E5%B0%88%E6%A1%88.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import pandas as pd
import numpy as np
from sklearn.ensemble import  IsolationForest
from transformers import pipeline

In [8]:
# step1:模擬金融交易數據
# 模擬一些交易數據，包含交易ID、金額、時間、類型
data = {
    "transaction_id":range(1,101),
    "amount":np.random.normal(loc = 1000, scale = 300, size = 100),
    "timestamp":pd.date_range("2023-01-01",periods = 100, freq = "H"),
    "transaction_type":np.random.choice(["purchase","transfer","withdrawal"],100),
}


In [9]:
df = pd.DataFrame(data)
df

Unnamed: 0,transaction_id,amount,timestamp,transaction_type
0,1,784.360852,2023-01-01 00:00:00,withdrawal
1,2,1366.293560,2023-01-01 01:00:00,purchase
2,3,1264.559868,2023-01-01 02:00:00,transfer
3,4,1404.760641,2023-01-01 03:00:00,withdrawal
4,5,1092.345701,2023-01-01 04:00:00,withdrawal
...,...,...,...,...
95,96,1283.289813,2023-01-04 23:00:00,withdrawal
96,97,675.050969,2023-01-05 00:00:00,transfer
97,98,1486.294575,2023-01-05 01:00:00,transfer
98,99,677.872343,2023-01-05 02:00:00,transfer


In [10]:
# step2:使用 Isolation Forest 進行異常檢測
# 我們假設異常的交易可能是金額極端高或低的交易
clf = IsolationForest(contamination = 0.05)

In [12]:
df["anomaly_score"] = clf.fit_predict(df[["amount"]])
df

Unnamed: 0,transaction_id,amount,timestamp,transaction_type,anomaly_score
0,1,784.360852,2023-01-01 00:00:00,withdrawal,1
1,2,1366.293560,2023-01-01 01:00:00,purchase,1
2,3,1264.559868,2023-01-01 02:00:00,transfer,1
3,4,1404.760641,2023-01-01 03:00:00,withdrawal,1
4,5,1092.345701,2023-01-01 04:00:00,withdrawal,1
...,...,...,...,...,...
95,96,1283.289813,2023-01-04 23:00:00,withdrawal,1
96,97,675.050969,2023-01-05 00:00:00,transfer,1
97,98,1486.294575,2023-01-05 01:00:00,transfer,-1
98,99,677.872343,2023-01-05 02:00:00,transfer,1


In [14]:
# 標記異常交易
df['is_anomaly'] = df['anomaly_score'] == -1
df

Unnamed: 0,transaction_id,amount,timestamp,transaction_type,anomaly_score,is_anomaly
0,1,784.360852,2023-01-01 00:00:00,withdrawal,1,False
1,2,1366.293560,2023-01-01 01:00:00,purchase,1,False
2,3,1264.559868,2023-01-01 02:00:00,transfer,1,False
3,4,1404.760641,2023-01-01 03:00:00,withdrawal,1,False
4,5,1092.345701,2023-01-01 04:00:00,withdrawal,1,False
...,...,...,...,...,...,...
95,96,1283.289813,2023-01-04 23:00:00,withdrawal,1,False
96,97,675.050969,2023-01-05 00:00:00,transfer,1,False
97,98,1486.294575,2023-01-05 01:00:00,transfer,-1,True
98,99,677.872343,2023-01-05 02:00:00,transfer,1,False


In [17]:
# Step 3: 使用 LLM (GPT-2) 來分析交易並給出風險評估
# 使用 transformers 進行交易的文本風險分析
risk_analysis_model = pipeline("text-generation", model="gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [19]:
def analyze_risk(transaction):
    prompt = (
        f"Transaction ID: {transaction['transaction_id']}, Amount: {transaction['amount']:.2f}, "
        f"Transaction Type: {transaction['transaction_type']}. "
        "Is this transaction suspicious? Provide a reason."
    )
    analysis = risk_analysis_model(prompt, max_length=50, num_return_sequences=1)
    return analysis[0]['generated_text']

# 應用 LLM 分析到每筆異常交易
df['risk_analysis'] = df.apply(lambda row: analyze_risk(row) if row['is_anomaly'] else "Normal transaction", axis=1)
df

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Unnamed: 0,transaction_id,amount,timestamp,transaction_type,anomaly_score,is_anomaly,risk_analysis
0,1,784.360852,2023-01-01 00:00:00,withdrawal,1,False,Normal transaction
1,2,1366.293560,2023-01-01 01:00:00,purchase,1,False,Normal transaction
2,3,1264.559868,2023-01-01 02:00:00,transfer,1,False,Normal transaction
3,4,1404.760641,2023-01-01 03:00:00,withdrawal,1,False,Normal transaction
4,5,1092.345701,2023-01-01 04:00:00,withdrawal,1,False,Normal transaction
...,...,...,...,...,...,...,...
95,96,1283.289813,2023-01-04 23:00:00,withdrawal,1,False,Normal transaction
96,97,675.050969,2023-01-05 00:00:00,transfer,1,False,Normal transaction
97,98,1486.294575,2023-01-05 01:00:00,transfer,-1,True,"Transaction ID: 98, Amount: 1486.29, Transacti..."
98,99,677.872343,2023-01-05 02:00:00,transfer,1,False,Normal transaction


In [21]:
# Step 4: 顯示帶有風險分析的異常交易
anomalies = df[df['is_anomaly']]
anomalies

Unnamed: 0,transaction_id,amount,timestamp,transaction_type,anomaly_score,is_anomaly,risk_analysis
20,21,1646.128761,2023-01-01 20:00:00,withdrawal,-1,True,"Transaction ID: 21, Amount: 1646.13, Transacti..."
67,68,117.509793,2023-01-03 19:00:00,purchase,-1,True,"Transaction ID: 68, Amount: 117.51, Transactio..."
76,77,1772.09627,2023-01-04 04:00:00,transfer,-1,True,"Transaction ID: 77, Amount: 1772.10, Transacti..."
80,81,400.435152,2023-01-04 08:00:00,withdrawal,-1,True,"Transaction ID: 81, Amount: 400.44, Transactio..."
97,98,1486.294575,2023-01-05 01:00:00,transfer,-1,True,"Transaction ID: 98, Amount: 1486.29, Transacti..."


In [22]:
# 輸出結果
print(anomalies[['transaction_id', 'amount', 'transaction_type', 'risk_analysis']])

    transaction_id       amount transaction_type  \
20              21  1646.128761       withdrawal   
67              68   117.509793         purchase   
76              77  1772.096270         transfer   
80              81   400.435152       withdrawal   
97              98  1486.294575         transfer   

                                        risk_analysis  
20  Transaction ID: 21, Amount: 1646.13, Transacti...  
67  Transaction ID: 68, Amount: 117.51, Transactio...  
76  Transaction ID: 77, Amount: 1772.10, Transacti...  
80  Transaction ID: 81, Amount: 400.44, Transactio...  
97  Transaction ID: 98, Amount: 1486.29, Transacti...  
