### 0. Import Libraries

In [210]:
import pandas as pd
import copy
from tqdm import tqdm
from openai import OpenAI
openai_api_key = "EMPTY"
openai_api_base = "http://192.168.1.20:1318/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

USE_MODEL = "pixtral"
MODEL_NAME_DICT = {"qwen": "Qwen/Qwen2-VL-72B-Instruct",
                   "pixtral": "mistralai/Pixtral-Large-Instruct-2411"}
MODEL_NAME = MODEL_NAME_DICT[USE_MODEL]
SYSTEM_PROMPT_DICT = {"qwen": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
                      "pixtral": "You are a helpful assistant."} # TODO : pixtral 프롬프트 지정 필요
SYSTEM_PROMPT = SYSTEM_PROMPT_DICT[USE_MODEL]
def LLM_Call(prompt:str):
    chat_response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"{prompt}"},
        ],
        temperature=0.001,
        top_p=0.001,
        max_tokens=4096,
        extra_body={
            "repetition_penalty": 1.03,
        },
    )
    return chat_response.choices[0].message.content

In [297]:
import re
import ast

def extract_data_dict(result):
    pattern = re.compile(r"(?<=python\n)\{[\s\S]*?\}")
    match = pattern.search(result)
    if match:
        data_str = match.group(0)
        data_dict = ast.literal_eval(data_str)
        return data_dict
    else:
        return None
    
def get_attack_type_samples(df:pd.DataFrame,
                          attack_type:str,
                          num_samples:int=25):
    ATTACK_TYPE_SAMPLES = df[df['Attack type']==attack_type].sample(num_samples)
    ATTACK_TYPE_SAMPLES.columns = ATTACK_TYPE_SAMPLES.columns.str.strip()
    ATTACK_TYPE_SAMPLES.reset_index(drop=True, inplace=True)
    ATTACK_TYPE_SAMPLES.drop(columns=["Attack type"], inplace=True)
    attack_type_md = ATTACK_TYPE_SAMPLES.to_markdown()
    return attack_type_md

desc_dict = {
    "id": "A unique ID to distinguish the sensor node in any round and at any stage. For example, node number 25 in the third round and in the first stage is to be symbolized as 001 003 025.",
    "Time": "The current simulation time of the node.",
    "Is_CH": "A flag to distinguish whether the node is CH with value 1 or normal node with value 0.",
    "who CH": "The ID of the CH in the current round.",
    "Dist_To_CH": "The distance between the node and its CH in the current round.",
    "ADV_S": "The number of advertise CH’s broadcast messages sent to the nodes.",
    "ADV_R": "The number of advertise CH messages received from CHs.",
    "JOIN_S": "The number of join request messages sent by the nodes to the CH.",
    "JOIN_R": "The number of join request messages received by the CH from the nodes.",
    "SCH_S": "The number of advertise TDMA schedule broadcast messages sent to the nodes.",
    "SCH_R": "The number of TDMA schedule messages received from CHs.",
    "Rank": "The order of this node within the TDMA schedule.",
    "DATA_S": "The number of data packets sent from a sensor to its CH.",
    "DATA_R": "The number of data packets received from CH.",
    "Data_Sent_To_BS": "The number of data packets sent to the BS.",
    "dist_CH_To_BS": "The distance between the CH and the BS.",
    "send_code": "The cluster sending code.",
    "Expaned Energy": "The amount of energy consumed in the previous round.",
}
description_md = "| Key           | Description |\n"
description_md += "|-------------------|-----------------|\n"
for key, description in desc_dict.items():
    description_md += f"| {key} | {description} |\n"
    
returns_format = """```python
{
    'id': value,
    'Time': value,
    'Is_CH': value,
    'who CH': value,
    'Dist_To_CH': value,
    'ADV_S': value,
    'ADV_R': value,
    'JOIN_S': value,
    'JOIN_R': value,
    'SCH_S': value,
    'SCH_R': value,
    'Rank': value,
    'DATA_S': value,
    'DATA_R': value,
    'Data_Sent_To_BS': value,
    'dist_CH_To_BS': value,
    'send_code': value,
    'Expaned Energy': value
}
```
"""

In [298]:
BASE_PROMPT = """
당신은 데이터 생성 전문가이다.

WSN-DS (Wireless Sensor Network Detection System) 데이터셋에서
Blackhole 공격 유형에 대해 데이터를 생성하고자 한다.

아래 [Description]과 [Examples]를 참고하여 데이터를 한개 생성하라. 

[Description]
{description_md}

[Examples]
{attack_type_md}

데이터는 딕셔너리 형태로, examples 예시와 동일한 key, value 형태를 참고해서 아래 [Returns] 형태로 생성하라. 

[Returns]
{returns_format}
"""

In [295]:
wsn_df = pd.read_csv("WSN-DS.csv")

In [260]:
ATTACK_TYPES = ['Grayhole', 'Blackhole', 'TDMA', 'Flooding']
new_attack_type_samples = []
for attack_type in ATTACK_TYPES:
    for i in tqdm(range(150)):
        PROMPT = BASE_PROMPT.format(description_md=description_md, 
                                    attack_type_md=get_attack_type_samples(df=wsn_df, 
                                                                        attack_type=attack_type, 
                                                                        num_samples=25), 
                                    returns_format=returns_format)
        result = LLM_Call(PROMPT)
        data_dict = extract_data_dict(result)
        if data_dict is not None:
            data_dict['Attack type'] = attack_type
            new_attack_type_samples.append(data_dict)
        else:
            print(f"Failed to extract data_dict from result {i}")
new_attack_type_samples = pd.DataFrame(new_attack_type_samples)
new_attack_type_samples.to_csv("new_attack_type_samples_ver2.csv", index=False)

100%|██████████| 150/150 [20:48<00:00,  8.32s/it]
100%|██████████| 150/150 [20:29<00:00,  8.20s/it]
100%|██████████| 150/150 [21:23<00:00,  8.56s/it]
100%|██████████| 150/150 [18:32<00:00,  7.41s/it]


In [293]:
normal_df = wsn_df[wsn_df['Attack type']=="Normal"].sample(400)
normal_df.columns = normal_df.columns.str.strip()
concat_original_df = copy.deepcopy(normal_df)
for attack_type in ATTACK_TYPES:
    attack_type_df = wsn_df[wsn_df['Attack type']==attack_type].sample(200)
    attack_type_df.columns = attack_type_df.columns.str.strip()
    concat_original_df = pd.concat([concat_original_df, attack_type_df], ignore_index=True)
    
new_attack_type_df = pd.read_csv(f"new_attack_type_samples.csv")
new_attack_type_df.columns = new_attack_type_df.columns.str.strip()
concat_all_df = pd.concat([concat_original_df,new_attack_type_df], ignore_index=True)

In [294]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

X_sample_based_LLM = concat_all_df.drop(columns=["Attack type"])
Y_sample_based_LLM = concat_all_df["Attack type"]
# print(X_sample_based_LLM.shape, Y_sample_based_LLM.shape)
X_train_based_LLM, X_test_based_LLM, y_train_based_LLM, y_test_based_LLM = train_test_split(X_sample_based_LLM, Y_sample_based_LLM, test_size=0.2, random_state=42)
scaler_resampling = StandardScaler()
X_train_scaled = scaler_resampling.fit_transform(X_train_based_LLM)
X_test_scaled = scaler_resampling.transform(X_test_based_LLM)
logreg_resampling = LogisticRegression(max_iter=1000, random_state=42)
logreg_resampling.fit(X_train_scaled, y_train_based_LLM)
y_pred_based_LLM = logreg_resampling.predict(X_test_scaled)
print("[LLM-based Augmentation Result]")
print("Accuracy:", accuracy_score(y_test_based_LLM, y_pred_based_LLM))
print("\nClassification Report:\n", classification_report(y_test_based_LLM, y_pred_based_LLM))

from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_sample_based_SMOTE = concat_original_df.drop(columns=["Attack type"])
Y_sample_based_SMOTE = concat_original_df["Attack type"]
X_SMOTE, Y_SMOTE = smote.fit_resample(X_sample_based_SMOTE, Y_sample_based_SMOTE)
# print(X_SMOTE.shape, Y_SMOTE.shape)

X_train_based_SMOTE, X_test_based_SMOTE, y_train_based_SMOTE, y_test_based_SMOTE = train_test_split(X_SMOTE, Y_SMOTE, test_size=0.2, random_state=42)
scaler_resampling = StandardScaler()
X_train_scaled = scaler_resampling.fit_transform(X_train_based_SMOTE)
X_test_scaled = scaler_resampling.transform(X_test_based_SMOTE)
logreg_resampling = LogisticRegression(max_iter=1000, random_state=42)
logreg_resampling.fit(X_train_scaled, y_train_based_SMOTE)
y_pred_based_SMOTE = logreg_resampling.predict(X_test_scaled)
print("[SMOTE-based Augmentation Result]")
print("Accuracy:", accuracy_score(y_test_based_SMOTE, y_pred_based_SMOTE))
print("\nClassification Report:\n", classification_report(y_test_based_SMOTE, y_pred_based_SMOTE))


[LLM-based Augmentation Result]
Accuracy: 0.935

Classification Report:
               precision    recall  f1-score   support

   Blackhole       0.78      1.00      0.88        85
    Flooding       1.00      0.99      0.99        75
    Grayhole       0.97      0.72      0.82        85
      Normal       1.00      0.99      0.99        83
        TDMA       1.00      1.00      1.00        72

    accuracy                           0.94       400
   macro avg       0.95      0.94      0.94       400
weighted avg       0.95      0.94      0.93       400

[SMOTE-based Augmentation Result]
Accuracy: 0.9175

Classification Report:
               precision    recall  f1-score   support

   Blackhole       0.70      1.00      0.83        69
    Flooding       1.00      1.00      1.00        78
    Grayhole       0.98      0.68      0.81        94
      Normal       0.98      0.99      0.98        83
        TDMA       0.99      0.97      0.98        76

    accuracy                        