### 0. Import Libraries

In [1]:
import pandas as pd
import copy
from tqdm import tqdm
from openai import OpenAI
openai_api_key = "EMPTY"
openai_api_base = "http://192.168.1.20:1318/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

USE_MODEL = "pixtral"
MODEL_NAME_DICT = {"qwen": "Qwen/Qwen2-VL-72B-Instruct",
                   "pixtral": "mistralai/Pixtral-Large-Instruct-2411"}
MODEL_NAME = MODEL_NAME_DICT[USE_MODEL]
SYSTEM_PROMPT_DICT = {"qwen": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
                      "pixtral": "You are a helpful assistant."} # TODO : pixtral 프롬프트 지정 필요
SYSTEM_PROMPT = SYSTEM_PROMPT_DICT[USE_MODEL]
def LLM_Call(prompt:str):
    chat_response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"{prompt}"},
        ],
        temperature=0.001,
        top_p=0.001,
        max_tokens=4096,
        extra_body={
            "repetition_penalty": 1.03,
        },
    )
    return chat_response.choices[0].message.content

In [2]:
import re
import ast

def extract_data_dict(result):
    pattern = re.compile(r"(?<=python\n)\{[\s\S]*?\}")
    match = pattern.search(result)
    if match:
        data_str = match.group(0)
        data_dict = ast.literal_eval(data_str)
        return data_dict
    else:
        return None
    
def get_attack_type_samples(df:pd.DataFrame,
                          attack_type:str,
                          num_samples:int=25):
    ATTACK_TYPE_SAMPLES = df[df['Attack type']==attack_type].sample(num_samples)
    ATTACK_TYPE_SAMPLES.columns = ATTACK_TYPE_SAMPLES.columns.str.strip()
    ATTACK_TYPE_SAMPLES.reset_index(drop=True, inplace=True)
    ATTACK_TYPE_SAMPLES.drop(columns=["Attack type"], inplace=True)
    attack_type_md = ATTACK_TYPE_SAMPLES.to_markdown()
    return attack_type_md

desc_dict = {
    "id": "A unique ID to distinguish the sensor node in any round and at any stage. For example, node number 25 in the third round and in the first stage is to be symbolized as 001 003 025.",
    "Time": "The current simulation time of the node.",
    "Is_CH": "A flag to distinguish whether the node is CH with value 1 or normal node with value 0.",
    "who CH": "The ID of the CH in the current round.",
    "Dist_To_CH": "The distance between the node and its CH in the current round.",
    "ADV_S": "The number of advertise CH’s broadcast messages sent to the nodes.",
    "ADV_R": "The number of advertise CH messages received from CHs.",
    "JOIN_S": "The number of join request messages sent by the nodes to the CH.",
    "JOIN_R": "The number of join request messages received by the CH from the nodes.",
    "SCH_S": "The number of advertise TDMA schedule broadcast messages sent to the nodes.",
    "SCH_R": "The number of TDMA schedule messages received from CHs.",
    "Rank": "The order of this node within the TDMA schedule.",
    "DATA_S": "The number of data packets sent from a sensor to its CH.",
    "DATA_R": "The number of data packets received from CH.",
    "Data_Sent_To_BS": "The number of data packets sent to the BS.",
    "dist_CH_To_BS": "The distance between the CH and the BS.",
    "send_code": "The cluster sending code.",
    "Expaned Energy": "The amount of energy consumed in the previous round.",
}
description_md = "| Key           | Description |\n"
description_md += "|-------------------|-----------------|\n"
for key, description in desc_dict.items():
    description_md += f"| {key} | {description} |\n"
    
returns_format = """```python
{
    'id': value,
    'Time': value,
    'Is_CH': value,
    'who CH': value,
    'Dist_To_CH': value,
    'ADV_S': value,
    'ADV_R': value,
    'JOIN_S': value,
    'JOIN_R': value,
    'SCH_S': value,
    'SCH_R': value,
    'Rank': value,
    'DATA_S': value,
    'DATA_R': value,
    'Data_Sent_To_BS': value,
    'dist_CH_To_BS': value,
    'send_code': value,
    'Expaned Energy': value
}
```
"""

In [3]:
BASE_PROMPT = """
당신은 데이터 생성 전문가이다.

WSN-DS (Wireless Sensor Network Detection System) 데이터셋에서
Blackhole 공격 유형에 대해 데이터를 생성하고자 한다.

아래 [Description]과 [Examples]를 참고하여 데이터를 한개 생성하라. 

[Description]
{description_md}

[Examples]
{attack_type_md}

데이터는 딕셔너리 형태로, examples 예시와 동일한 key, value 형태를 참고해서 아래 [Returns] 형태로 생성하라. 

[Returns]
{returns_format}
"""

In [36]:
ATTACK_TYPES = ['Grayhole', 'Blackhole', 'TDMA', 'Flooding']

In [37]:
wsn_df = pd.read_csv("WSN-DS.csv")

In [21]:
# new_attack_type_samples = []
# for attack_type in ATTACK_TYPES:
#     for i in tqdm(range(150)):
#         PROMPT = BASE_PROMPT.format(description_md=description_md, 
#                                     attack_type_md=get_attack_type_samples(df=wsn_df, 
#                                                                            attack_type=attack_type, 
#                                                                            num_samples=25), 
#                                     returns_format=returns_format)
#         result = LLM_Call(PROMPT)
#         data_dict = extract_data_dict(result)
#         if data_dict is not None:
#             data_dict['Attack type'] = attack_type
#             new_attack_type_samples.append(data_dict)
#         else:
#             print(f"Failed to extract data_dict from result {i}")
# new_attack_type_samples = pd.DataFrame(new_attack_type_samples)
# new_attack_type_samples.to_csv("new_attack_type_samples_ver2.csv", index=False)

In [22]:
normal_df = wsn_df[wsn_df['Attack type']=="Normal"].sample(400,random_state=42)
normal_df.columns = normal_df.columns.str.strip()
concat_original_df = copy.deepcopy(normal_df)
for attack_type in ATTACK_TYPES:
    attack_type_df = wsn_df[wsn_df['Attack type']==attack_type].sample(200,random_state=42)
    attack_type_df.columns = attack_type_df.columns.str.strip()
    concat_original_df = pd.concat([concat_original_df, attack_type_df], ignore_index=True)
    
new_attack_type_df = pd.read_csv(f"augmented_with_LLM.csv")
new_attack_type_df.columns = new_attack_type_df.columns.str.strip()
concat_all_df = pd.concat([concat_original_df,new_attack_type_df], ignore_index=True)

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

X_sample_based_LLM = concat_all_df.drop(columns=["Attack type"])
Y_sample_based_LLM = concat_all_df["Attack type"]
# print(X_sample_based_LLM.shape, Y_sample_based_LLM.shape)
X_train_based_LLM, X_test_based_LLM, y_train_based_LLM, y_test_based_LLM = train_test_split(X_sample_based_LLM, Y_sample_based_LLM, test_size=0.2, random_state=42)
scaler_resampling = StandardScaler()
X_train_scaled = scaler_resampling.fit_transform(X_train_based_LLM)
X_test_scaled = scaler_resampling.transform(X_test_based_LLM)
logreg_resampling = LogisticRegression(max_iter=1000, random_state=42)
logreg_resampling.fit(X_train_scaled, y_train_based_LLM)
y_pred_based_LLM = logreg_resampling.predict(X_test_scaled)
print("[LLM-based Augmentation Result]")
print("Accuracy:", accuracy_score(y_test_based_LLM, y_pred_based_LLM))
print("\nClassification Report:\n", classification_report(y_test_based_LLM, y_pred_based_LLM))

from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_sample_based_SMOTE = concat_original_df.drop(columns=["Attack type"])
Y_sample_based_SMOTE = concat_original_df["Attack type"]
X_SMOTE, Y_SMOTE = smote.fit_resample(X_sample_based_SMOTE, Y_sample_based_SMOTE)
original_data_size = len(X_sample_based_SMOTE)
X_augmented = X_SMOTE[original_data_size:]  # 증강된 데이터만 선택
Y_augmented = Y_SMOTE[original_data_size:]  # 증강된 라벨만 선택
smote_df = pd.DataFrame(X_augmented, columns=X_sample_based_SMOTE.columns)
smote_df["Attack type"] = Y_augmented
smote_df.to_csv("augmented_with_smote_only.csv", index=False)

X_train_based_SMOTE, X_test_based_SMOTE, y_train_based_SMOTE, y_test_based_SMOTE = train_test_split(X_SMOTE, Y_SMOTE, test_size=0.2, random_state=42)
scaler_resampling = StandardScaler()
X_train_scaled = scaler_resampling.fit_transform(X_train_based_SMOTE)
X_test_scaled = scaler_resampling.transform(X_test_based_SMOTE)
logreg_resampling = LogisticRegression(max_iter=1000, random_state=42)
logreg_resampling.fit(X_train_scaled, y_train_based_SMOTE)
y_pred_based_SMOTE = logreg_resampling.predict(X_test_scaled)
print("[SMOTE-based Augmentation Result]")
print("Accuracy:", accuracy_score(y_test_based_SMOTE, y_pred_based_SMOTE))
print("\nClassification Report:\n", classification_report(y_test_based_SMOTE, y_pred_based_SMOTE))


[LLM-based Augmentation Result]
Accuracy: 0.9075

Classification Report:
               precision    recall  f1-score   support

   Blackhole       0.75      0.96      0.84        85
    Flooding       1.00      0.99      0.99        75
    Grayhole       0.95      0.67      0.79        85
      Normal       0.95      0.99      0.97        83
        TDMA       0.97      0.94      0.96        72

    accuracy                           0.91       400
   macro avg       0.92      0.91      0.91       400
weighted avg       0.92      0.91      0.91       400

[SMOTE-based Augmentation Result]
Accuracy: 0.91

Classification Report:
               precision    recall  f1-score   support

   Blackhole       0.71      0.97      0.82        69
    Flooding       1.00      1.00      1.00        78
    Grayhole       0.99      0.71      0.83        94
      Normal       0.94      0.98      0.96        83
        TDMA       0.96      0.93      0.95        76

    accuracy                         

In [40]:
original_df = pd.read_csv("WSN-DS.csv")
original_df.columns = original_df.columns.str.strip()
augmented_with_LLM_df = pd.read_csv("augmented_with_LLM.csv")
augmented_with_LLM_df.columns = augmented_with_LLM_df.columns.str.strip()
augmented_with_smote_df = pd.read_csv("augmented_with_smote_only.csv")
augmented_with_smote_df.columns = augmented_with_smote_df.columns.str.strip()

In [48]:
from scipy.stats import ks_2samp
from sklearn.metrics import pairwise_distances
import numpy as np

def feature_distribution_comparison(original_data, 
                                    augmented_data, 
                                    attack_types:list = ['Grayhole', 'Blackhole', 'TDMA', 'Flooding']):
    """
    이 함수는 원본 데이터와 증강 데이터 간의 K-S Test를 수행하여 
    평균, 최대값, 비율 기반의 품질 지표를 반환합니다.
    
    Parameters:
    original_data (DataFrame): 원본 데이터
    augmented_data (DataFrame): 증강된 데이터
    attack_types (list): 공격 유형 목록 (예: ['Grayhole', 'Blackhole', 'TDMA', 'Flooding'])

    Returns:
    DataFrame: K-S Test의 각 피처의 D-statistic과 p-value, 평균, 최대값 및 비율 기반 지표
    """
    # 전체 결과를 저장할 리스트
    overall_results = []

    for attack_type in attack_types:
        # 특정 공격 유형에 해당하는 데이터 필터링
        original_data_attack_type = original_data[original_data['Attack type'] == attack_type]
        augmented_data_attack_type = augmented_data[augmented_data['Attack type'] == attack_type]

        # 피처별로 K-S Test 수행
        results = []
        for col in original_data_attack_type.columns:
            if col == 'Attack type':  # 'Attack type' 컬럼은 제외
                continue

            # K-S Test 수행
            try:
                statistic, p_value = ks_2samp(original_data_attack_type[col], augmented_data_attack_type[col])
            except Exception as e:
                print(f"Error with feature {col}: {e}")
                statistic, p_value = np.nan, np.nan

            # 개별 피처의 결과 기록
            results.append({
                'Attack Type': attack_type,
                'Feature': col,
                'K-S Statistic (D)': statistic,
                'p-value': p_value
            })

        # 데이터프레임으로 변환
        df_results = pd.DataFrame(results)

        # 품질 지표 계산
        mean_d_statistic = df_results['K-S Statistic (D)'].mean()
        max_d_statistic = df_results['K-S Statistic (D)'].max()
        d_statistic_above_0_2_ratio = (df_results['K-S Statistic (D)'] >= 0.2).mean()

        mean_p_value = df_results['p-value'].mean()
        p_value_below_0_05_ratio = (df_results['p-value'] < 0.05).mean()

        # 품질 지표를 추가
        quality_metrics = {
            'Attack Type': attack_type,
            'Mean D-statistic': mean_d_statistic,
            'Max D-statistic': max_d_statistic,
            'D-statistic >= 0.2 Ratio': d_statistic_above_0_2_ratio,
            'Mean p-value': mean_p_value,
            'p-value < 0.05 Ratio': p_value_below_0_05_ratio
        }

        # 개별 피처의 결과와 품질 지표를 결합
        # overall_results.extend(results)  # 개별 피처의 K-S 결과 추가
        overall_results.append(quality_metrics)  # 품질 지표 추가

    # 전체 결과를 데이터프레임으로 반환
    final_results_df = pd.DataFrame(overall_results)
    
    return final_results_df

# distances = pairwise_distances(original_data, augmented_data, metric='euclidean')
# print('평균 쌍별 거리:', distances.mean())

# from sklearn.metrics import classification_report
# # 모델 학습 및 평가
# model.fit(augmented_data, augmented_labels)
# predictions = model.predict(test_data)
# print(classification_report(test_labels, predictions))


In [49]:
feature_distribution_comparison(original_df, augmented_with_LLM_df)

Unnamed: 0,Attack Type,Mean D-statistic,Max D-statistic,D-statistic >= 0.2 Ratio,Mean p-value,p-value < 0.05 Ratio
0,Grayhole,0.178079,0.600166,0.444444,0.44449,0.555556
1,Blackhole,0.119453,0.426772,0.388889,0.595697,0.388889
2,TDMA,0.20931,0.514679,0.388889,0.242939,0.5
3,Flooding,0.190569,0.658454,0.444444,0.389701,0.611111


In [50]:
feature_distribution_comparison(original_df, augmented_with_smote_df)

Unnamed: 0,Attack Type,Mean D-statistic,Max D-statistic,D-statistic >= 0.2 Ratio,Mean p-value,p-value < 0.05 Ratio
0,Grayhole,0.050132,0.249834,0.055556,0.685321,0.222222
1,Blackhole,0.039884,0.143514,0.0,0.660857,0.166667
2,TDMA,0.097885,0.271068,0.055556,0.290737,0.444444
3,Flooding,0.057257,0.149831,0.0,0.497918,0.166667
