ocsvm


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report, accuracy_score

# 載入數據
file_path = '/Users/linyinghsiao/Desktop/chatgpt_output拷貝.csv'  # 替換為您的文件路徑
data = pd.read_csv(file_path)

# 處理缺失值
num_imputer = SimpleImputer(strategy='median')
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
data[numerical_cols] = num_imputer.fit_transform(data[numerical_cols])

cat_imputer = SimpleImputer(strategy='most_frequent')
categorical_cols = data.select_dtypes(include=['object']).columns
data[categorical_cols] = cat_imputer.fit_transform(data[categorical_cols])

# 編碼類別型變量
label_encoder = LabelEncoder()
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

# 標準化特徵
scaler = StandardScaler()
X = data.drop('label', axis=1)
X_scaled = scaler.fit_transform(X)

# 分割數據為訓練集和測試集
normal_data = data[data['label'] == 0]
anomalous_data = data[data['label'] == 1]

train_normal = normal_data.sample(frac=0.8, random_state=42)
test_normal = normal_data.drop(train_normal.index)
test_data = pd.concat([test_normal, anomalous_data], axis=0)

X_train = scaler.transform(train_normal.drop('label', axis=1))
X_test = scaler.transform(test_data.drop('label', axis=1))
y_test = test_data['label']

# 訓練 One-Class SVM 模型
ocsvm_model = OneClassSVM(kernel='rbf', gamma='auto')
ocsvm_model.fit(X_train)

# 在測試集上進行預測
y_pred_test = ocsvm_model.predict(X_test)
y_pred_test = (y_pred_test == -1).astype(int)

# 評估模型
test_accuracy = accuracy_score(y_test, y_pred_test)
test_report = classification_report(y_test, y_pred_test)

print(f"Accuracy: {test_accuracy}")
print(f"Classification Report: \n{test_report}")


Accuracy: 0.4995909016527573
Classification Report: 
              precision    recall  f1-score   support

         0.0       1.00      0.49      0.66      6000
         1.0       0.03      0.91      0.06       111

    accuracy                           0.50      6111
   macro avg       0.51      0.70      0.36      6111
weighted avg       0.98      0.50      0.65      6111



isolation forest

In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, f1_score

# 載入數據
file_path = '/Users/linyinghsiao/Desktop/chatgpt_output拷貝.csv'  # 替換為您的文件路徑
data = pd.read_csv(file_path)

# 處理缺失值
num_imputer = SimpleImputer(strategy='median')
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
data[numerical_cols] = num_imputer.fit_transform(data[numerical_cols])

cat_imputer = SimpleImputer(strategy='most_frequent')
categorical_cols = data.select_dtypes(include=['object']).columns
data[categorical_cols] = cat_imputer.fit_transform(data[categorical_cols])

# 編碼類別型變量
label_encoder = LabelEncoder()
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

# 標準化特徵
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data.drop('label', axis=1))

# 創建並訓練 Isolation Forest 模型
iso_forest_model = IsolationForest(n_estimators=100, contamination='auto', random_state=42)
iso_forest_model.fit(X_scaled)

# 在數據集上進行預測
y_pred_iso = iso_forest_model.predict(X_scaled)
y_pred_iso = (y_pred_iso == -1).astype(int)

# 評估模型
iso_forest_accuracy = accuracy_score(data['label'], y_pred_iso)
iso_forest_f1 = f1_score(data['label'], y_pred_iso)

print(f"Accuracy: {iso_forest_accuracy}")
print(f"F1 Score: {iso_forest_f1}")


Accuracy: 0.9131878715419615
F1 Score: 0.04598540145985402


autoencoder

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from keras.models import Model
from keras.layers import Input, Dense

# 載入數據
file_path = '/Users/linyinghsiao/Desktop/chatgpt_output拷貝.csv'  # 替換為您的文件路徑
data = pd.read_csv(file_path)

# 處理缺失值
num_imputer = SimpleImputer(strategy='median')
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
data[numerical_cols] = num_imputer.fit_transform(data[numerical_cols])

# 處理非數值型列
label_encoder = LabelEncoder()
non_numeric_columns = data.select_dtypes(include=['object']).columns
for col in non_numeric_columns:
    data[col] = label_encoder.fit_transform(data[col])

# 標準化特徵
X_scaled = StandardScaler().fit_transform(data.drop('label', axis=1))

# 創建 Autoencoder 模型
input_dim = X_scaled.shape[1]
input_layer = Input(shape=(input_dim,))
encoder = Dense(16, activation="relu")(input_layer)
encoder = Dense(8, activation="relu")(encoder)
decoder = Dense(16, activation="relu")(encoder)
decoder = Dense(input_dim, activation="linear")(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)

# 編譯和訓練模型
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# 使用模型對數據進行預測，並計算重建誤差
predictions = autoencoder.predict(X_scaled)
mse = np.mean(np.power(X_scaled - predictions, 2), axis=1)
error_df = pd.DataFrame({'reconstruction_error': mse, 'true_class': data['label']})

# 根據重建誤差識別異常


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [7]:
error_df

Unnamed: 0,reconstruction_error,true_class
0,0.034607,0.0
1,0.120706,0.0
2,0.088149,0.0
3,0.095376,0.0
4,0.130722,0.0
...,...,...
30106,0.581031,1.0
30107,0.240890,1.0
30108,0.753817,1.0
30109,0.108775,1.0


In [8]:
error_df.to_csv('../datasets/dataset_1st/result_output.csv', index=False)

In [12]:


# Display the first few rows of the dataframe and summary statistics
error_df_head = error_df.head()
error_df_description = error_df['reconstruction_error'].describe()

error_df_head, error_df_description


(   reconstruction_error  true_class
 0              0.034607         0.0
 1              0.120706         0.0
 2              0.088149         0.0
 3              0.095376         0.0
 4              0.130722         0.0,
 count    30111.000000
 mean         0.268218
 std          1.680325
 min          0.011050
 25%          0.099620
 50%          0.161950
 75%          0.265251
 max        265.903235
 Name: reconstruction_error, dtype: float64)

In [13]:
# Calculating the threshold based on mean and standard deviation
#根據所選的閾值策略（平均重建誤差加上兩倍標準差），我們確定了以下結果：
#計算出的閾值為約 3.63。
#根據此閾值，被識別為異常的樣本數量為 250。
#被識別為正常的樣本數量為 29,861。
mean_error = error_df_description['mean']
std_error = error_df_description['std']
threshold = mean_error + 2 * std_error

# Identifying anomalies
error_df['anomaly'] = error_df['reconstruction_error'] > threshold

# Count of anomalies and normal data points
anomaly_count = error_df['anomaly'].sum()
normal_count = len(error_df) - anomaly_count

threshold, anomaly_count, normal_count


(3.628867371909841, 250, 29861)

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Extracting the true labels and predicted anomaly labels
y_true = error_df['true_class']
y_pred_anomaly = error_df['anomaly']

# Calculating evaluation metrics
accuracy = accuracy_score(y_true, y_pred_anomaly)
precision = precision_score(y_true, y_pred_anomaly)
recall = recall_score(y_true, y_pred_anomaly)
f1 = f1_score(y_true, y_pred_anomaly)
conf_matrix = confusion_matrix(y_true, y_pred_anomaly)

accuracy, precision, recall, f1, conf_matrix


(0.9884759722360599,
 0.028,
 0.06306306306306306,
 0.038781163434903045,
 array([[29757,   243],
        [  104,     7]]))