In [1]:
#嘗試k-fold

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras import models
from keras import layers
import numpy as np
from keras.regularizers import l2
from sklearn.model_selection import KFold

# Load the data.
# The files are read with header=None, so the column names are supplied here.
columns_name = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','income']

data_train = pd.read_csv('adult/adult.data', names=columns_name, header=None , na_values=['?'])
d_train = pd.DataFrame(data_train)

# The first line of the test file is skipped (skiprows=1).
data_test = pd.read_csv('adult/adult.test', names=columns_name, skiprows=1, header=None ,na_values=['?'])
d_test = pd.DataFrame(data_test)

# Handle missing values.
# The missing-value marker is matched as " ?" (with a leading space), the same
# way the income labels below carry a leading space, so it is replaced
# explicitly instead of relying on na_values alone.
columns_to_check = ['workclass', 'occupation', 'native-country']
d_train.replace(" ?", np.nan, inplace=True)
# Modes are computed on the training set only and reused for the test set, so
# that no statistic derived from the test data enters the preprocessing.
column_modes = d_train[columns_to_check].mode().iloc[0]
d_train.fillna(column_modes, inplace=True)

d_test.replace(" ?", np.nan, inplace=True)
d_test.fillna(column_modes, inplace=True)

# Remember the split point before the frames are merged, so the later split
# does not depend on the order in which d_train / d_test are rebound.
n_train = len(d_train)

# Merge both sets so the categorical encoding is identical for train and test.
d_data = pd.concat([d_train, d_test], axis=0)

# Drop columns that are not used as features.
d_data.drop(['fnlwgt', 'capital-gain', 'capital-loss'], axis=1, inplace=True)

# Normalise the income labels: strip the leading space and the trailing "."
# variants so both files share the same two classes.
income_mapping = {
    " >50K": ">50K",
    " >50K.": ">50K",
    " <=50K": "<=50K",
    " <=50K.": "<=50K"
}
d_data['income'] = d_data['income'].replace(income_mapping)

# Convert categorical (object) columns to integer codes.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
d_str = d_data.select_dtypes(include='object')
for column in d_str.columns:
    # The encoder is refit for every column; after the loop it holds the
    # classes of the last object column only.
    d_data[column] = label_encoder.fit_transform(d_data[column])

# Split back into the original train / test partitions (positional slicing).
d_train = d_data.iloc[:n_train]
d_test = d_data.iloc[n_train:]

# Separate features from the target.
x_train = d_train.drop('income', axis=1)
y_train = d_train['income']

x_test = d_test.drop('income', axis=1)
y_test = d_test['income']

# Scale features to [0, 1]; the scaler is fit on the training features only
# and then applied to both sets (x_train / x_test become NumPy arrays here).
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

print('ok')

# 定義自定義評估指標
import tensorflow as tf
from keras import backend as K

def precision(y_true, y_pred):
    """Return the precision of `y_pred` against `y_true` for one batch.

    Both inputs are cast to float32. The element-wise product of labels and
    predictions, and the predictions themselves, are clipped to [0, 1] and
    rounded before being summed, giving the true-positive and
    predicted-positive counts. `K.epsilon()` is added to the denominator to
    avoid division by zero when nothing is predicted positive.
    """
    labels = tf.cast(y_true, tf.float32)
    scores = tf.cast(y_pred, tf.float32)

    hit_mask = tf.round(tf.clip_by_value(labels * scores, 0, 1))
    flagged_mask = tf.round(tf.clip_by_value(scores, 0, 1))

    hit_count = tf.reduce_sum(hit_mask)
    flagged_count = tf.reduce_sum(flagged_mask)
    return hit_count / (flagged_count + K.epsilon())

def recall(y_true, y_pred):
    """Return the recall of `y_pred` against `y_true` for one batch.

    Both inputs are cast to float32. The element-wise product of labels and
    predictions, and the labels themselves, are clipped to [0, 1] and rounded
    before being summed, giving the true-positive and actual-positive counts.
    `K.epsilon()` is added to the denominator to avoid division by zero when
    the batch contains no positive labels.
    """
    labels = tf.cast(y_true, tf.float32)
    scores = tf.cast(y_pred, tf.float32)

    hit_mask = tf.round(tf.clip_by_value(labels * scores, 0, 1))
    actual_mask = tf.round(tf.clip_by_value(labels, 0, 1))

    hit_count = tf.reduce_sum(hit_mask)
    actual_count = tf.reduce_sum(actual_mask)
    return hit_count / (actual_count + K.epsilon())

def f1_score(y_true, y_pred):
    """Return the F1 score: the harmonic mean of `precision` and `recall`.

    `K.epsilon()` is added to the denominator so the result is defined when
    both precision and recall are zero.
    """
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    harmonic_ratio = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_ratio

# 定義神經網路模型
def create_model():
    """Build and compile a fresh binary classifier.

    The network is a stack of three ReLU dense layers (500, 250 and 125
    units), each with an L2 kernel penalty of 0.001, followed by a single
    sigmoid output unit. It is compiled with RMSprop and binary cross-entropy
    and tracks accuracy plus the custom precision, recall and F1 metrics.

    Returns:
        The compiled, untrained Sequential model.
    """
    net = models.Sequential()

    # Hidden layers halve in width at each step; every layer gets its own
    # regularizer instance.
    for width in (500, 250, 125):
        net.add(layers.Dense(width, activation='relu', kernel_regularizer=l2(0.001)))

    net.add(layers.Dense(1, activation='sigmoid'))

    tracked_metrics = ['accuracy', precision, recall, f1_score]
    net.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=tracked_metrics)
    return net

# Run k-fold cross-validation on the training set.
kfold = KFold(n_splits=5, shuffle=True)
cv_scores = []
train_metrics = None

for train_index, val_index in kfold.split(x_train):
    # Build a fresh model for every fold so no weights carry over.
    model = create_model()
    # x_train is a NumPy array (output of the scaler) while y_train is still a
    # pandas Series, hence the plain indexing vs. .iloc.
    x_fold_train, x_fold_val = x_train[train_index], x_train[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]
    history = model.fit(x_fold_train, y_fold_train, epochs=8, batch_size=64, validation_data=(x_fold_val, y_fold_val))
    scores = model.evaluate(x_fold_val, y_fold_val)
    cv_scores.append(scores)
    # Metrics on this fold's training split; overwritten each iteration, so
    # only the last fold's values survive the loop.
    train_metrics = model.evaluate(x_fold_train, y_fold_train)

# Report the cross-validation result: mean and standard deviation of every
# metric over the validation folds. The order follows model.compile():
# loss first, then the metrics in the order they were listed.
metric_names = ["Loss", "Accuracy", "Precision", "Recall", "F1 Score"]
if cv_scores:
    fold_scores = np.asarray(cv_scores, dtype=float)
    cv_mean = fold_scores.mean(axis=0)
    cv_std = fold_scores.std(axis=0)
    print("\n交叉驗證平均績效")
    for metric_name, mean_value, std_value in zip(metric_names, cv_mean, cv_std):
        print(f"{metric_name}: {mean_value} (+/- {std_value})")

# Print the training-set metrics of the last fold.
if train_metrics is not None:
    print("\n訓練集績效")
    print("Loss:", train_metrics[0])
    print("Accuracy:", train_metrics[1])
    print("Precision:", train_metrics[2])
    print("Recall:", train_metrics[3])
    print("F1 Score:", train_metrics[4])


# Print the test-set metrics.
# NOTE(review): `model` here is the model trained in the last fold only, i.e.
# on 4/5 of the training data; consider retraining on the full training set
# before the final test evaluation.
test_results = model.evaluate(x_test, y_test)
print('\n測試集績效')
print("Loss:", test_results[0])
print("Accuracy:", test_results[1])
print("Precision:", test_results[2])
print("Recall:", test_results[3])
print("F1 Score:", test_results[4])


ok
Epoch 1/8
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7776 - f1_score: 0.3038 - loss: 0.7380 - precision: 0.4474 - recall: 0.2797 - val_accuracy: 0.8154 - val_f1_score: 0.6237 - val_loss: 0.4423 - val_precision: 0.6101 - val_recall: 0.6499
Epoch 2/8
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8131 - f1_score: 0.5402 - loss: 0.4318 - precision: 0.6667 - recall: 0.4992 - val_accuracy: 0.8250 - val_f1_score: 0.6162 - val_loss: 0.4074 - val_precision: 0.6529 - val_recall: 0.5947
Epoch 3/8
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8210 - f1_score: 0.5769 - loss: 0.4106 - precision: 0.6902 - recall: 0.5378 - val_accuracy: 0.8262 - val_f1_score: 0.5898 - val_loss: 0.3958 - val_precision: 0.6799 - val_recall: 0.5333
Epoch 4/8
[1m407/407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8178 - f1_score: 0.5614 - loss: 0.4017 - pr

[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8150 - f1_score: 0.5389 - loss: 0.4342 - precision: 0.6812 - recall: 0.4943 - val_accuracy: 0.7945 - val_f1_score: 0.3033 - val_loss: 0.4791 - val_precision: 0.8104 - val_recall: 0.1931
Epoch 3/8
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8206 - f1_score: 0.5568 - loss: 0.4048 - precision: 0.6771 - recall: 0.5139 - val_accuracy: 0.7535 - val_f1_score: 0.6088 - val_loss: 0.4870 - val_precision: 0.4916 - val_recall: 0.8152
Epoch 4/8
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8223 - f1_score: 0.5745 - loss: 0.4002 - precision: 0.6761 - recall: 0.5369 - val_accuracy: 0.7652 - val_f1_score: 0.0586 - val_loss: 0.5940 - val_precision: 0.3660 - val_recall: 0.0323
Epoch 5/8
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8181 - f1_score: 0.5393 - loss: 0.4010 - precision: 0.66