In [2]:
import keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#匯入訓練集
train_data = pd.read_csv('adult.data', header=None) # 如果資料檔案沒有列標題，則將 header 參數設為 None
#更改欄位名稱

train_data.columns = ['age', 'workclass','fnlwgt','education','education-num','marital-status',
                       'occupation','relationship','race','sex','capital-gain','capital-loss',
                      'hours-per-week','native-country','income']

#匯入測試集
test_data = pd.read_csv('adult.test', header=None,skiprows=1)  # 如果資料檔案沒有列標題，則將 header 參數設為 None
#更改欄位名稱

test_data.columns = ['age', 'workclass','fnlwgt','education','education-num','marital-status',
                      'occupation','relationship','race','sex','capital-gain','capital-loss',
                     'hours-per-week','native-country','income']

#處理缺失值

#訓練集
#已知有缺失值欄位為"workclass","occupation","native-country"
missing_columns = ['workclass', 'occupation', 'native-country']
#將"?"取代成NaN
train_data.replace(" ?", np.nan, inplace=True)
#計算名目資料的眾數
column_modes = train_data[missing_columns].mode().iloc[0]
#使用眾數填充缺失值
train_data.fillna(column_modes, inplace=True)

#測試集
#已知有缺失值欄位為"workclass","occupation","native-country"
missing_columns = ['workclass', 'occupation', 'native-country']
#將"?"取代成NaN
test_data.replace(" ?", np.nan, inplace=True)
#計算名目資料的眾數
column_modes = test_data[missing_columns].mode().iloc[0]
#使用眾數填充缺失值
test_data.fillna(column_modes, inplace=True)

In [3]:
#檢查缺失值
train_data_missing = train_data.isnull().sum()
test_data_missing = train_data.isnull().sum()
print(train_data_missing)
print("----------------")
print(test_data_missing)

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64
----------------
age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


In [4]:
#查看資料情況
train_data.info()
print("---------------------------------------------------")
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
---------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
Ran

In [5]:
#檢查兩個資料集的"income"欄位
unique_values = train_data['income'].unique()
print(unique_values)

unique_values = test_data['income'].unique()
print(unique_values)

[' <=50K' ' >50K']
[' <=50K.' ' >50K.']


In [6]:
#把測試集income欄位的"."刪除
test_data['income'] = test_data['income'].str.replace('.', '')
#檢查是否移除
unique_values = test_data['income'].unique()
print(unique_values)

[' <=50K' ' >50K']


  test_data['income'] = test_data['income'].str.replace('.', '')


In [7]:
from sklearn.preprocessing import LabelEncoder

# 訓練集進行 Label Encoding 的列名
columns_to_encode = ['sex', 'income','workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country']

# 建立 LabelEncoder 對象
label_encoder = LabelEncoder()

# 對指定欄位進行 Label Encoding，並將結果取代原始數據
encoded_data = train_data.copy() # 複製原始數據，避免修改原始數據
for column in columns_to_encode:
     # 對目前列進行 Label Encoding
     encoded_column = label_encoder.fit_transform(train_data[column])
     # 將 Label 編碼後的結果取代原始資料中的對應列
     encoded_data[column] = encoded_column

# 替換原始數據
train_data = encoded_data

from sklearn.preprocessing import LabelEncoder

# 測試集進行 Label Encoding 的列名
columns_to_encode = ['sex', 'income','workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country']

# 建立 LabelEncoder 對象
label_encoder = LabelEncoder()

# 對指定欄位進行 Label Encoding，並將結果取代原始數據
encoded_data = test_data.copy() # 複製原始數據，避免修改原始數據
for column in columns_to_encode:
     # 對目前列進行 Label Encoding
     encoded_column = label_encoder.fit_transform(test_data[column])
     # 將 Label 編碼後的結果取代原始資料中的對應列
     encoded_data[column] = encoded_column

# 替換原始數據
test_data = encoded_data

In [8]:
#查看資料情況，確認資料類型轉換完成
train_data.info()
print("---------------------------------------------------")
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             32561 non-null  int64
 1   workclass       32561 non-null  int64
 2   fnlwgt          32561 non-null  int64
 3   education       32561 non-null  int64
 4   education-num   32561 non-null  int64
 5   marital-status  32561 non-null  int64
 6   occupation      32561 non-null  int64
 7   relationship    32561 non-null  int64
 8   race            32561 non-null  int64
 9   sex             32561 non-null  int64
 10  capital-gain    32561 non-null  int64
 11  capital-loss    32561 non-null  int64
 12  hours-per-week  32561 non-null  int64
 13  native-country  32561 non-null  int64
 14  income          32561 non-null  int64
dtypes: int64(15)
memory usage: 3.7 MB
---------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16281 entries, 0 to

In [11]:
# 使用Z-Score正規化
from sklearn.preprocessing import StandardScaler

# 建立標準化器對象
scaler = StandardScaler()

# 對訓練集進行標準化
train_data_normalized_array = scaler.fit_transform(train_data)
train_data_normalized = pd.DataFrame(train_data_normalized_array, columns=train_data.columns)

# 對測試集進行標準化
test_data_normalized_array = scaler.transform(test_data)
test_data_normalized = pd.DataFrame(test_data_normalized_array, columns=test_data.columns)


In [None]:
#移除無關欄位(fnlwgt、capital-gain、capital-loss)
train_data.drop('fnlwgt', axis = 1, inplace = True)
train_data.drop('capital-gain', axis = 1, inplace = True)
train_data.drop('capital-loss', axis = 1, inplace = True)

test_data.drop('fnlwgt', axis = 1, inplace = True)
test_data.drop('capital-gain', axis = 1, inplace = True)
test_data.drop('capital-loss', axis = 1, inplace = True)

# 切割特徵和目標
X_train = train_data.drop("hours-per-week", axis=1)  # 删除 "hours-per-week" 列，得到特徵
y_train = train_data["hours-per-week"]  # 目標為 "hours-per-week" 列
# 測試集
X_test = train_data.drop("hours-per-week", axis=1)  # 删除 "hours-per-week" 列，得到特徵
y_test = train_data["hours-per-week"]  # 目標為 "hours-per-week" 列

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from keras import layers
from keras.models import Sequential
from keras.regularizers import l2
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import time

# 建立模型
def build_regularized_model():
     model = Sequential([
         layers.Dense(128, activation="relu", kernel_regularizer=l2(0.001)),
         layers.Dense(128, activation="relu", kernel_regularizer=l2(0.001)),
         layers.Dense(1)
     ])
     model.compile(optimizer="rmsprop", loss="mse")
     return model

# 定義 mean_absolute_percentage_error 函數
def mean_absolute_percentage_error(y_true, y_pred):
    y_true = np.squeeze(y_true)
    y_pred = np.squeeze(y_pred)
    non_zero_indices = y_true != 0
    return np.mean(np.abs((y_true[non_zero_indices] - y_pred[non_zero_indices]) / y_true[non_zero_indices])) * 100


# 準備資料集
X = X_train  # 特徵數據
y = y_train  # 目標數據

# 選擇 k 折交叉驗證策略
k = 5  # 5 折交叉驗證
kf = KFold(n_splits=k, shuffle=True)

# 執行交叉驗證
all_train_losses = []
all_train_mae = []
all_train_mape = []
all_train_rmse = []

all_test_losses = []
all_test_mae = []
all_test_mape = []
all_test_rmse = []

start_time = time.time()  # 計時開始
# 最終測試
for train_index, test_index in kf.split(X):
    # 建立模型
    model = build_regularized_model()

    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # 訓練模型
    history = model.fit(X_train, y_train, epochs=10, batch_size=32  , verbose=0)

    # 評估模型表現 - 訓練集
    y_train_pred = model.predict(X_train)
    train_loss = mean_squared_error(y_train, y_train_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_mape = mean_absolute_percentage_error(y_train, y_train_pred)
    train_rmse = np.sqrt(train_loss)

    all_train_losses.append(train_loss)
    all_train_mae.append(train_mae)
    all_train_mape.append(train_mape)
    all_train_rmse.append(train_rmse)

    # 評估模型效能 - 測試集
    y_test_pred = model.predict(X_test)
    test_loss = mean_squared_error(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
    test_rmse = np.sqrt(test_loss)

    all_test_losses.append(test_loss)
    all_test_mae.append(test_mae)
    all_test_mape.append(test_mape)
    all_test_rmse.append(test_rmse)

# 計算訓練集和測試集上的平均表現指標
mean_train_loss = np.mean(all_train_losses)
mean_train_mae = np.mean(all_train_mae)
mean_train_mape = np.mean(all_train_mape)
mean_train_rmse = np.mean(all_train_rmse)

mean_test_loss = np.mean(all_test_losses)
mean_test_mae = np.mean(all_test_mae)
mean_test_mape = np.mean(all_test_mape)
mean_test_rmse = np.mean(all_test_rmse)

end_time = time.time()  # 計時結束
elapsed_time = end_time - start_time  # 經過時間
print("Total time elapsed:", elapsed_time, "seconds")

print("Mean Train Loss:", mean_train_loss)
print("Mean Train MAE:", mean_train_mae)
print("Mean Train MAPE:", mean_train_mape)
print("Mean Train RMSE:", mean_train_rmse)
print("----------------------------------")
print("Mean Test Loss:", mean_test_loss)
print("Mean Test MAE:", mean_test_mae)
print("Mean Test MAPE:", mean_test_mape)
print("Mean Test RMSE:", mean_test_rmse)


Total time elapsed: 97.68700242042542 seconds
Mean Train Loss: 124.45881832764562
Mean Train MAE: 7.788790547931647
Mean Train MAPE: 31.513162043338315
Mean Train RMSE: 11.152990995356442
----------------------------------
Mean Test Loss: 124.85290116151312
Mean Test MAE: 7.810329562320163
Mean Test MAPE: 31.67990101199331
Mean Test RMSE: 11.171006279628548


In [None]:
# 設定要列印的行數
num_rows_to_print = 10

# 使用訓練好的模型對測試集進行預測
predictions = model.predict(X_test)

# 列印實際值和預測值
print("實際值\t\t預測值")
for i in range(num_rows_to_print):
    print(f"{y_test.iloc[i]}\t{predictions[i][0]}")

實際值		預測值
40	42.69501495361328
45	45.95379638671875
40	46.025245666503906
50	41.65595245361328
45	31.109182357788086
20	34.52811813354492
60	37.730628967285156
80	42.799598693847656
52	40.377540588378906
40	40.52081298828125
