In [1]:
from ucimlrepo import fetch_ucirepo 
  
# Download the UCI repository dataset with id 360 (Air Quality)
air_quality = fetch_ucirepo(id=360)

# Print the dataset-level metadata first, then the per-variable information
for section in (air_quality.metadata, air_quality.variables):
    print(section)

{'uci_id': 360, 'name': 'Air Quality', 'repository_url': 'https://archive.ics.uci.edu/dataset/360/air+quality', 'data_url': 'https://archive.ics.uci.edu/static/public/360/data.csv', 'abstract': 'Contains the responses of a gas multisensor device deployed on the field in an Italian city. Hourly responses averages are recorded along with gas concentrations references from a certified analyzer. ', 'area': 'Computer Science', 'tasks': ['Regression'], 'characteristics': ['Multivariate', 'Time-Series'], 'num_instances': 9358, 'num_features': 15, 'feature_types': ['Real'], 'demographics': [], 'target_col': None, 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2008, 'last_updated': 'Sun Mar 10 2024', 'dataset_doi': '10.24432/C59K5F', 'creators': ['Saverio Vito'], 'intro_paper': {'title': 'On field calibration of an electronic nose for benzene estimation in an urban pollution monitoring scenario', 'authors': 'S. D. Vito, E. Massera, M. P

In [2]:
# Separate the regression target column from the remaining predictor columns.
# NOTE(review): the UCI description of this dataset reportedly marks missing
# readings with -200 rather than NaN — confirm and handle before modelling.
TARGET_COLUMN = 'C6H6(GT)'
feature_table = air_quality.data.features
y = feature_table[TARGET_COLUMN]
X = feature_table.drop(columns=[TARGET_COLUMN])

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Report which columns hold string (object) data, with a sample of their values
for column in X.columns:
    if X[column].dtype == 'object':
        print(f"Column {column} has string data: {X[column].unique()[:5]}")

# String columns cannot be scaled or fed to the recurrent models, so keep
# only the non-object columns.
X = X.select_dtypes(exclude=['object'])

# Confirm that every remaining column is numeric
print(X.dtypes)


# Feature scaling.
# Fit the scaler on the leading rows only, so the min/max statistics come from
# the training period and nothing from the held-out (latest) rows leaks into
# the training inputs. TRAIN_FRACTION must mirror 1 - test_size of the
# chronological train_test_split further down in this cell (test_size=0.2).
# Rows outside the fitted range may therefore scale slightly outside [0, 1].
TRAIN_FRACTION = 0.8
n_fit_rows = int(len(X) * TRAIN_FRACTION)
scaler = MinMaxScaler()
scaler.fit(X.iloc[:n_fit_rows])
X_scaled = scaler.transform(X)

# Window length (number of consecutive rows) used to build time-series sequences
time_steps = 10

def create_sequences(X, y, time_steps):
    """Build sliding-window samples for sequence models.

    Window ``i`` holds rows ``i .. i + time_steps - 1`` of ``X`` and is paired
    with the target at position ``i + time_steps`` (the row right after the
    window).

    Args:
        X: Array-like of feature rows (first axis is the row/time axis).
        y: Array-like of targets, aligned with ``X`` by position. A pandas
            Series is read positionally, so its index labels are ignored.
        time_steps: Number of consecutive rows in each window.

    Returns:
        Tuple ``(Xs, ys)`` of numpy arrays. ``Xs`` has shape
        ``(n_windows, time_steps, *X.shape[1:])`` and ``ys`` has shape
        ``(n_windows,)``, where ``n_windows = len(X) - time_steps``. When the
        input is too short to form a single window, correctly shaped empty
        arrays are returned.

    Raises:
        IndexError: If ``y`` has fewer entries than ``X``.
    """
    # Convert up front so indexing is purely positional; label-based lookup on
    # a Series with a non-default index would fail or pick the wrong rows.
    features = np.asarray(X)
    targets = np.asarray(y)

    n_windows = len(features) - time_steps
    if n_windows <= 0:
        # Keep the rank stable so callers can still read .shape[1] / .shape[2].
        empty_windows = np.empty((0, time_steps) + features.shape[1:], dtype=features.dtype)
        return empty_windows, np.empty((0,), dtype=targets.dtype)

    if len(targets) < len(features):
        raise IndexError(
            f"y has {len(targets)} entries but X has {len(features)} rows"
        )

    Xs = np.stack([features[start:start + time_steps] for start in range(n_windows)])
    ys = targets[time_steps:time_steps + n_windows].copy()
    return Xs, ys

X_seq, y_seq = create_sequences(X_scaled, y, time_steps)

# Split the windows into train and test sets. shuffle=False keeps the original
# order, so the last 20% of the windows become the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_seq,
    y_seq,
    test_size=0.2,
    random_state=42,
    shuffle=False,
)


Column Date has string data: ['3/10/2004' '3/11/2004' '3/12/2004' '3/13/2004' '3/14/2004']
Column Time has string data: ['18:00:00' '19:00:00' '20:00:00' '21:00:00' '22:00:00']
CO(GT)           float64
PT08.S1(CO)        int64
NMHC(GT)           int64
PT08.S2(NMHC)      int64
NOx(GT)            int64
PT08.S3(NOx)       int64
NO2(GT)            int64
PT08.S4(NO2)       int64
PT08.S5(O3)        int64
T                float64
RH               float64
AH               float64
dtype: object


In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, SimpleRNN, Dropout
from keras.optimizers import RMSprop
from keras.metrics import MeanSquaredError
from keras.callbacks import EarlyStopping

# Define the LSTM model
def build_lstm_model(input_shape):
    """Build and compile a single-layer LSTM regressor trained with RMSprop.

    Architecture: LSTM(50, tanh) -> Dropout(0.2) -> Dense(1).

    Args:
        input_shape: Shape of one input sample, forwarded to the LSTM layer's
            ``input_shape`` argument.

    Returns:
        The compiled ``Sequential`` model (MSE loss, MeanSquaredError metric).
    """
    model = Sequential()
    model.add(LSTM(50, activation='tanh', input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    # Hand the configured optimizer to compile() so that gradient-norm
    # clipping (clipnorm=1.0) is actually applied during training.
    optimizer = RMSprop(learning_rate=0.001, clipnorm=1.0)
    model.compile(optimizer=optimizer, loss='mse', metrics=[MeanSquaredError()])
    return model

# Define the GRU model
def build_gru_model(input_shape):
    """Build and compile a single-layer GRU regressor trained with RMSprop.

    Architecture: GRU(50, tanh) -> Dropout(0.2) -> Dense(1).

    Args:
        input_shape: Shape of one input sample, forwarded to the GRU layer's
            ``input_shape`` argument.

    Returns:
        The compiled ``Sequential`` model (MSE loss, MeanSquaredError metric).
    """
    model = Sequential()
    model.add(GRU(50, activation='tanh', input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    # Hand the configured optimizer to compile() so that gradient-norm
    # clipping (clipnorm=1.0) is actually applied during training.
    optimizer = RMSprop(learning_rate=0.001, clipnorm=1.0)
    model.compile(optimizer=optimizer, loss='mse', metrics=[MeanSquaredError()])
    return model

# Define the SimpleRNN (tanh) model
def build_rnn_model(input_shape):
    """Build and compile a single-layer SimpleRNN regressor trained with RMSprop.

    Architecture: SimpleRNN(50, tanh) -> Dropout(0.2) -> Dense(1).

    Args:
        input_shape: Shape of one input sample, forwarded to the SimpleRNN
            layer's ``input_shape`` argument.

    Returns:
        The compiled ``Sequential`` model (MSE loss, MeanSquaredError metric).
    """
    model = Sequential()
    model.add(SimpleRNN(50, activation='tanh', input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    # Hand the configured optimizer to compile() so that gradient-norm
    # clipping (clipnorm=1.0) is actually applied during training.
    optimizer = RMSprop(learning_rate=0.001, clipnorm=1.0)
    model.compile(optimizer=optimizer, loss='mse', metrics=[MeanSquaredError()])
    return model

# Early stopping: stop once val_loss has not improved for 5 epochs and
# restore the weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Fraction of the training windows held out for validation. Keras takes this
# slice from the end of the arrays passed to fit(), before any shuffling.
# Early stopping monitors this slice instead of X_test / y_test, so the test
# set is not used to pick the stopping epoch of the models it later scores.
VALIDATION_SPLIT = 0.1

# Build the three models on the same input shape taken from the training windows
input_shape = (X_train.shape[1], X_train.shape[2])
lstm_model = build_lstm_model(input_shape)
gru_model = build_gru_model(input_shape)
rnn_model = build_rnn_model(input_shape)

# Train the models with identical settings
fit_options = dict(
    epochs=100,
    batch_size=32,
    validation_split=VALIDATION_SPLIT,
    verbose=1,
    callbacks=[early_stopping],
)
lstm_history = lstm_model.fit(X_train, y_train, **fit_options)
gru_history = gru_model.fit(X_train, y_train, **fit_options)
rnn_history = rnn_model.fit(X_train, y_train, **fit_options)


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Run predictions on the test windows with each trained model
y_pred_lstm = lstm_model.predict(X_test)
y_pred_gru = gru_model.predict(X_test)
y_pred_rnn = rnn_model.predict(X_test)
predictions = (y_pred_lstm, y_pred_gru, y_pred_rnn)

# Mean squared error per model (LSTM, GRU, RNN order)
lstm_mse, gru_mse, rnn_mse = [mean_squared_error(y_test, pred) for pred in predictions]

# Mean absolute error per model
lstm_mae, gru_mae, rnn_mae = [mean_absolute_error(y_test, pred) for pred in predictions]

# R^2 score per model
lstm_r2, gru_r2, rnn_r2 = [r2_score(y_test, pred) for pred in predictions]

print(f"LSTM MSE: {lstm_mse}, MAE: {lstm_mae}, R^2: {lstm_r2}")
print(f"GRU MSE: {gru_mse}, MAE: {gru_mae}, R^2: {gru_r2}")
print(f"RNN MSE: {rnn_mse}, MAE: {rnn_mae}, R^2: {rnn_r2}")


In [None]:
# 4. Visualize training and validation loss

import matplotlib.pyplot as plt

# (training history, legend prefix, line colour) for each model, in plotting order
loss_curves = [
    (rnn_history, 'tanh', 'blue'),
    (gru_history, 'GRU', 'green'),
    (lstm_history, 'LSTM', 'purple'),
]

# One row with two panels: per-epoch loss and cumulative loss
fig, (ax_epoch, ax_cumulative) = plt.subplots(1, 2, figsize=(12, 5))

for history, label, color in loss_curves:
    train_loss = history.history['loss']
    valid_loss = history.history['val_loss']

    # Solid line = training loss, dashed line = validation loss
    ax_epoch.plot(train_loss, label=f'{label} train', color=color)
    ax_epoch.plot(valid_loss, label=f'{label} valid', color=color, linestyle='--')

    # Running sum of the per-epoch losses
    ax_cumulative.plot(np.cumsum(train_loss), label=f'{label} train', color=color)
    ax_cumulative.plot(np.cumsum(valid_loss), label=f'{label} valid', color=color, linestyle='--')

ax_epoch.set_yscale('log')
ax_epoch.set_title('Per epoch')
ax_epoch.set_xlabel('Epochs')
ax_epoch.set_ylabel('Loss')
ax_epoch.legend()

# This panel shows cumulative loss over epochs; no timing data is recorded,
# so it is titled by what it actually plots.
ax_cumulative.set_yscale('log')
ax_cumulative.set_title('Cumulative loss')
ax_cumulative.set_xlabel('Epochs')
ax_cumulative.set_ylabel('Cumulative Loss')
ax_cumulative.legend()

# Shared title for both panels
fig.suptitle('Air Quality [Optimize: RMSprop]', fontsize=16)

fig.tight_layout(rect=[0, 0, 1, 0.95])  # leave room so the suptitle does not overlap the panels
plt.show()

In [None]:
# Define the LSTM model (Adam optimizer)
def build_lstm_model(input_shape):
    """Return a compiled LSTM(50, tanh) -> Dropout(0.2) -> Dense(1) regressor.

    The model is compiled with the 'adam' optimizer, MSE loss and a
    MeanSquaredError metric. ``input_shape`` is forwarded to the LSTM layer.
    """
    layer_stack = [
        LSTM(50, activation='tanh', input_shape=input_shape),
        Dropout(0.2),
        Dense(1),
    ]
    model = Sequential(layer_stack)
    model.compile(optimizer='adam', loss='mse', metrics=[MeanSquaredError()])
    return model

# Define the GRU model (Adam optimizer)
def build_gru_model(input_shape):
    """Return a compiled GRU(50, tanh) -> Dropout(0.2) -> Dense(1) regressor.

    The model is compiled with the 'adam' optimizer, MSE loss and a
    MeanSquaredError metric. ``input_shape`` is forwarded to the GRU layer.
    """
    layer_stack = [
        GRU(50, activation='tanh', input_shape=input_shape),
        Dropout(0.2),
        Dense(1),
    ]
    model = Sequential(layer_stack)
    model.compile(optimizer='adam', loss='mse', metrics=[MeanSquaredError()])
    return model

# Define the SimpleRNN (tanh) model (Adam optimizer)
def build_rnn_model(input_shape):
    """Return a compiled SimpleRNN(50, tanh) -> Dropout(0.2) -> Dense(1) regressor.

    The model is compiled with the 'adam' optimizer, MSE loss and a
    MeanSquaredError metric. ``input_shape`` is forwarded to the SimpleRNN layer.
    """
    layer_stack = [
        SimpleRNN(50, activation='tanh', input_shape=input_shape),
        Dropout(0.2),
        Dense(1),
    ]
    model = Sequential(layer_stack)
    model.compile(optimizer='adam', loss='mse', metrics=[MeanSquaredError()])
    return model

# Early stopping: stop once val_loss has not improved for 5 epochs and
# restore the weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Fraction of the training windows held out for validation. Keras takes this
# slice from the end of the arrays passed to fit(), before any shuffling.
# Early stopping monitors this slice instead of X_test / y_test, so the test
# set is not used to pick the stopping epoch of the models it later scores.
VALIDATION_SPLIT = 0.1

# Build the three models on the same input shape taken from the training windows
input_shape = (X_train.shape[1], X_train.shape[2])
lstm_model = build_lstm_model(input_shape)
gru_model = build_gru_model(input_shape)
rnn_model = build_rnn_model(input_shape)

# Train the models with identical settings
fit_options = dict(
    epochs=100,
    batch_size=32,
    validation_split=VALIDATION_SPLIT,
    verbose=1,
    callbacks=[early_stopping],
)
lstm_history = lstm_model.fit(X_train, y_train, **fit_options)
gru_history = gru_model.fit(X_train, y_train, **fit_options)
rnn_history = rnn_model.fit(X_train, y_train, **fit_options)


In [None]:
# 4. Visualize training and validation loss

import matplotlib.pyplot as plt

# (training history, legend prefix, line colour) for each model, in plotting order
loss_curves = [
    (rnn_history, 'tanh', 'blue'),
    (gru_history, 'GRU', 'green'),
    (lstm_history, 'LSTM', 'purple'),
]

# One row with two panels: per-epoch loss and cumulative loss
fig, (ax_epoch, ax_cumulative) = plt.subplots(1, 2, figsize=(12, 5))

for history, label, color in loss_curves:
    train_loss = history.history['loss']
    valid_loss = history.history['val_loss']

    # Solid line = training loss, dashed line = validation loss
    ax_epoch.plot(train_loss, label=f'{label} train', color=color)
    ax_epoch.plot(valid_loss, label=f'{label} valid', color=color, linestyle='--')

    # Running sum of the per-epoch losses
    ax_cumulative.plot(np.cumsum(train_loss), label=f'{label} train', color=color)
    ax_cumulative.plot(np.cumsum(valid_loss), label=f'{label} valid', color=color, linestyle='--')

ax_epoch.set_yscale('log')
ax_epoch.set_title('Per epoch')
ax_epoch.set_xlabel('Epochs')
ax_epoch.set_ylabel('Loss')
ax_epoch.legend()

# This panel shows cumulative loss over epochs; no timing data is recorded,
# so it is titled by what it actually plots.
ax_cumulative.set_yscale('log')
ax_cumulative.set_title('Cumulative loss')
ax_cumulative.set_xlabel('Epochs')
ax_cumulative.set_ylabel('Cumulative Loss')
ax_cumulative.legend()

# Shared title for both panels
fig.suptitle('Air Quality [Optimize: adam]', fontsize=16)

fig.tight_layout(rect=[0, 0, 1, 0.95])  # leave room so the suptitle does not overlap the panels
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Run predictions on the test windows with each trained model
y_pred_lstm = lstm_model.predict(X_test)
y_pred_gru = gru_model.predict(X_test)
y_pred_rnn = rnn_model.predict(X_test)
predictions = (y_pred_lstm, y_pred_gru, y_pred_rnn)

# Mean squared error per model (LSTM, GRU, RNN order)
lstm_mse, gru_mse, rnn_mse = [mean_squared_error(y_test, pred) for pred in predictions]

# Mean absolute error per model
lstm_mae, gru_mae, rnn_mae = [mean_absolute_error(y_test, pred) for pred in predictions]

# R^2 score per model
lstm_r2, gru_r2, rnn_r2 = [r2_score(y_test, pred) for pred in predictions]

print(f"LSTM MSE: {lstm_mse}, MAE: {lstm_mae}, R^2: {lstm_r2}")
print(f"GRU MSE: {gru_mse}, MAE: {gru_mae}, R^2: {gru_r2}")
print(f"RNN MSE: {rnn_mse}, MAE: {rnn_mae}, R^2: {rnn_r2}")
