In [1]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
gas_sensor_array_drift_at_different_concentrations = fetch_ucirepo(id=270) 
  
# data (as pandas dataframes) 
X = gas_sensor_array_drift_at_different_concentrations.data.features 
y = gas_sensor_array_drift_at_different_concentrations.data.targets 
  
# metadata 
print(gas_sensor_array_drift_at_different_concentrations.metadata) 
  
# variable information 
print(gas_sensor_array_drift_at_different_concentrations.variables) 


{'uci_id': 270, 'name': 'Gas Sensor Array Drift at Different Concentrations', 'repository_url': 'https://archive.ics.uci.edu/dataset/270/gas+sensor+array+drift+dataset+at+different+concentrations', 'data_url': 'https://archive.ics.uci.edu/static/public/270/data.csv', 'abstract': 'This archive contains 13910 measurements from 16 chemical sensors exposed to 6 different gases at various concentration levels.', 'area': 'Computer Science', 'tasks': ['Classification', 'Regression', 'Clustering', 'Causa'], 'characteristics': ['Multivariate', 'Time-Series'], 'num_instances': 13910, 'num_features': 128, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2012, 'last_updated': 'Mon Apr 08 2024', 'dataset_doi': '10.24432/C5MK6M', 'creators': ['Alexander Vergara'], 'intro_paper': {'title': 'On the calibration of sensor arrays for pattern recognition using the minimal number

In [6]:
# X 데이터프레임의 각 열의 데이터 타입 확인
print(X.dtypes)

# 문자열이 포함된 열 확인
for column in X.columns:
    if X[column].dtype == 'object':
        print(f"Column '{column}' contains string data: {X[column].unique()[:5]}")

# 문자열이 포함된 열을 숫자로 변환하거나, 필요시 제거
for column in X.columns:
    if X[column].dtype == 'object':
        try:
            # 예: 문자열에서 숫자를 추출하는 방식
            X[column] = X[column].str.extract(r'(\d+\.?\d*)').astype(float)
        except ValueError:
            print(f"Could not convert column '{column}' to float, dropping column.")
            X = X.drop(columns=[column])

# 최종 확인
print(X.dtypes)


Feature1      float64
Feature2      float64
Feature3      float64
Feature4      float64
Feature5      float64
               ...   
Feature123    float64
Feature124    float64
Feature125    float64
Feature126    float64
Feature127    float64
Length: 127, dtype: object
Feature1      float64
Feature2      float64
Feature3      float64
Feature4      float64
Feature5      float64
               ...   
Feature123    float64
Feature124    float64
Feature125    float64
Feature126    float64
Feature127    float64
Length: 127, dtype: object


In [7]:
# 문자열 처리 개선 (예시: LabelEncoder)
from sklearn.preprocessing import LabelEncoder

for column in X.columns:
    if X[column].dtype == 'object':
        le = LabelEncoder()
        X[column] = le.fit_transform(X[column])


In [12]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

nan_cols = X.columns[X.isna().sum() == len(X)]
X = X.drop(nan_cols, axis=1)

# 데이터 스케일링
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)


# 시퀀스 생성 함수 정의
sequence_length = 10

def create_sequences(X, sequence_length):
    sequences = []
    for i in range(X.shape[0] - sequence_length):
        sequence = X[i:i + sequence_length]
        sequences.append(sequence)
    return np.array(sequences)

# 시퀀스 생성
X_seq = create_sequences(X_scaled, sequence_length)
y_seq = y[sequence_length:]

# 훈련, 검증, 테스트 세트로 분할
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42, shuffle=False)

if y_train.ndim == 1:
    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)


In [27]:
import numpy as np
import re

# 숫자 라벨을 추출하는 함수 정의
def extract_numeric_label(label):
    # 바이트형 데이터라면 문자열로 디코딩
    if isinstance(label, bytes):
        label = label.decode('utf-8')
    
    # 콜론(:) 앞의 숫자를 추출
    match = re.match(r"(\d+):", label)
    if match:
        return int(match.group(1))
    else:
        raise ValueError(f"Unexpected label format: {label}")

# y_train과 y_test 배열에 함수 적용
y_train_cleaned = np.array([extract_numeric_label(label) for label in y_train])
y_test_cleaned = np.array([extract_numeric_label(label) for label in y_test])

# 라벨 인코딩 및 원-핫 인코딩 (필요시)
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_cleaned)
y_test_encoded = label_encoder.transform(y_test_cleaned)

y_train_one_hot = to_categorical(y_train_encoded)
y_test_one_hot = to_categorical(y_test_encoded)


TypeError: cannot use a string pattern on a bytes-like object

In [23]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, SimpleRNN, Dropout
from keras.optimizers import RMSprop
from keras.metrics import MeanSquaredError
from keras.callbacks import EarlyStopping


# LSTM 모델 정의
def build_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(optimizer=RMSprop(), loss='mse', metrics=[MeanSquaredError()])
    return model

# GRU 모델 정의
def build_gru_model(input_shape):
    model = Sequential()
    model.add(GRU(50, activation='relu', input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(optimizer=RMSprop(), loss='mse', metrics=[MeanSquaredError()])
    return model

# SimpleRNN(tanh) 모델 정의
def build_rnn_model(input_shape):
    model = Sequential()
    model.add(SimpleRNN(50, activation='relu', input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(optimizer=RMSprop(), loss='mse', metrics=[MeanSquaredError()])
    return model

# 조기 종료 설정
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# 모델 훈련
lstm_model = build_lstm_model((X_train.shape[1], X_train.shape[2]))
gru_model = build_gru_model((X_train.shape[1], X_train.shape[2]))
rnn_model = build_rnn_model((X_train.shape[1], X_train.shape[2]))

lstm_history = lstm_model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), verbose=1, callbacks=[early_stopping])
gru_history = gru_model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), verbose=1, callbacks=[early_stopping])
rnn_history = rnn_model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), verbose=1, callbacks=[early_stopping])


ValueError: invalid literal for int() with base 10: '1:25902.322200'

In [None]:
# 4. 학습 및 검증 손실 시각화
plt.figure(figsize=(12, 8))

# Epochs 기준 그래프
plt.subplot(2, 2, 1)
plt.plot(rnn_history.history['loss'], label='tanh train', color='blue')
plt.plot(rnn_history.history['val_loss'], label='tanh valid', color='blue', linestyle='--')
plt.plot(gru_history.history['loss'], label='GRU train', color='green')
plt.plot(gru_history.history['val_loss'], label='GRU valid', color='green', linestyle='--')
plt.plot(lstm_history.history['loss'], label='LSTM train', color='purple')
plt.plot(lstm_history.history['val_loss'], label='LSTM valid', color='purple', linestyle='--')
plt.yscale('log')
plt.title('Per epoch')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

# 시간 기준 그래프
plt.subplot(2, 2, 2)
plt.plot(np.cumsum(rnn_history.history['loss']), label='tanh train', color='blue')
plt.plot(np.cumsum(rnn_history.history['val_loss']), label='tanh valid', color='blue', linestyle='--')
plt.plot(np.cumsum(gru_history.history['loss']), label='GRU train', color='green')
plt.plot(np.cumsum(gru_history.history['val_loss']), label='GRU valid', color='green', linestyle='--')
plt.plot(np.cumsum(lstm_history.history['loss']), label='LSTM train', color='purple')
plt.plot(np.cumsum(lstm_history.history['val_loss']), label='LSTM valid', color='purple', linestyle='--')
plt.yscale('log')
plt.title('Wall Clock Time (seconds)')
plt.xlabel('Epochs')
plt.ylabel('Cumulative Loss')
plt.legend()

# 두 그래프를 합친 제목 추가
plt.suptitle('Gas Sensor Array Drift at Different Concentrations', fontsize=16)

plt.tight_layout(rect=[0, 0, 1, 0.95])  # suptitle과 그래프가 겹치지 않도록 조정
plt.show()