<a href="https://colab.research.google.com/github/YMGYM/TSE_Learning/blob/master/Beijing_air_plollution_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

이 파일은 황철현, 신강욱의
`미세먼지 예측 성능 개선을 위한 CNN-LSTM 결합 방법`
논문의 구현 연습 파일입니다.

데이터셋은 [Beijing PM2.5 데이터셋](https://www.kaggle.com/djhavera/beijing-pm25-data-data-set)
을 사용했습니다.

# Import All
필요한 라이브러리들을 임포트합니다.

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as K
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Load Data
구글 드라이브를 마운트하고, 구글 드라이브에 있는 데이터를 임포트합니다.

In [3]:
! unzip /content/drive/My\ Drive/Datasets/beijing_air.zip -d data

Archive:  /content/drive/My Drive/Datasets/beijing_air.zip
  inflating: data/PRSA_data_2010.1.1-2014.12.31.csv  


In [4]:
def get_data():
  all_data = pd.read_csv('/content/data/PRSA_data_2010.1.1-2014.12.31.csv') # 전체 데이터
  dropped_data = all_data.drop(['No', 'year', 'month', 'day', 'hour'],axis=1) # 필요 없는 데이터는 버림
  pm25 = dropped_data.pop('pm2.5') # 미세먼지 데이터 확인
  pm25 = pm25.fillna(method='pad')
  
  return pm25, dropped_data

# Define Scalers
데이터를 정규화할 스케일러를 구현합니다.

In [115]:
class Scaler():
  def __init__(self):
    self.scaler = MinMaxScaler()
    self.last_scaled_data = None

  def data_normalize(self, data = None):
    if isinstance(data, type(np.array([]))) == False:
      reshaped = data.to_numpy().reshape(-1,1) # x 가 pandas 데이터인 경우 numpy로 변환 후 reshape 한다.
    else:
      reshaped = data.reshape(-1,1)

    self.last_scaled_data = self.scaler.fit_transform(reshaped)

    return self.last_scaled_data

  def invert_scale(self, data):
    # 정규화된 데이터를 원상태로 돌립니다.
    return self.scaler.inverse_transform(data)

  def slice_data(self, data, rate=0.1): # 데이터를 정해진 비율로 나눕니다.
    arrlen = int(len(data) * rate)
    train, val, test = data[:-1 * (arrlen * 2)], data[-1 * (arrlen * 2) : -1 * (arrlen)], data[-1 * (arrlen):]
    return train, val, test

## Pm2.5 Data Scaler
미세먼지 데이터를 위한 Scaler 구현입니다. 기능은 동일합니다.

In [144]:
class PmScaler(Scaler):
  def __init__(self, pm25, data_len = 2):
    super().__init__()

## Proxy Data Scaler
주위 환경 정보용 Scaler 구현입니다.

In [230]:
class ProxyScaler(Scaler):
  def __init__(self, data):
    super().__init__()
    # self.table = data
  
  def change_cbwd_data(self, table): # 문자열로 된 풍향 정보를 숫자로 변환합니다.
    mapping = {}
    cols = table["cbwd"].value_counts().index
    for i, col in enumerate(cols):
      mapping[col] = i
    table = table.replace({'cbwd' : mapping})
    print("cbwd data changed to number : {SE : 0, NW: 1, cv: 2, NE:3} ")

    return table

  def data_normalize(self, data):
    norm_data = self.scaler.fit_transform(data)
    return norm_data
    

# Make Custom Layer
CNN 계층의 출력을 비율로 변환해주는 커스텀 계층입니다.

In [194]:
class DataCombine(K.layers.Layer):
  def __init__(self, data, bins = None, **kwargs):
    super().__init__(**kwargs)

    if bins == None:
      self.bins = [-9.166667e-02, -1e-15,1e-15, 1.212121e-01]
    else:
      self.bins = bins

    self.grad_data = pm25.pct_change().fillna(method="pad")
    
  def call(self, inputs):
    # 예측량을 구함
    rate = self._get_rate(data)
    return rate * value.squeeze(axis=1)

  def _get_rate(self, data):
    # 변화율 별로 pm25의 예측량을 구해봄(임시)
    index = data.argmax(axis=1)
    rate = np.array([])
    for i in range(len(index)):
      if index[i] == 3:
        rate = np.append(rate, 1)
      else:
        rate = np.append(rate, 1 + ((index[i]-3) * 0.25)) # 최대 50%의 변화율을 줘 봄
    return rate

  def _get_grad_pm25(self):
    # pm25 의 변화율을 구하고 범주화함
    grad_data = self.pm25.pct_change()
    grad_data = grad_data.fillna(method="pad")
    bins = [-9.166667e-02, -1e-15,1e-15, 1.212121e-01]
    # bins = [-10, -1e-15, 1e-15, 10]
    grad_level = np.digitize(grad_data, bins=bins, right=False)
    return grad_level


# Custom Generator
데이터 제공을 위한 커스텀 제너레이터 구현

In [470]:
class CustomGenerator(K.utils.Sequence):
  def __init__(self, cnn_gen, lstm_gen):
    self.cnn_gen = cnn_gen
    self.lstm_gen = lstm_gen

  def __getitem__(self, idx):
    cnn_x, cnn_y = self.cnn_gen[idx]
    lstm_x, lstm_y = self.lstm_gen[idx]
    
    return [cnn_x, lstm_x], lstm_y

  def __len__(self):
    return self.cnn_gen.__len__() #위험!

# Make Model Trainer
모델을 학습시켜주는 Trainer 클래스입니다.

In [471]:
class ModelTrainer():
  def __init__(self, pm25, proxy):
    # --------- 데이터 보관 --------
    self.pm25 = pm25.fillna(1e-8,limit=1).fillna(method="pad")
    self.proxy = proxy
    print(f"pm25 length : {len(self.pm25)}" )
    print(f"proxydata length : {len(self.proxy)}" )

    # --------- 스케일러 생성 --------
    self.pmScaler = PmScaler(self.pm25)
    self.proxyScaler = ProxyScaler(self.proxy)

    # ---------- train 용 callbacks -----------
    self.callbacks = [K.callbacks.TensorBoard(log_dir='model_logs')]


  def data_preprocess(self, rate = 0.1): # 데이터 전처리
    self.proxy = self.proxyScaler.change_cbwd_data(self.proxy)
    # -------- 데이터 정규화 ---------
    self.pm25 = self.pmScaler.data_normalize(self.pm25)
    self.proxy = self.proxyScaler.data_normalize(self.proxy)
    # -------- 데이터 분할 ----------
    self.cnn_train, self.cnn_val, self.cnn_test = self.proxyScaler.slice_data(data=self.proxy, rate = rate)
    self.lstm_train, self.lstm_val, self.lstm_test = self.pmScaler.slice_data(data=self.pm25, rate = rate)
    print(f"cnn_x_train : {len(self.cnn_train)}, cnn_x_val : {len(self.cnn_val)}, cnn_x_test length: {len(self.cnn_test)}")
    print(f"lstm_x_train : {len(self.lstm_train)}, lstm_x_val : {len(self.lstm_val)}, lstm_x_test length: {len(self.lstm_test)}")

  def make_datagenerator(self, lstm_data_len=15, cnn_data_len=2, batch_size = 1): # 데이터 제너레이터 생성

    self.cnn_train_data_gen = K.preprocessing.sequence.TimeseriesGenerator(self.cnn_train, self.cnn_train, length=cnn_data_len, batch_size = batch_size, shuffle=True)
    self.cnn_val_data_gen = K.preprocessing.sequence.TimeseriesGenerator(self.cnn_val, self.cnn_val, length=cnn_data_len, batch_size = batch_size, shuffle=True)
    self.cnn_test_data_gen = K.preprocessing.sequence.TimeseriesGenerator(self.cnn_test, self.cnn_test, length=cnn_data_len, batch_size = 1, shuffle=False)
    print(f"cnn_train_data_gen length : {self.cnn_train_data_gen.__len__()} // data_len: {cnn_data_len}" )
    print(f"cnn_val_data_gen length : {self.cnn_val_data_gen.__len__()} // data_len: {cnn_data_len}" )
    print(f"cnn_test_data_gen length : {self.cnn_test_data_gen.__len__()} // data_len: {cnn_data_len}" )

    self.lstm_train_data_gen = K.preprocessing.sequence.TimeseriesGenerator(self.lstm_train, self.lstm_train, length=lstm_data_len, batch_size = batch_size, shuffle=True)
    self.lstm_val_data_gen = K.preprocessing.sequence.TimeseriesGenerator(self.lstm_val, self.lstm_val, length=lstm_data_len, batch_size = batch_size, shuffle=True)
    self.lstm_test_data_gen = K.preprocessing.sequence.TimeseriesGenerator(self.lstm_test, self.lstm_test, length=lstm_data_len, batch_size = 1, shuffle=False)
    print(f"lstm_train_data_gen length : {self.lstm_train_data_gen.__len__()} // data_len: {lstm_data_len}" )
    print(f"lstm_val_data_gen length : {self.lstm_val_data_gen.__len__()} // data_len: {lstm_data_len}" )
    print(f"lstm_test_data_gen length : {self.lstm_test_data_gen.__len__()} // data_len: {lstm_data_len}" )

    self.total_train_data_gen = CustomGenerator(self.cnn_train_data_gen, self.lstm_train_data_gen)
    self.total_val_data_gen = CustomGenerator(self.cnn_val_data_gen, self.lstm_val_data_gen)
    self.total_test_data_gen = CustomGenerator(self.cnn_test_data_gen, self.lstm_test_data_gen)
    print("total data generator generated")

  def _make_generator(self, cnn_gen, lstm_gen): #모델 학습을 위한 더미 제너레이터 생성
    return CustomGenerator(cnn_gen, lstm_gen)
    
  def generate_model(self): # 모델을 생성함
    cnn_x, cnn_y = self.cnn_train_data_gen[0]
    lstm_x, lstm_y = self.lstm_train_data_gen[0]

    cnn_input = K.layers.Input(shape=(1, cnn_x.shape[1], cnn_x.shape[2])) # (차원 수, 행, 열)
    lstm_input = K.layers.Input(shape=(lstm_x.shape[0], lstm_x.shape[1])) # (시간 수, 1)
    cnn1 = K.layers.Conv2DTranspose(32, (2,2), activation="relu")(cnn_input)
    maxpool1 = K.layers.MaxPool2D(strides=2)(cnn1)
    flatten = K.layers.Flatten()(maxpool1)
    dropout1 = K.layers.Dropout(0.1)(flatten)
    dense1 = K.layers.Dense(100, activation="relu")(dropout1)
    dense2 = K.layers.Dense(1, activation="sigmoid")(dense1)

    reshape = K.layers.Reshape(target_shape=(1,-1))(dense2)
    concat = K.layers.Concatenate()([lstm_input, reshape])

    lstm1 = K.layers.LSTM(216)(concat)
    dropout2 = K.layers.Dropout(0.3)(lstm1)
    dense3 = K.layers.Dense(128, activation='relu')(dropout2)
    dropout3 = K.layers.Dropout(0.3)(dense3)
    dense4 = K.layers.Dense(1, activation="sigmoid")(dropout3)

    self.model = K.models.Model(inputs=[cnn_input,lstm_input] , outputs=dense4)
    self.model.summary()

    self.model.compile(loss="RMSE", optimizer = "adam")
    return self.model

  def fit(self, epochs=1): # 모델 학습
    self.model.fit(x = self.total_train_data_gen, epochs=epochs, shuffle=True, callbacks=self.callbacks)


In [472]:
pm25, proxy = get_data()

In [473]:
trainer = ModelTrainer(pm25[:1000], proxy[:1000])

pm25 length : 1000
proxydata length : 1000


In [474]:
trainer.data_preprocess(rate=0.1)

cbwd data changed to number : {SE : 0, NW: 1, cv: 2, NE:3} 
cnn_x_train : 800, cnn_x_val : 100, cnn_x_test length: 100
lstm_x_train : 800, lstm_x_val : 100, lstm_x_test length: 100


In [475]:
trainer.make_datagenerator(lstm_data_len = 15, cnn_data_len=15, batch_size = 1)

cnn_train_data_gen length : 785 // data_len: 15
cnn_val_data_gen length : 85 // data_len: 15
cnn_test_data_gen length : 85 // data_len: 15
lstm_train_data_gen length : 785 // data_len: 15
lstm_val_data_gen length : 85 // data_len: 15
lstm_test_data_gen length : 85 // data_len: 15
total data generator generated


In [476]:
model = trainer.generate_model()

Model: "functional_42"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_59 (InputLayer)           [(None, 1, 15, 7)]   0                                            
__________________________________________________________________________________________________
conv2d_transpose_29 (Conv2DTran (None, 2, 16, 32)    928         input_59[0][0]                   
__________________________________________________________________________________________________
max_pooling2d_28 (MaxPooling2D) (None, 1, 8, 32)     0           conv2d_transpose_29[0][0]        
__________________________________________________________________________________________________
flatten_28 (Flatten)            (None, 256)          0           max_pooling2d_28[0][0]           
______________________________________________________________________________________

In [477]:
trainer.fit(epochs=1)



ValueError: ignored

In [481]:
trainer.cnn_train_data_gen[9999999]

(array([[[0.08      , 0.7037037 , 0.51851852, 0.        , 0.30384113,
          0.        , 0.        ],
         [0.2       , 0.7037037 , 0.55555556, 0.        , 0.33079937,
          0.        , 0.        ],
         [0.28      , 0.66666667, 0.55555556, 0.        , 0.34426175,
          0.        , 0.        ],
         [0.24      , 0.66666667, 0.55555556, 0.        , 0.35025619,
          0.        , 0.        ],
         [0.24      , 0.55555556, 0.59259259, 0.33333333, 0.00147349,
          0.        , 0.        ],
         [0.24      , 0.62962963, 0.55555556, 0.        , 0.01195539,
          0.        , 0.        ],
         [0.2       , 0.62962963, 0.55555556, 0.        , 0.02243729,
          0.        , 0.        ],
         [0.2       , 0.66666667, 0.55555556, 0.        , 0.03891363,
          0.        , 0.        ],
         [0.24      , 0.55555556, 0.55555556, 0.        , 0.04939553,
          0.        , 0.        ],
         [0.24      , 0.55555556, 0.51851852, 0.6666666