<a href="https://colab.research.google.com/github/YMGYM/TSE_Learning/blob/master/Beijing_air_plollution_2(final).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

이 파일은 황철현, 신강욱의
`미세먼지 예측 성능 개선을 위한 CNN-LSTM 결합 방법`
논문의 구현 연습 파일입니다.

데이터셋은 [Beijing PM2.5 데이터셋](https://www.kaggle.com/djhavera/beijing-pm25-data-data-set)
을 사용했습니다.

# Import All

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as K
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Load Data

In [None]:
! unzip /content/drive/My\ Drive/Datasets/beijing_air.zip -d data

Archive:  /content/drive/My Drive/Datasets/beijing_air.zip
  inflating: data/PRSA_data_2010.1.1-2014.12.31.csv  


In [None]:
def get_data(path='/content/data/PRSA_data_2010.1.1-2014.12.31.csv'):
  """Load the Beijing PM2.5 CSV and separate the target from the other columns.

  Args:
    path: Location of the CSV file. Defaults to the path used by this
      notebook, so existing ``get_data()`` calls keep working.

  Returns:
    A ``(pm25, features)`` tuple: the ``pm2.5`` column as a Series, and a
    DataFrame with the remaining columns after the row counter and calendar
    columns have been dropped.
  """
  all_data = pd.read_csv(path)  # full data set
  # The row counter and calendar columns are not used downstream.
  features = all_data.drop(columns=['No', 'year', 'month', 'day', 'hour'])
  pm25 = features.pop('pm2.5')  # pop() also removes the target from `features`

  return pm25, features

In [None]:
# Load the PM2.5 target series (`pm25`) and the remaining feature columns (`proxy`).
pm25, proxy = get_data()

# NaN Data fix

In [None]:
# Forward-fill gaps with the last observed value, then back-fill so that NaNs
# at the very start of the series (which have no earlier value to copy) are
# filled as well. `ffill()`/`bfill()` replace the deprecated
# `fillna(method=...)` form.
pm25 = pm25.ffill().bfill()

# Make Normalized Dataset

In [None]:
class PmScaler:
  """Min-max scales a 1-D series and splits it, in order, into train/val/test."""

  def __init__(self):
    self.scaler = MinMaxScaler()

  def make_normalized_dataset(self, x, rate):
    """Scale ``x`` and cut it into train, validation and test parts.

    The scaler is fitted on the complete series before it is split.

    Args:
      x: The values to scale (pandas Series, NumPy array or plain sequence).
      rate: Fraction of ``len(x)`` used for the validation set and, again,
        for the test set; both are taken from the end of the series.

    Returns:
      ``(train, val, test)`` as column vectors, in the original order.
    """
    arrlen = int(len(x) * rate)
    # np.asarray accepts Series, arrays and lists alike.
    reshaped = np.asarray(x).reshape(-1, 1)
    scaled_data = self.scaler.fit_transform(reshaped)

    # Split with non-negative offsets. Slicing with a negated length breaks
    # when arrlen is 0 (`a[:-0]` is empty and `a[-0:]` is everything), which
    # would hand the whole series to the test set.
    total = len(scaled_data)
    val_start = max(total - 2 * arrlen, 0)
    test_start = max(total - arrlen, 0)

    train = scaled_data[:val_start]
    val = scaled_data[val_start:test_start]
    test = scaled_data[test_start:]

    return train, val, test

  # Original (misspelled) method name, kept so existing callers keep working.
  make_norlized_dataset = make_normalized_dataset

  def invert_scale(self, x):
    """Map scaled values back to the original range using the fitted scaler."""
    return self.scaler.inverse_transform(x)

In [None]:
# Min-max scale the PM2.5 series and split it in order: with rate=0.1 the last
# 10% becomes the test set and the 10% before that the validation set.
scaler = PmScaler()
lstm_x_train, lstm_x_val, lstm_x_test = scaler.make_norlized_dataset(pm25, 0.1)

In [None]:
class ProxyDataScaler():
  """Prepares the auxiliary (proxy) feature table.

  Steps offered: encode the ``cbwd`` column as integers, min-max normalise the
  table, cut it into overlapping windows, and split the windows into
  train/val/test.
  """

  def __init__(self, data):
    self.table = data
    self.scaler = MinMaxScaler()
    # Created up front so later steps can check "not computed yet" instead of
    # failing with AttributeError.
    self.norm_data = None
    self.sliced_data = None

  def change_cbwd_data(self):
    """Replace the string values of the ``cbwd`` column with integer codes.

    Codes follow the order returned by ``value_counts()`` on the column, so
    the actual mapping depends on the data; it is printed for reference.
    """
    cols = self.table["cbwd"].value_counts().index
    mapping = {col: i for i, col in enumerate(cols)}
    self.table = self.table.replace({'cbwd' : mapping})
    # Report the mapping that was really applied rather than a fixed text.
    print("cbwd data changed to number : {} ".format(mapping))

  def make_normalize_data(self):
    """Fit the min-max scaler on the whole table and return the scaled array."""
    self.norm_data = self.scaler.fit_transform(self.table)
    return self.norm_data

  def slice_proxy_data(self, time):
    """Cut the data into overlapping windows of ``time`` consecutive rows.

    Uses the normalised data when available, otherwise the raw table.

    Args:
      time: Number of rows per window.

    Returns:
      float32 array of shape ``(len(data) - time + 1, time, n_columns)``;
      it has zero windows when the data is shorter than ``time``.
    """
    if self.norm_data is not None: # check whether normalised data exists
      print("norm_data detected")
      source = self.norm_data
    else:
      print("norm_data not detected")
      source = self.table

    # np.asarray covers both DataFrame and ndarray inputs.
    data = np.asarray(source).astype("float32")
    if data.ndim == 1:
      data = data[:, None]

    # Build every window with a single fancy-indexing step: row i of `index`
    # holds the positions i .. i+time-1. This avoids growing the result with
    # vstack once per window, which copies the whole array each time.
    n_windows = max(len(data) - time + 1, 0)
    index = np.arange(n_windows)[:, None] + np.arange(time)[None, :]
    self.sliced_data = data[index]
    return self.sliced_data

  def split_data(self, data = None, rate = 0.1):
    """Split windowed data, in order, into train, validation and test parts.

    Args:
      data: Array to split. Defaults to the result of the last
        ``slice_proxy_data`` call.
      rate: Fraction of the samples for validation and, again, for test;
        both are taken from the end.

    Returns:
      ``(train, val, test)``.

    Raises:
      ValueError: If ``data`` is omitted and ``slice_proxy_data`` has not
        been called yet.
    """
    if data is None:
      data = self.sliced_data
    if data is None:
      raise ValueError("no data to split: call slice_proxy_data() first or pass data")

    # Non-negative offsets keep the split correct when arrlen is 0
    # (`data[:-0]` would be empty and `data[-0:]` would be everything).
    total = len(data)
    arrlen = int(total * rate)
    val_start = max(total - 2 * arrlen, 0)
    test_start = max(total - arrlen, 0)

    train, val, test = data[:val_start], data[val_start:test_start], data[test_start:]

    return train, val, test

# New LSTM Data generator

In [None]:
class LSTMInputGenerator(K.utils.Sequence):
  """Keras Sequence that appends a CNN-derived value to every LSTM input window.

  Args:
    lstm_x: Series the input windows are taken from.
    lstm_y: Series the targets are taken from.
    data_len: Number of time steps per input window.
    cnn_output: Indexable collection; item ``i`` is appended to window ``i``.
  """

  def __init__(self, lstm_x, lstm_y, data_len, cnn_output):
    super().__init__()
    # Pass `lstm_y` as the targets. It used to be ignored in favour of
    # `lstm_x`; callers in this file pass the same array for both, so their
    # results do not change.
    self.lstm_data_gen = K.preprocessing.sequence.TimeseriesGenerator(lstm_x, lstm_y, batch_size=1, length=data_len, shuffle=False)
    self.cnn_output = cnn_output

  def __getitem__(self, index):
    """Return ``(inputs, target)`` for sample ``index`` (batch size 1)."""
    lstm_x, lstm_y = self.lstm_data_gen[index]
    # Shape the CNN-derived value(s) as (1, k, 1) and append them along the
    # time axis (axis 1) of the window.
    extra_step = np.reshape(self.cnn_output[index], (1, -1, 1))
    return_x = np.concatenate((lstm_x, extra_step), axis=1)
    return return_x, lstm_y

  def __len__(self):
    return len(self.lstm_data_gen)

# Entire Model

In [None]:
class EntireModel():
  """CNN + LSTM pipeline for PM2.5 prediction.

  A CNN is trained on windows of the proxy features to predict a PM2.5 trend
  class; its output is turned into one extra input value per sample for an
  LSTM that runs on the scaled PM2.5 series.

  Expected call order (each step uses attributes created by the previous one):
  ``make_proxy_data_generator`` -> ``cnn_model_fit`` ->
  ``make_lstm_data_generator`` -> ``lstm_model_fit`` -> ``total_model_test``.
  """

  # Window length assumed when the CNN is first built in ``__init__``; equal
  # to the default ``data_len`` of ``make_proxy_data_generator``.
  _DEFAULT_CNN_WINDOW = 2

  def __init__(self, pm25, proxydata):
    """Create the scalers, both models and the training callbacks.

    Args:
      pm25: PM2.5 series (``pct_change`` is called on it later).
      proxydata: DataFrame of auxiliary features containing a ``cbwd`` column.
    """
    # ----------- data and scaler objects -----------
    self.pm25 = pm25
    self.proxydata = proxydata
    self.pmScaler = PmScaler()
    self.proxyScaler = ProxyDataScaler(proxydata)

    # ----------- models -----------
    # CNN input is (generator length 1, window length, number of columns).
    self._cnn_input_shape = (1, self._DEFAULT_CNN_WINDOW, len(proxydata.columns))
    self.cnn_model = self._get_cnn_model(self._cnn_input_shape)
    self.lstm_model = self._get_lstm_model()

    # ---------- callbacks used during training -----------
    self.lstm_callbacks = [K.callbacks.TensorBoard(log_dir='lstm_logs')]
    self.cnn_callbacks = [K.callbacks.TensorBoard(log_dir='cnn_logs')]

  def make_proxy_data_generator(self, data_len = 2):
    """Build the CNN train/val/test generators from the proxy features.

    Args:
      data_len: Number of consecutive rows per CNN input window.
    """
    # ---------- proxy data preprocessing -----------
    self.proxyScaler.change_cbwd_data()
    self.proxyScaler.make_normalize_data()
    self.proxyScaler.slice_proxy_data(data_len)
    cnn_x_train, cnn_x_val, cnn_x_test = self.proxyScaler.split_data()

    # ---------- CNN targets: PM2.5 trend classes -----------
    self._get_grad_pm25()
    # Pass the integer levels: `_cnn_y_split` one-hot encodes them itself, so
    # handing it the already one-hot `categorical_grad` would encode twice.
    cnn_y_train, cnn_y_val, cnn_y_test = self._cnn_y_split(
        self.grad_level, data_len, num_classes=self.categorical_grad.shape[1])

    # Rebuild the CNN when the window shape differs from the one it was
    # created with (i.e. a non-default `data_len`).
    input_shape = (1,) + tuple(cnn_x_train.shape[1:])
    if input_shape != self._cnn_input_shape:
      self._cnn_input_shape = input_shape
      self.cnn_model = self._get_cnn_model(input_shape)

    self.cnn_train_data_gen = K.preprocessing.sequence.TimeseriesGenerator(cnn_x_train, cnn_y_train, length=1, batch_size = 128, shuffle=True)
    self.cnn_val_data_gen = K.preprocessing.sequence.TimeseriesGenerator(cnn_x_val, cnn_y_val, length=1, batch_size = 128, shuffle=True)
    self.cnn_test_data_gen = K.preprocessing.sequence.TimeseriesGenerator(cnn_x_test, cnn_y_test, length=1, batch_size = 1, shuffle=False)

  def make_lstm_data_generator(self, cnn_output, data_len = 15):
    """Build the LSTM train/val/test generators.

    Requires ``make_proxy_data_generator`` to have been called, because the
    CNN test generator is read here.

    Args:
      cnn_output: Currently unused (see the review note in the body).
      data_len: Number of PM2.5 time steps per LSTM input window.
    """
    # ---------- pm25 data preprocessing -----------
    pm25_x_train, pm25_x_val, pm25_x_test = self.pmScaler.make_norlized_dataset(self.pm25, 0.1)

    # ---------- pm25 Data Generator -------------
    self.pm25_train_data_gen = K.preprocessing.sequence.TimeseriesGenerator(pm25_x_train, pm25_x_train, length=data_len, shuffle=True)
    self.pm25_val_data_gen = K.preprocessing.sequence.TimeseriesGenerator(pm25_x_val, pm25_x_val, length=data_len, shuffle=True)
    self.pm25_test_data_gen = K.preprocessing.sequence.TimeseriesGenerator(pm25_x_test, pm25_x_test, length=data_len, batch_size = 1, shuffle=False)

    # ---------- CNN result ------------
    # NOTE(review): the `cnn_output` argument is not used. Predictions are
    # always recomputed from the CNN *test* generator, and that one array is
    # combined with the train, validation and test PM2.5 generators alike
    # (the first two are shuffled). Confirm this pairing is intended.
    self.cnn_output = self.cnn_model.predict(self.cnn_test_data_gen)
    rated_cnn_train_output = self._make_lstm_input(self.cnn_output[data_len + 1:], self.pm25_train_data_gen)
    rated_cnn_val_output = self._make_lstm_input(self.cnn_output[data_len + 1:], self.pm25_val_data_gen)
    rated_cnn_test_output = self._make_lstm_input(self.cnn_output[data_len + 1:], self.pm25_test_data_gen)

    # ---------- LSTM data gen ---------
    self.lstm_train_data_gen = LSTMInputGenerator(pm25_x_train, pm25_x_train, data_len, rated_cnn_train_output)
    self.lstm_val_data_gen = LSTMInputGenerator(pm25_x_val, pm25_x_val, data_len, rated_cnn_val_output)
    self.lstm_test_data_gen = LSTMInputGenerator(pm25_x_test, pm25_x_test, data_len, rated_cnn_test_output)

  def cnn_model_fit(self, epochs=1):
    """Train the CNN on the generators made by ``make_proxy_data_generator``."""
    self.cnn_model.fit(x=self.cnn_train_data_gen, epochs=epochs, validation_data=(self.cnn_val_data_gen), callbacks=self.cnn_callbacks)

  def lstm_model_fit(self, epochs=1):
    """Train the LSTM on the generators made by ``make_lstm_data_generator``."""
    self.lstm_model.fit(x=self.lstm_train_data_gen, epochs=epochs, validation_data=self.lstm_val_data_gen, callbacks=self.lstm_callbacks)

  def total_model_test(self):
    """Return the LSTM predictions for the LSTM test generator."""
    return self.lstm_model.predict(self.lstm_test_data_gen)

  def _get_cnn_model(self, input_shape=None):
    """Build and compile the CNN.

    Args:
      input_shape: Shape of one sample. Defaults to the shape recorded on the
        instance (the original code read an undefined global ``x_train``).
    """
    if input_shape is None:
      input_shape = self._cnn_input_shape

    cnnModel = K.Sequential()
    cnnModel.add(K.layers.Conv2DTranspose(32, (2,2), input_shape=input_shape, activation="relu"))
    cnnModel.add(K.layers.MaxPool2D(strides=2))
    cnnModel.add(K.layers.Flatten())
    cnnModel.add(K.layers.Dropout(0.1))
    cnnModel.add(K.layers.Dense(100, activation="relu"))
    cnnModel.add(K.layers.ReLU())
    cnnModel.add(K.layers.Dense(5, activation="softmax"))
    cnnModel.summary()
    cnnModel.compile(optimizer="adam", loss="MSE")

    return cnnModel

  def _get_lstm_model(self):
    """Build and compile the LSTM.

    NOTE(review): the input length is fixed at 16, which matches the default
    ``data_len`` of 15 plus the one value appended per sample; other
    ``data_len`` values will presumably not fit this model.
    """
    lstm_model = K.Sequential()
    lstm_model.add(K.layers.LSTM(216, input_shape=(16,1)))
    lstm_model.add(K.layers.Dropout(0.3))
    lstm_model.add(K.layers.Dense(128, activation="relu"))
    lstm_model.add(K.layers.Dropout(0.3))
    lstm_model.add(K.layers.Dense(1, activation="sigmoid"))
    lstm_model.summary()
    lstm_model.compile(optimizer="adam", loss="MSE")

    return lstm_model

  @staticmethod
  def _cnn_y_split(data, data_length, rate=0.1, num_classes=None):
    """One-hot encode class labels and split them into train/val/test.

    Args:
      data: Integer class labels, one per original row.
      data_length: Window length used for the inputs; the first
        ``data_length - 1`` labels are dropped so that there is one label per
        window.
      rate: Fraction of the samples for validation and, again, for test.
      num_classes: Number of classes for the encoding; inferred from the
        labels when omitted.
    """
    data = data[data_length-1:]
    data = K.utils.to_categorical(data, num_classes=num_classes)

    # Non-negative offsets keep the split correct when arrlen is 0.
    total = len(data)
    arrlen = int(total * rate)
    val_start = max(total - 2 * arrlen, 0)
    test_start = max(total - arrlen, 0)

    train, val, test = data[:val_start], data[val_start:test_start], data[test_start:]
    return train, val, test

  def _get_grad_pm25(self):
    """Classify the relative PM2.5 change of every row into trend levels.

    Sets ``self.grad_level`` (integer level per row) and
    ``self.categorical_grad`` (the same levels, one-hot encoded).
    """
    grad_data = self.pm25.pct_change()
    # Forward-fill missing changes. The first row never has a previous value,
    # so treat it as "no change" rather than leaving a NaN to be binned.
    grad_data = grad_data.ffill().fillna(0.0)
    bins = [-9.166667e-02, -1e-15,1e-15, 1.212121e-01]
    self.grad_level = np.digitize(grad_data, bins=bins, right=False)
    # Four bin edges give len(bins) + 1 levels.
    self.categorical_grad = K.utils.to_categorical(self.grad_level, num_classes=len(bins) + 1)

  @staticmethod
  def _get_rate(data):
    """Turn one CNN output vector into a multiplier.

    The index of the largest entry is used: index 3 gives 1, and every step
    away from 3 changes the multiplier by 0.25. (Original author's note:
    "trying a change rate of up to 50%".)

    NOTE(review): for a 5-class output this ranges from 0.25 to 1.25, and the
    neutral index is 3 although a zero change is binned as level 2 in
    ``_get_grad_pm25`` - confirm this is intended.
    """
    index = data.argmax()
    return 1 + ((index-3) * 0.25)

  @staticmethod
  def _compute_with_data(data, value):
    """Scale ``value`` by the multiplier derived from CNN output ``data``."""
    rate = EntireModel._get_rate(data)
    return rate * value

  @staticmethod
  def _make_lstm_input(cnn_output, lstm_data_gen):
    """Scale the targets of ``lstm_data_gen`` by the CNN-derived multipliers.

    For every row ``i`` of ``cnn_output`` the targets of batch ``i`` of
    ``lstm_data_gen`` are multiplied by that row's rate; all results are
    concatenated into one flat array.

    Returns:
      1-D array of scaled targets, or ``np.zeros((1,))`` when ``cnn_output``
      is empty.
    """
    # Collect the pieces and join them once instead of re-copying the growing
    # result on every iteration.
    scaled = [
        EntireModel._compute_with_data(probs, lstm_data_gen[i][1].squeeze(axis=1))
        for i, probs in enumerate(cnn_output)
    ]
    if not scaled:
      return np.zeros(shape=(1,))
    return np.hstack(scaled)
        

SyntaxError: ignored