<a href="https://colab.research.google.com/github/ccwu0918/MathProgramming/blob/main/Chapter9/Chapter9-2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 第9章 了解深度學習處理時間序列資料的原理(6-10節)
接下來要學習RNN與CNN處理時間序列資料的原理。

若是在Google Colaboratory的環境下執行程式，請確定已將「硬體加速器」設定為「GPU」

In [None]:
# Colaboratory environment setup.
# Mount Google Drive, then change into the chapter folder so that the relative
# paths used by this notebook (e.g. ./requirements2.txt) resolve.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/MathProgramming/Chapter9

In [None]:
#函式庫的設定
!pip install -q -r ./requirements2.txt

In [None]:
#audio_dataset_3class檔案若還沒解壓縮，請將下列的程式碼的註解拿掉
#!unzip audio_dataset_3class.zip

## 9-6 事先整理分類聲音所需的必要資料

In [None]:
import librosa
import pandas as pd
import numpy as np
import IPython.display as ipd

# Directory prefixes of the training and evaluation audio clips.
train_data_dir = "./audio_dataset_3class/train/"
test_data_dir = "./audio_dataset_3class/test/"

# Metadata tables for both splits; the first CSV column becomes the index.
train_df = pd.read_csv("audio_dataset_3class/train.csv", index_col=0)
test_df = pd.read_csv("audio_dataset_3class/test.csv", index_col=0)



In [None]:
# Preview the first rows of the training metadata (audio file names and labels).
train_df.head()

In [None]:
# List every label value together with the number of training files that carry it.
train_df["label"].value_counts()

In [None]:
# Load the first training file whose label is "Cello".
# The metadata index supplies the file name that is appended to the directory prefix.
data, rate = librosa.load(train_data_dir+ train_df[train_df["label"] == "Cello"].index[0])

# Play the loaded clip inline.
ipd.Audio(data = data, rate = rate)

In [None]:
# Check the format of the loaded Cello clip: print its shape, then display the samples.
print(data.shape)
data

In [None]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Sampling rate requested from the loader (passed as sr= in _load_files).
sampling_rate = 8000
# Every clip is brought to a length of 3 seconds.
audio_duration = 3
# Number of samples per clip: sampling rate multiplied by the duration in seconds.
audio_length = sampling_rate * audio_duration

#根據檔案名稱載入聲音檔案
def _load_files(data_dir, filenames):
  """Load every listed audio file, resampled to the module-level ``sampling_rate``.

  Args:
    data_dir: Directory that contains the audio files (with or without a
      trailing path separator).
    filenames: Iterable of file names located inside ``data_dir``.

  Returns:
    A list holding one waveform per file name, in the order of ``filenames``.
  """
  import os  # local import: only this helper needs path joining

  waveforms = []
  for filename in filenames:
    # os.path.join builds a valid path whether or not data_dir ends with "/",
    # whereas plain string concatenation silently glues the two names together.
    file_path = os.path.join(data_dir, filename)
    # Only the samples are kept; the second return value (the rate) is not needed.
    samples, _ = librosa.load(file_path, sr=sampling_rate, res_type='kaiser_fast')
    waveforms.append(samples)

  return waveforms


def create_audio_dataset(train_df, test_df, train_data_dir, test_data_dir, label_dict):
    """Build standardized waveform tensors and one-hot labels for both data splits.

    Args:
        train_df: DataFrame whose index holds the training file names and whose
            "label" column holds the class name of each file.
        test_df: DataFrame with the same layout for the evaluation files.
        train_data_dir: Directory of the training audio files.
        test_data_dir: Directory of the evaluation audio files.
        label_dict: Mapping from class name to integer class id.

    Returns:
        Tuple ``(X_train, Y_train, X_test, Y_test)``. The X arrays are float64 with
        shape ``(number_of_files, audio_length, 1)``; the Y arrays are one-hot
        encoded with ``len(label_dict)`` columns.

    Raises:
        KeyError: If a label found in either DataFrame is missing from ``label_dict``.
    """

    def _load_fixed_length(data_dir, filenames):
        # Load the clips and force each one to exactly audio_length samples:
        # padding='pre' puts zeros in front of short clips, truncating='pre'
        # drops samples from the front of long ones.
        waveforms = _load_files(data_dir, filenames)
        fixed = pad_sequences(waveforms, dtype='float32', maxlen=audio_length,
                              padding='pre', truncating='pre', value=0.0)
        # Promote to float64 so the arrays handed back keep a float64 dtype.
        return fixed.astype(np.float64)

    def _encode_labels(labels):
        # Class names -> integer ids -> one-hot rows.
        class_ids = np.array([label_dict[label] for label in labels], dtype=int)
        return to_categorical(class_ids, num_classes=len(label_dict))

    train_waveforms = _load_fixed_length(train_data_dir, train_df.index)
    test_waveforms = _load_fixed_length(test_data_dir, test_df.index)

    # StandardScaler rescales each column to mean 0 and variance 1.
    # The statistics are estimated from the training data only, so nothing about
    # the evaluation data leaks into the preprocessing; the same transform is
    # then applied to both splits.
    scaler = StandardScaler().fit(train_waveforms)

    # Append a channel axis: (files, audio_length) -> (files, audio_length, 1).
    X_train = scaler.transform(train_waveforms)[..., np.newaxis]
    X_test = scaler.transform(test_waveforms)[..., np.newaxis]

    # One-hot encode the labels of both splits.
    Y_train = _encode_labels(train_df["label"])
    Y_test = _encode_labels(test_df["label"])

    return X_train, Y_train, X_test, Y_test


# Integer class id assigned to each label name.
audio_label_dict = {"Cello": 0,"Clarinet":1, "Applause":2}
# Build the model inputs and one-hot targets for the training and evaluation splits.
X_train, Y_train, X_test, Y_test = create_audio_dataset(train_df, test_df, train_data_dir, test_data_dir, audio_label_dict)

## 9-7 試著利用LSTM分類聲音

In [None]:
from tensorflow.keras.layers import Dense, LSTM, Dropout,Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

def create_lstm_model():
  """Build and compile the two-layer LSTM classifier.

  The network takes inputs of shape ``(audio_length, 1)`` and ends in a softmax
  layer with one unit per entry of ``audio_label_dict``.

  Returns:
    The compiled Sequential model.
  """
  layer_stack = [
      # First LSTM returns the full sequence so the second LSTM can consume it.
      LSTM(64, return_sequences=True, dropout=0.3 ,input_shape=(audio_length, 1)),
      # Second LSTM returns only its final output.
      LSTM(64, return_sequences=False, dropout=0.3),
      Dense(units=len(audio_label_dict), activation="softmax"),
  ]
  network = Sequential(layer_stack)
  network.compile(loss="categorical_crossentropy", optimizer=Adam(0.001), metrics=["acc"])
  return network

# Instantiate the LSTM classifier.
model_lstm = create_lstm_model()
# Show the model architecture.
model_lstm.summary()

In [None]:
# Start training. validation_split=0.1 sets aside 10% of the training data for
# validation; the per-epoch metrics are kept in `history` for plotting.
history = model_lstm.fit(X_train, Y_train, batch_size=16, epochs=40, validation_split=0.1, verbose=1)

In [None]:
#由於學習相當耗時，建議大家拿掉下列程式碼的註解符號，先將這個模型的權重儲存起來。
#只要儲存了權重，執行第8節之後的程式時就可以直接載入這個模型的權重，不需要重新學習

#儲存模型的權重
#model_lstm.save_weights('./saved_models/model_lstm_weights')

In [None]:
#載入儲存的模型
#model_lstm = create_lstm_model()
#model_lstm.load_weights('./saved_models/model_lstm_weights')

## 9-8 試著評估LSTM的分類結果

In [None]:
# Run inference on the evaluation set.
predictions = model_lstm.predict(X_test, verbose=1)
# Index of the highest-scoring class for every row, computed in one vectorized call.
pred_labels = np.argmax(predictions, axis=1)
# True class ids, looked up from the label names.
actual_labels = np.array([audio_label_dict[lab] for lab in test_df["label"]])

# Accuracy: fraction of clips whose predicted class equals the true class.
accuracy = np.mean(actual_labels == pred_labels)
accuracy

In [None]:
import matplotlib.pyplot as plt

# Plot the training history: loss in the upper panel, accuracy in the lower one.
fig, ax = plt.subplots(2,1)

# (training key, validation key, training legend, validation legend) per panel.
curve_specs = (
    ("loss", "val_loss", "Training Loss", "Validation Loss"),
    ("acc", "val_acc", "Training Accuracy", "Validation Accuracy"),
)
for panel, (train_key, val_key, train_label, val_label) in zip(ax, curve_specs):
    panel.plot(history.history[train_key], color="b", label=train_label)
    panel.plot(history.history[val_key], color="g", label=val_label)
    panel.legend()

plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Class names ordered by their integer id, derived from audio_label_dict so the
# tick labels cannot drift out of sync with the label encoding.
audio_label_list = sorted(audio_label_dict, key=audio_label_dict.get)
class_ids = [audio_label_dict[name] for name in audio_label_list]

# Build the confusion matrix. labels= pins the row/column order and keeps the
# matrix square over all classes even if one never occurs in the evaluation run.
cf_matrix = confusion_matrix(actual_labels, pred_labels, labels=class_ids)

plt.figure(figsize=(13,13))
c = sns.heatmap(cf_matrix, annot=True, fmt="d")

# confusion_matrix puts the true classes on rows (y axis) and the predicted
# classes on columns (x axis).
c.set(xticklabels=audio_label_list, yticklabels=audio_label_list,
      xlabel="Predicted label", ylabel="Actual label")
# plt.show() renders the figure without echoing a return value in the cell output.
plt.show()

## 9-9 試著利用CNN分類音樂

In [None]:
from tensorflow.keras.layers import Activation, Conv1D, MaxPooling1D, GlobalMaxPool1D,Dropout

def create_cnn_model():
  """Build and compile the 1D convolutional classifier.

  The network takes inputs of shape ``(audio_length, 1)`` and ends in a softmax
  layer with one unit per entry of ``audio_label_dict``.

  Returns:
    The compiled Sequential model.
  """
  layer_stack = [
      Conv1D(filters=128, kernel_size=9, padding='valid', input_shape=(audio_length, 1), activation='relu'),
      MaxPooling1D(pool_size=16),
      Dropout(rate=0.2),
      Conv1D(filters=64, kernel_size=3, padding='valid', activation='relu'),
      # Collapse the time axis by taking the maximum of each filter.
      GlobalMaxPool1D(),
      Dropout(rate=0.2),
      Dense(len(audio_label_dict), activation="softmax"),
  ]
  network = Sequential(layer_stack)
  network.compile(optimizer=Adam(0.0001), loss="categorical_crossentropy", metrics=['acc'])
  return network

# Instantiate the CNN classifier.
model_cnn = create_cnn_model()
# Show the model architecture.
model_cnn.summary()

In [None]:
# Train the CNN. validation_split=0.1 sets aside 10% of the training data for validation.
# NOTE: this rebinds `history`, replacing any history object stored under that name earlier.
history = model_cnn.fit(X_train, Y_train, batch_size=16, epochs=50, validation_split=0.1, verbose=1)

## 9-10 試著評估CNN的分類結果

In [None]:
# Run inference on the evaluation set.
predictions = model_cnn.predict(X_test, verbose=1)
# Index of the highest-scoring class for every row, computed in one vectorized call.
pred_labels = np.argmax(predictions, axis=1)
# True class ids, looked up from the label names.
actual_labels = np.array([audio_label_dict[lab] for lab in test_df["label"]])

# Accuracy: fraction of clips whose predicted class equals the true class.
accuracy = np.mean(actual_labels == pred_labels)
accuracy

In [None]:
# Plot the training history: loss in the upper panel, accuracy in the lower one.
fig, ax = plt.subplots(2,1)

# (training key, validation key, training legend, validation legend) per panel.
curve_specs = (
    ("loss", "val_loss", "Training Loss", "Validation Loss"),
    ("acc", "val_acc", "Training Accuracy", "Validation Accuracy"),
)
for panel, (train_key, val_key, train_label, val_label) in zip(ax, curve_specs):
    panel.plot(history.history[train_key], color="b", label=train_label)
    panel.plot(history.history[val_key], color="g", label=val_label)
    panel.legend()

plt.show()

In [None]:
# Class names ordered by their integer id, derived from audio_label_dict so the
# tick labels cannot drift out of sync with the label encoding.
audio_label_list = sorted(audio_label_dict, key=audio_label_dict.get)
class_ids = [audio_label_dict[name] for name in audio_label_list]

# Build the confusion matrix. labels= pins the row/column order and keeps the
# matrix square over all classes even if one never occurs in the evaluation run.
cf_matrix = confusion_matrix(actual_labels, pred_labels, labels=class_ids)

plt.figure(figsize=(13,13))
c = sns.heatmap(cf_matrix, annot=True, fmt="d")

# confusion_matrix puts the true classes on rows (y axis) and the predicted
# classes on columns (x axis).
c.set(xticklabels=audio_label_list, yticklabels=audio_label_list,
      xlabel="Predicted label", ylabel="Actual label")
# plt.show() renders the figure without echoing a return value in the cell output.
plt.show()