<a href="https://colab.research.google.com/github/ccwu0918/MathProgramming/blob/main/Chapter10/Chapter10-3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 第10章 圖片、語言處理的深度學習全貌（8～10節）
從這節開始要學習深度學習的自然語言處理。

若是在Google Colaboratory的環境下執行程式，請確定已將「硬體加速器」設定為「GPU」。

In [None]:
#Colaboratory環境的設定
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/MathProgramming/Chapter10

In [None]:
#函式庫的設定
!pip install -q -r ./requirements3.txt

## 10-9 試著利用BERT分類文本

In [None]:
import pandas as pd
# Path of the CSV dataset; defined once and reused so it cannot drift.
data_file = './spam.csv'
df = pd.read_csv(data_file)

# Print how many rows each label has, then display the frame as the cell output.
print(df["label"].value_counts())
df

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import numpy as np
# Keep tf.function code running in graph mode (eager execution of functions disabled).
tf.config.run_functions_eagerly(False)

# Load the module that performs the text preprocessing (tokenization etc.).
# The preprocessing model must share its vocabulary and casing with the encoder.
# The encoder used in this notebook is an *uncased* small BERT, so the uncased
# preprocessor is loaded here (a cased one would produce token ids that do not
# line up with the encoder's pretrained embeddings).
bert_preprocess = hub.load("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")

In [None]:
# Smoke-test the preprocessing module on one sentence; the result is shown as the cell output.
sample_texts = ["Hello World!"]
test_preprocessed = bert_preprocess(sample_texts)
test_preprocessed

In [None]:
# Use the first 70% of the rows as training data and the remaining 30% as
# evaluation data. The split is positional (no shuffling).
split_at = int(len(df) * 0.7)
train_df = df[:split_at]
test_df = df[split_at:]

# Turn the raw strings into BERT inputs with the preprocessing module.
X_train = bert_preprocess(train_df["text"])
X_test = bert_preprocess(test_df["text"])

# One-hot encode the labels (spam / ham).
# Both splits are encoded against the same sorted label list, so the number and
# order of the columns always agree even if a label is absent from one split.
# Sorting reproduces the column order pd.get_dummies uses for string labels.
label_names = sorted(df["label"].dropna().unique())
Y_train = pd.get_dummies(
    pd.Categorical(train_df["label"], categories=label_names)
).values.astype(np.float32)
Y_test = pd.get_dummies(
    pd.Categorical(test_df["label"], categories=label_names)
).values.astype(np.float32)

In [None]:
#建立模型
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout

# The encoder takes three int32 inputs of unspecified length:
# input_word_ids, input_mask and input_type_ids.
inputs = {
    feature_name: Input(shape=(None,), dtype=tf.int32)
    for feature_name in ("input_word_ids", "input_mask", "input_type_ids")
}

# Load the BERT model from TensorFlow Hub; trainable=True so it is fine-tuned too.
bert_encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1",
    trainable=True,
    name='bert_encoder',
)
encoder_outputs = bert_encoder(inputs)
pooled = Dropout(0.1)(encoder_outputs["pooled_output"])

# A final fully connected layer produces the two outputs (spam and ham).
outputs = Dense(2, activation="softmax", name='classifier')(pooled)
model = Model(inputs, outputs)

In [None]:
from official.nlp import optimization
EPOCHS = 3
# These two values mirror the model.fit() call in the training cell:
# Keras' default batch size is 32 and fit() holds out validation_split=0.1.
# Keep them in sync with that call if either is changed.
BATCH_SIZE = 32
VALIDATION_SPLIT = 0.1

# The learning-rate schedule advances once per optimizer step (one batch),
# not once per sample, so its length must be counted in batches.
num_fit_samples = int(len(train_df.index) * (1 - VALIDATION_SPLIT))
steps_per_epoch = (num_fit_samples + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division
num_train_steps = steps_per_epoch * EPOCHS
# Warm up the learning rate over the first 10% of the training steps.
num_warmup_steps = int(0.1 * num_train_steps)

# The optimizer used here is AdamW.
optimizer = optimization.create_optimizer(init_lr=0.00003,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=['accuracy'])

# Print a summary of the model.
model.summary()

In [None]:
# Start training; 10% of the training data is held out for validation.
hist = model.fit(
    x=X_train,
    y=Y_train,
    epochs=EPOCHS,
    validation_split=0.1,
)

In [None]:
# Training takes a long time, so the code below can be used to save and reload the model weights directly.
# Once the weights are saved, the code from section 10 onward can use them as-is and the model does not need to be retrained.

# Save the trained weights (uncomment to run)
#model.save_weights('./saved_models/model_bert_weights')

In [None]:
# Load the saved weights (uncomment to run)
#model.load_weights('./saved_models/model_bert_weights')

## 10-10 試著評估以BERT分類文本的結果



In [None]:
# Run the classifier on the evaluation inputs.
pred = model.predict(x=X_test)

In [None]:
# Class index with the highest score in each row, for predictions and for the one-hot targets.
pred_labels = np.argmax(pred, axis=1)
actual_labels = np.argmax(Y_test, axis=1)

# Accuracy: fraction of samples whose predicted class equals the actual class.
is_correct = actual_labels == pred_labels
np.mean(is_correct)

In [None]:
#顯示混淆矩陣

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Class name -> class index. Assumed to match the one-hot column order
# (pd.get_dummies orders string labels alphabetically: ham, spam).
label_dict = {"ham": 0, "spam": 1}
class_names = list(label_dict)
class_indices = list(label_dict.values())

# Passing labels explicitly keeps the matrix 2x2 (and aligned with the tick
# labels) even if one class never occurs in actual_labels or pred_labels.
cf_matrix = confusion_matrix(actual_labels, pred_labels, labels=class_indices)

c = sns.heatmap(cf_matrix, annot=True, fmt="d")

# Rows of the matrix are the actual classes, columns are the predicted classes.
c.set(xticklabels=class_names, yticklabels=class_names,
      xlabel="Predicted label", ylabel="Actual label")
# Render the figure.
plt.show()

In [None]:
# Show the predicted class index and the text of one evaluation sample.
sample_idx = 0
print("預測: ", pred_labels[sample_idx])
print(test_df.iloc[sample_idx]["text"])

In [None]:
# Show the predicted class index and the text of another evaluation sample.
sample_idx = 3
print("預測: ", pred_labels[sample_idx])
print(test_df["text"].iloc[sample_idx])