幹部內訓 2021.01.16.
# **文字探勘 Session 3** 

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/YL-Cheng/Portfolio/blob/main/DAC_Internal_Training/jupyternb/TextMining_3.ipynb)

-----

In [None]:
# Colab-only setup: mount Google Drive, then switch into the project folder so
# the relative paths used further down (./ptt_csv, ./jupyternb) resolve.
import os
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)
os.chdir('/content/drive/MyDrive/Portfolio/DAC_Internal_Training')

Mounted at /content/drive


In [None]:
!pip install transformers
import os
import time
import random
import datetime

import re
import jieba
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.2MB 18.5MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 47.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 34.9MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=82c9333320

## **定義函數**

In [None]:
# Detect the execution environment (CPU or GPU)
def get_device():
    """Return 'cuda' when PyTorch reports an available CUDA device, else 'cpu'."""
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'

device = get_device()

In [None]:
# Fix every random seed so that results are reproducible
def set_seeds(myseed=807):
    """Seed the Python, NumPy and PyTorch generators and pin cuDNN behaviour.

    Args:
        myseed: Integer seed applied to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(myseed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(myseed)

    # Disable cuDNN autotuning and request deterministic algorithms.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_seeds()

In [None]:
# Build the Dataset and DataLoader
def generate_dataset(inputs, labels, mode, masks=None):
    """Build a ``TensorDataset`` for the requested split.

    For ``'Train'`` and ``'Validation'`` the given data is split 90/10 with a
    fixed ``random_state`` and the matching part is returned; for ``'Test'``
    the data is used as given.

    Args:
        inputs: Array-like of samples, convertible with ``torch.tensor``.
        labels: Array-like of labels aligned with ``inputs``.
        mode: One of ``'Train'``, ``'Validation'`` or ``'Test'``.
        masks: Optional array-like aligned with ``inputs``. When provided the
            dataset yields ``(inputs, masks, labels)``; otherwise it yields
            ``(inputs, labels)``.

    Returns:
        A ``TensorDataset`` over the selected split.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    if mode not in ('Train', 'Validation', 'Test'):
        raise ValueError(
            f"mode must be 'Train', 'Validation' or 'Test', got {mode!r}")

    # `is not None` (rather than truthiness) so NumPy arrays / tensors work too.
    if masks is not None:
        columns = [inputs, masks, labels]
    else:
        columns = [inputs, labels]

    if mode == 'Test':
        print(f'Testing Data Length: {len(inputs)}')
    else:
        # One call splits every column with the same shuffle, so inputs, masks
        # and labels cannot drift out of alignment. The result is laid out as
        # [a_train, a_val, b_train, b_val, ...].
        parts = train_test_split(*columns, random_state=807, test_size=0.1)
        if mode == 'Train':
            columns = parts[0::2]
            print(f'Training Data Length: {len(columns[0])}')
        else:
            columns = parts[1::2]
            print(f'Validation Data Length: {len(columns[0])}')

    return TensorDataset(*(torch.tensor(column) for column in columns))

def generate_dataloader(inputs, labels, batch_size, mode, masks=None):
    """Build a ``DataLoader`` for the requested split.

    The dataset comes from ``generate_dataset``. Training batches are drawn
    with a ``RandomSampler``; validation and test batches keep their order via
    a ``SequentialSampler``.

    Args:
        inputs: Samples, forwarded to ``generate_dataset``.
        labels: Labels aligned with ``inputs``.
        batch_size: Number of samples per batch.
        mode: One of ``'Train'``, ``'Validation'`` or ``'Test'``.
        masks: Optional masks aligned with ``inputs``; forwarded unchanged.

    Returns:
        A ``DataLoader`` over the selected split.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    # Fail early: an unknown mode used to print "Constructed" and return None.
    if mode not in ('Train', 'Validation', 'Test'):
        raise ValueError(
            f"mode must be 'Train', 'Validation' or 'Test', got {mode!r}")

    # masks=None is simply forwarded, so one code path covers both cases.
    dataset = generate_dataset(inputs, labels, mode, masks)

    print(f'{mode} DataLoader Constructed.')
    sampler_cls = RandomSampler if mode == 'Train' else SequentialSampler
    return DataLoader(dataset,
                      sampler=sampler_cls(dataset),
                      batch_size=batch_size)

In [None]:
# Helper that computes classification accuracy
def flat_accuracy(preds, labels):
    """Return the fraction of samples whose arg-max prediction equals the label.

    Args:
        preds: 2-D NumPy array of per-class scores, one row per sample.
        labels: NumPy array of class ids, one per sample (flattened before use).
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == expected_classes)
    return n_correct / expected_classes.shape[0]

In [None]:
# Helper for monitoring run time
def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

## **匯入PTT資料及標籤**

In [None]:
# PTT boards used as the four target classes.
BOARD_LIST = ["BoyGirl", "HatePolitics", "NBA", "Stock"]

# Load each board's posts, drop the unnamed leading column (presumably a saved
# index) and tag every row with the board it came from.
BoyGirl = pd.read_csv("./ptt_csv/Boy_Girl_200.csv").drop(columns="Unnamed: 0")
BoyGirl["source"] = "BoyGirl"
HatePolitics = pd.read_csv("./ptt_csv/HatePolitics_200.csv").drop(columns="Unnamed: 0")
HatePolitics["source"] = "HatePolitics"
NBA = pd.read_csv("./ptt_csv/NBA_200.csv").drop(columns="Unnamed: 0")
NBA["source"] = "NBA"
Stock = pd.read_csv("./ptt_csv/Stock_200.csv").drop(columns="Unnamed: 0")
Stock["source"] = "Stock"

# Keep at most the first 1000 posts per board and stack them under a fresh
# 0..n-1 index. pd.concat replaces DataFrame.append, which was removed in
# pandas 2.0.
ptt = pd.concat(
    [BoyGirl[0:1000], HatePolitics[0:1000], NBA[0:1000], Stock[0:1000]],
    ignore_index=True,
).rename(columns={"標題": "topic", "時間": "time", "內容": "content"})

# Drop rows with any missing field, then drop announcement posts ('公告').
# .copy() makes the filtered frame independent so the assignment below does
# not write into a slice of the previous frame.
ptt = ptt.dropna(how="any")
ptt = ptt[~ptt['topic'].str.contains('公告')].copy()

# Remove template field names from the post body. The pattern is a regex
# alternation, so regex=True must be explicit: pandas >= 2.0 treats the
# pattern as a literal string by default and would silently replace nothing.
ptt['content'] = ptt['content'].str.replace('原文標題|完整新聞標題|原文連結|新聞網址|詳細報導請見|新聞來源|編輯|報導|發布時間|發佈時間|最後更新時間|原文內容|完整新聞內容|心得/評論|附註、心得、想法', '', regex=True)

In [None]:
topics = list(ptt['topic'])
boards = list(ptt['source'])

# Punctuation, bracket characters and newlines that are stripped from titles.
symbols = r'，|,|。|\.|！|\!|？|\?|、|：|\:|⋯⋯|""|\(|\)|（|）|=|\+|「|」|『|』|《|》|〈|〉|【|】|〖|〗|［|］|<|>|/|\\|\n'
for i, t in enumerate(topics):
    # 1) drop reply / forward markers
    t = re.sub('Re:|R:|Fw:', '', t).strip()
    # 2) drop punctuation
    t = re.sub(symbols, '', t).strip()
    # 3) drop a leading "[tag]". Raw string: '\[' and '\w' are invalid escape
    #    sequences in a normal string literal and trigger a warning on newer
    #    Python versions; the compiled pattern is unchanged.
    t = re.sub(r'^\[\w+\]', '', t).strip()
    topics[i] = t

In [None]:
# Remove titles that are empty after cleaning, together with their labels
indices = [pos for pos, title in enumerate(topics) if len(title) == 0]
# Delete from the back so that the remaining positions stay valid.
for pos in reversed(indices):
    del topics[pos]
    del boards[pos]

## **建立 LSTM 模型區分文章類別**

### **資料前處理**

#### 將標題文字轉為向量 (input data)

In [None]:
jieba.set_dictionary('./jupyternb/dict.txt.big')

# Segment every title into words.
# Each entry of topics_cut is (space-joined words, word list, word count).
topics_cut = []
for topic in topics:
    topic = re.sub(symbols, " ", topic)
    topic_cutlist = [w for w in jieba.lcut(topic) if len(w) >= 1]
    # Raw string: "\s" is an invalid escape sequence in a normal literal.
    topic_cut = re.sub(r"\s+", " ", " ".join(topic_cutlist).strip())
    # Split once and reuse the result for both the word list and its length.
    words = topic_cut.split()
    topics_cut.append((topic_cut, words, len(words)))

# Longest title measured in words; used below as the padded sequence length.
MAX_LEN = max([topic[2] for topic in topics_cut])

Building prefix dict from /content/drive/My Drive/Portfolio/DAC_Internal_Training/jupyternb/dict.txt.big ...
Dumping model to file cache /tmp/jieba.ud9289a7749bfc77fb0185e9e38d688c7.cache
Loading model cost 1.213 seconds.
Prefix dict has been built successfully.


In [None]:
# Build the mapping dictionary between words and ids
def prepare_sequence(seq, to_ix):
    """Look up every item of ``seq`` in ``to_ix`` and return the ids as a LongTensor."""
    idxs = []
    for token in seq:
        idxs.append(to_ix[token])
    return torch.tensor(idxs, dtype=torch.long)

# Vocabulary: word -> integer id, assigned in order of first appearance.
# Id 0 is reserved for padding because the id sequences built from this
# vocabulary are right-padded with 0. Starting from an empty dict gave the
# first real word id 0 as well, making it indistinguishable from padding.
# '<PAD>' cannot collide with a real token: '<' and '>' are stripped from the
# titles by the `symbols` pattern.
word_to_idx = {'<PAD>': 0}
for topic in topics_cut:
    for word in topic[1]:
        if word not in word_to_idx:
            word_to_idx[word] = len(word_to_idx)

In [None]:
# Turn every segmented title into a fixed-length id list, right-padded with 0
input_ids = []
for _, words, n_words in topics_cut:
    padding = [0] * (MAX_LEN - n_words)
    input_ids.append([word_to_idx[w] for w in words] + padding)

print('Original: ', topics_cut[0][0])
print('Token IDs:', input_ids[0])

Original:  這樣子 還要 繼續 嗎
Token IDs: [0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


#### 建立數值化的資料標籤 (labels)

In [None]:
# Numeric class id for each board name, then one label per title
board_id = dict(BoyGirl=0, HatePolitics=1, NBA=2, Stock=3)
labels = list(map(board_id.__getitem__, boards))

### **拆分 Training Set 與 Testing Set**

In [None]:
# Hold out 10% of the samples as the test set (fixed seed for repeatability)
(tr_inputs, ts_inputs,
 tr_labels, ts_labels) = train_test_split(input_ids, labels,
                                          test_size=0.1, random_state=807)

### **建立 DataSet 與 DataLoader**

In [None]:
batch_size = 32
# Both loaders receive the training portion; generate_dataloader carves the
# validation part out of it internally.
train_dataloader = generate_dataloader(inputs=tr_inputs, labels=tr_labels,
                                       batch_size=batch_size, mode='Train')
validation_dataloader = generate_dataloader(inputs=tr_inputs, labels=tr_labels,
                                            batch_size=batch_size, mode='Validation')

Training Data Length: 3184
Train DataLoader Constructed.
Validation Data Length: 354
Validation DataLoader Constructed.


## **建立 LSTM 模型**

In [None]:
class LSTMClassifier(nn.Module):
    """Embedding -> multi-layer LSTM -> linear classifier over token-id sequences.

    Args:
        n_vocab: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each word embedding.
        hidden_dim: Size of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout passed to ``nn.LSTM``.
        n_classes: Number of output classes (defaults to 4, the previously
            hard-coded value).
    """

    def __init__(self, n_vocab, embedding_dim, hidden_dim, n_layers, dropout=0.2, n_classes=4):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.n_classes = n_classes

        self.embeddings = nn.Embedding(n_vocab, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, n_classes)

        # Not read anywhere in this class; kept so existing attribute access
        # keeps working.
        self.batch_size = None
        self.hidden = None

    def forward(self, x):
        """Return class logits of shape (batch, n_classes).

        Args:
            x: Integer tensor of token ids with shape (batch, seq_len).
        """
        # nn.LSTM is built with the default batch_first=False, so transpose to
        # (seq_len, batch) before the embedding lookup.
        embeddings = self.embeddings(x.t())
        # With no initial state given, nn.LSTM starts from zero states that
        # match the input. This replaces zero tensors that were moved to the
        # module-global `device`, which broke whenever the model lived on a
        # different device or `device` was not defined.
        _, (hn, _) = self.lstm(embeddings)
        # Classify from the last layer's final hidden state.
        return self.fc(hn[-1, :, :])

In [None]:
# --- Model size ---
embedding_dim = 200
hidden_dim = 256
n_layers = 3

# --- Optimisation settings ---
lr = 0.0005
epochs = 30
# Bookkeeping names for early stopping; they do not appear in the training
# loop of this notebook.
best_acc = 0
patience = 100
trials = 0

modelLSTM = LSTMClassifier(len(word_to_idx), embedding_dim, hidden_dim, n_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.RMSprop(modelLSTM.parameters(), lr=lr)

- 觀察 LSTM classifier 的模型結構

In [None]:
# Display the module itself. `modelLSTM.parameters` without a call only showed
# the bound-method repr wrapped around this same structure summary.
modelLSTM

<bound method Module.parameters of LSTMClassifier(
  (embeddings): Embedding(7849, 200)
  (lstm): LSTM(200, 256, num_layers=3, dropout=0.2)
  (fc): Linear(in_features=256, out_features=4, bias=True)
)>

### **訓練模型**

In [None]:
# Mean training loss of every epoch (plotted afterwards).
loss_values = []

for epoch_i in range(epochs):

    # ---- Training ----
    print(f'[ Epoch {epoch_i+1} / {epochs} ]\n')
    t0 = time.time()

    modelLSTM.train()
    train_loss = 0
    # Accuracy is accumulated as correct / seen sample counts. Averaging the
    # per-batch accuracies would give a smaller final batch the same weight as
    # a full one.
    train_correct, train_seen = 0, 0
    for step, batch in enumerate(train_dataloader):
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_labels = batch[1].to(device)

        optimizer.zero_grad()
        outputs = modelLSTM(b_input_ids)
        loss = criterion(outputs, b_labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_correct += (outputs.detach().argmax(dim=1) == b_labels).sum().item()
        train_seen += b_labels.size(0)

    avg_train_loss = train_loss / len(train_dataloader)
    train_accuracy = train_correct / train_seen

    loss_values.append(avg_train_loss)
    print("  Average training loss: {0:.2f}, Accuracy: {1:.2f}".format(avg_train_loss, train_accuracy))
    print("  Training epcoh took: {:}\n".format(format_time(time.time() - t0)))

    # ---- Validation ----
    t0 = time.time()

    modelLSTM.eval()
    eval_correct, eval_seen = 0, 0
    with torch.no_grad():
        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_labels = batch[1].to(device)

            outputs = modelLSTM(b_input_ids)

            eval_correct += (outputs.argmax(dim=1) == b_labels).sum().item()
            eval_seen += b_labels.size(0)

    eval_accuracy = eval_correct / eval_seen
    print("  Accuracy: {0:.2f}".format(eval_accuracy))
    print("  Validation took: {:}\n".format(format_time(time.time() - t0)))

[ Epoch 1 / 30 ]

  Batch    40  of    100.    Elapsed: 0:00:06.
  Batch    80  of    100.    Elapsed: 0:00:12.
  Average training loss: 1.26, Accuracy: 0.36
  Training epcoh took: 0:00:15

  Accuracy: 0.43
  Validation took: 0:00:00

[ Epoch 2 / 30 ]

  Batch    40  of    100.    Elapsed: 0:00:06.
  Batch    80  of    100.    Elapsed: 0:00:12.
  Average training loss: 0.88, Accuracy: 0.61
  Training epcoh took: 0:00:15

  Accuracy: 0.67
  Validation took: 0:00:00

[ Epoch 3 / 30 ]

  Batch    40  of    100.    Elapsed: 0:00:06.
  Batch    80  of    100.    Elapsed: 0:00:12.
  Average training loss: 0.57, Accuracy: 0.77
  Training epcoh took: 0:00:15

  Accuracy: 0.70
  Validation took: 0:00:00

[ Epoch 4 / 30 ]

  Batch    40  of    100.    Elapsed: 0:00:06.
  Batch    80  of    100.    Elapsed: 0:00:12.
  Average training loss: 0.34, Accuracy: 0.88
  Training epcoh took: 0:00:15

  Accuracy: 0.71
  Validation took: 0:00:00

[ Epoch 5 / 30 ]

  Batch    40  of    100.    Elapsed: 0:00

### 模型loss隨epoch數的變化

In [None]:
# Line chart of the mean training loss recorded for each epoch
f = pd.DataFrame(loss_values).set_axis(['Loss'], axis=1)
fig = px.line(f, x=f.index, y=f.Loss)
layout_options = dict(title='Training loss of the Model',
                      xaxis_title='Epoch',
                      yaxis_title='Loss')
fig.update_layout(**layout_options)
fig.show()

### **測試模型表現**

In [None]:
# Loader over the held-out test portion (kept in order, no further split)
test_dataloader = generate_dataloader(inputs=ts_inputs, labels=ts_labels,
                                      batch_size=batch_size, mode='Test')

Testing Data Length: 394
Test DataLoader Constructed.


In [None]:
modelLSTM.eval()
# Per-batch raw outputs and labels, kept as NumPy arrays for later inspection.
predictions , true_labels = [], []

# Accuracy is accumulated as correct / seen sample counts. Averaging the
# per-batch accuracies would give the smaller final batch the same weight as
# a full one.
test_correct, test_seen = 0, 0
with torch.no_grad():
    for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_labels = batch[1].to(device)

        outputs = modelLSTM(b_input_ids)

        test_correct += (outputs.argmax(dim=1) == b_labels).sum().item()
        test_seen += b_labels.size(0)

        predictions.append(outputs.detach().cpu().numpy())
        true_labels.append(b_labels.to('cpu').numpy())

test_accuracy = test_correct / test_seen
print("  Accuracy: {0:.2f}".format(test_accuracy))

  Accuracy: 0.81


-----

## **使用預訓練 BERT 模型區分文章類別**

### **資料前處理**

#### Tokenize

In [None]:
# A pretrained BERT checkpoint comes with its own tokenizer, which can be
# called directly to turn text into token ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# Longest title in characters; used as the padded sequence length.
# NOTE: this rebinds MAX_LEN, which the LSTM section set to the longest title
# in words, so re-running the LSTM preprocessing cells after this one would
# pick up this value instead.
MAX_LEN = max([len(topic) for topic in topics])

# Encode all titles in one call. padding='max_length' and truncation=True are
# the current spellings of the deprecated pad_to_max_length / implicit
# truncation, and the tokenizer returns the attention mask itself
# (1 = real token, 0 = padding).
# NOTE(review): add_special_tokens=False is kept as in the original, so no
# [CLS]/[SEP] markers are added. Sequence-classification BERT heads normally
# expect them; consider enabling them (with MAX_LEN + 2) and re-validating.
encoded_topics = tokenizer(topics,
                           add_special_tokens=False,
                           padding='max_length',
                           truncation=True,
                           max_length=MAX_LEN,
                           return_attention_mask=True)
input_ids = encoded_topics['input_ids']
attention_masks = encoded_topics['attention_mask']

# Print sentence 0, now as a list of IDs.
print('Original: ', topics[0])
print('Token IDs:', input_ids[0])
print('Attention Masks:', attention_masks[0])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=268943.0, style=ProgressStyle(descripti…

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.






The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).



Original:  這樣子還要繼續嗎
Token IDs: [6857, 3564, 2094, 6917, 6206, 5262, 5265, 1621, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Attention Masks: [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


#### 建立數值化的資料標籤 (labels)

In [None]:
# Numeric class id for each board name, then one label per title
board_id = dict(BoyGirl=0, HatePolitics=1, NBA=2, Stock=3)
labels = list(map(board_id.__getitem__, boards))

### **拆分 Training Set 與 Testing Set**

In [None]:
# Hold out 10% as the test set. Token ids, attention masks and labels are
# split in a single call so they share one shuffle by construction, instead of
# staying aligned only because two separate calls used the same seed.
(tr_inputs, ts_inputs,
 tr_masks, ts_masks,
 tr_labels, ts_labels) = train_test_split(input_ids, attention_masks, labels,
                                          random_state=807, test_size=0.1)

### **建立 DataSet 與 DataLoader**

In [None]:
batch_size = 32 ## 16 or 32
# Both loaders receive the training portion; generate_dataloader carves the
# validation part out of it internally.
train_dataloader = generate_dataloader(inputs=tr_inputs, labels=tr_labels,
                                       batch_size=batch_size, mode='Train',
                                       masks=tr_masks)
validation_dataloader = generate_dataloader(inputs=tr_inputs, labels=tr_labels,
                                            batch_size=batch_size, mode='Validation',
                                            masks=tr_masks)

Training Data Length: 3184
Train DataLoader Constructed.
Validation Data Length: 354
Validation DataLoader Constructed.


## **建立 BERT 模型**

In [None]:
# Pretrained Chinese BERT with a 4-way classification head.
modelBERT = BertForSequenceClassification.from_pretrained('bert-base-chinese',
                                                      num_labels = 4,
                                                      output_attentions = False,
                                                      output_hidden_states = False)
# Put the model on the same device the batches are moved to. Without this the
# model stays on the CPU and the training loop fails on a GPU runtime, where
# the inputs are sent to 'cuda'.
modelBERT = modelBERT.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=624.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411577189.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Length of the fine-tuning run: one optimizer step per training batch.
epochs = 4 ## 2, 3, or 4
total_steps = epochs * len(train_dataloader)

optimizer = AdamW(modelBERT.parameters(), lr=2e-5)

# Learning-rate schedule spanning every training step, with no warm-up steps
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_training_steps=total_steps,
                                            num_warmup_steps=0)

- 觀察 BERT classifier 的模型結構 (遠比 LSTM 模型複雜)

In [None]:
# Display the module itself. `modelBERT.parameters` without a call only showed
# the bound-method repr wrapped around this same structure summary.
modelBERT

<bound method Module.parameters of BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

### **訓練模型**

In [None]:
# Mean training loss of every epoch (plotted afterwards).
loss_values = []

for epoch_i in range(epochs):

    # ---- Training ----
    print(f'[ Epoch {epoch_i+1} / {epochs} ]\n')
    t0 = time.time()

    modelBERT.train()
    train_loss = 0
    # Accuracy is accumulated as correct / seen sample counts. Averaging the
    # per-batch accuracies would give a smaller final batch the same weight as
    # a full one.
    train_correct, train_seen = 0, 0
    for step, batch in enumerate(train_dataloader):
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        modelBERT.zero_grad()
        outputs = modelBERT(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
        # Because labels are passed, the first two outputs are (loss, logits).
        loss, logits = outputs[0], outputs[1]
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(modelBERT.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        train_loss += loss.item()
        train_correct += (logits.detach().argmax(dim=1) == b_labels).sum().item()
        train_seen += b_labels.size(0)

    avg_train_loss = train_loss / len(train_dataloader)
    train_accuracy = train_correct / train_seen

    loss_values.append(avg_train_loss)
    print("  Average training loss: {0:.2f}, Accuracy: {1:.2f}".format(avg_train_loss, train_accuracy))
    print("  Training epcoh took: {:}\n".format(format_time(time.time() - t0)))

    # ---- Validation ----
    t0 = time.time()

    modelBERT.eval()
    eval_correct, eval_seen = 0, 0
    with torch.no_grad():
        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            outputs = modelBERT(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask)
            # Without labels, the first output holds the logits.
            logits = outputs[0]

            eval_correct += (logits.argmax(dim=1) == b_labels).sum().item()
            eval_seen += b_labels.size(0)

    eval_accuracy = eval_correct / eval_seen
    print("  Accuracy: {0:.2f}".format(eval_accuracy))
    print("  Validation took: {:}\n".format(format_time(time.time() - t0)))

[ Epoch 1 / 4 ]

  Batch    40  of    100.    Elapsed: 0:11:07.
  Batch    80  of    100.    Elapsed: 0:22:11.
  Average training loss: 0.72, Accuracy: 0.73
  Training epcoh took: 0:27:36

  Accuracy: 0.89
  Validation took: 0:00:58

[ Epoch 2 / 4 ]

  Batch    40  of    100.    Elapsed: 0:11:04.
  Batch    80  of    100.    Elapsed: 0:22:10.
  Average training loss: 0.20, Accuracy: 0.94
  Training epcoh took: 0:27:36

  Accuracy: 0.91
  Validation took: 0:00:58

[ Epoch 3 / 4 ]

  Batch    40  of    100.    Elapsed: 0:11:06.
  Batch    80  of    100.    Elapsed: 0:22:10.
  Average training loss: 0.11, Accuracy: 0.97
  Training epcoh took: 0:27:34

  Accuracy: 0.92
  Validation took: 0:00:58

[ Epoch 4 / 4 ]

  Batch    40  of    100.    Elapsed: 0:11:03.
  Batch    80  of    100.    Elapsed: 0:22:05.
  Average training loss: 0.07, Accuracy: 0.98
  Training epcoh took: 0:27:30

  Accuracy: 0.92
  Validation took: 0:00:58



### 模型loss隨epoch數的變化

In [None]:
# Line chart of the mean training loss recorded for each epoch
f = pd.DataFrame(loss_values).set_axis(['Loss'], axis=1)
fig = px.line(f, x=f.index, y=f.Loss)
layout_options = dict(title='Training loss of the Model',
                      xaxis_title='Epoch',
                      yaxis_title='Loss')
fig.update_layout(**layout_options)
fig.show()

### **測試模型表現**

In [None]:
# Loader over the held-out test portion (kept in order, no further split)
test_dataloader = generate_dataloader(inputs=ts_inputs, labels=ts_labels,
                                      batch_size=batch_size, mode='Test',
                                      masks=ts_masks)

Testing Data Length: 394
Test DataLoader Constructed.


In [None]:
modelBERT.eval()
# Per-batch logits and labels, kept as NumPy arrays for later inspection.
predictions , true_labels = [], []

# Accuracy is accumulated as correct / seen sample counts. Averaging the
# per-batch accuracies would give the smaller final batch the same weight as
# a full one.
test_correct, test_seen = 0, 0
with torch.no_grad():
    for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        outputs = modelBERT(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)
        # Without labels, the first output holds the logits.
        logits = outputs[0]

        test_correct += (logits.argmax(dim=1) == b_labels).sum().item()
        test_seen += b_labels.size(0)

        # Move logits and labels to CPU
        predictions.append(logits.detach().cpu().numpy())
        true_labels.append(b_labels.to('cpu').numpy())

test_accuracy = test_correct / test_seen
print("  Accuracy: {0:.2f}".format(test_accuracy))

  Accuracy: 0.94
