In [1]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [2]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [3]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/9c/35/1c3f6e62d81f5f0daff1384e6d5e6c5758682a8357ebc765ece2b9def62b/transformers-3.0.0-py3-none-any.whl (754kB)
[K     |▍                               | 10kB 26.1MB/s eta 0:00:01[K     |▉                               | 20kB 26.7MB/s eta 0:00:01[K     |█▎                              | 30kB 32.0MB/s eta 0:00:01[K     |█▊                              | 40kB 18.1MB/s eta 0:00:01[K     |██▏                             | 51kB 14.4MB/s eta 0:00:01[K     |██▋                             | 61kB 12.9MB/s eta 0:00:01[K     |███                             | 71kB 12.4MB/s eta 0:00:01[K     |███▌                            | 81kB 12.2MB/s eta 0:00:01[K     |████                            | 92kB 11.8MB/s eta 0:00:01[K     |████▍                           | 102kB 12.9MB/s eta 0:00:01[K     |████▊                           | 112kB 12.9MB/s eta 0:00:01[K     |█████▏                          | 

In [4]:
from google.colab import files
uploaded = files.upload()

Saving order.xlsx to order.xlsx


In [5]:
import io
import pandas as pd
import numpy as np
df = pd.read_excel(io.BytesIO(uploaded['order.xlsx']))

In [6]:
df = df.dropna()
df

Unnamed: 0,times,titles,contents,tags,label,label_revised
0,2019-11-08 13:36:15,(3030)德律-重大訊息 說明媒體報導本公司前高階主管搬走\r\nFCT生產線搶訂單16人...,\n序號\n1\n發言日期\n108/11/08\n發言時間\n13:36:15\n發言人\...,"['3030', '德律']",0,-1.0
1,2020-03-27 19:00:55,(3037)欣興-重大訊息 公告本公司董事會決議通過追加109年度資本預算\r\n及長交期設...,\n序號\n2\n發言日期\n109/03/27\n發言時間\n19:00:55\n發言人\...,"['3037', '欣興']",0,0.0
2,2020-03-20 16:03:04,"(3629)地心引力-重大訊息 公告本公司子公司GAMESWORD CO., LIMITED...",\n序號\n6\n發言日期\n109/03/20\n發言時間\n16:03:04\n發言人\...,"['3629', '地心引力']",0,1.0
3,2020-03-13 18:06:53,(3629)地心引力-重大訊息 公告本公司董事會決議通過授權遊戲事業群執行長兼營運長\r\n...,\n序號\n2\n發言日期\n109/03/13\n發言時間\n18:06:53\n發言人\...,"['3629', '地心引力']",0,1.0
4,2020-04-28 13:54:06,(5344)立衛-重大訊息 公告本公司接獲客戶訂單,\n序號\n2\n發言日期\n109/04/28\n發言時間\n13:54:06\n發言人\...,"['5344', '立衛']",0,1.0
...,...,...,...,...,...,...
684,2020-01-29 15:04:00,個股：宏全(9939)湖北廠整改中，其餘廠區配合延後復工，估計訂單不受影響,【財訊快報／記者陳浩寧報導】中國武漢肺炎疫情加劇，湖北省包含武漢等地已宣布封城，宏全(993...,[],0,0.0
685,2020-03-26 09:34:00,個股：宜特(3289)去年轉盈並配息2元，樂觀看今年，訂單能見度達第二季底,【財訊快報／記者李純君報導】電子產品驗證廠宜特(3289)2019年順利虧轉盈，繳出全年每股...,[],0,0.0
686,2020-06-20 08:00:00,個股：客戶訂單6月起回流，得力(1464)業績邁出谷底，7月起可望明顯回溫,【財訊快報／記者王宜弘報導】新冠疫情衝擊紡織供應鏈，得力(1464)第二季業績提前過冬，不過...,[],0,1.0
687,2020-05-27 12:56:00,個股：家登(3680)預付貨款已清，訂單太強，Q2與全年營收獲利將締新猷,【財訊快報／記者李純君報導】EUV光罩盒關鍵供應商家登(3680)傳出已經在今年第一季將台積...,[],1,1.0


In [7]:
#label can't be -1
df['label_revised'] = df['label_revised'] +1
df['label_revised'].unique()

array([0., 1., 2.])

In [8]:
#split contents -> 2/3:1/3
import re

def contents_trimmer(text, max_len=510):
       
    if(max_len > 510):
        print('error message: max_len must <= 510')
        exit()
    
    new_content=''
    
    if len(text) >= max_len:
        sent = re.split('，|\r\n\r\n\r\n\r\n|r\n\r\n\r\n|\r\n\r\n|\r\n',text)
        sent = [x for x in sent if x !='']
        new_content=sent[0]
        for i in range(1,len(sent)):
            if(len(new_content)+len(sent[i]) < int(max_len)*2/3):
                if(new_content[-1]!='。'):
                    new_content = new_content + '，' + sent[i]
                else:
                    new_content = new_content + sent[i]
            else:
                break
        
        tail_content=sent[-1]
        for i in range(2,len(sent)):
            if(len(tail_content)+len(sent[-i]) < int(max_len)*1/3):
                if(sent[-i]!='。'):
                    tail_content = sent[-i] + '，' + tail_content 
                else:
                    tail_content = sent[-i] + tail_content
            else:
                break
        return new_content +'。'+ tail_content
    
    else:
        sent = re.split('\r\n\r\n\r\n\r\n|r\n\r\n\r\n|\r\n\r\n|\r\n',text)
        new_content = ''.join(sent)
        return new_content

In [13]:
for i in range(len(df)):
    df['contents'].iloc[i] = contents_trimmer(df['contents'].iloc[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [14]:
# Get the lists of sentences and their labels.
sentences = df.contents.values
labels = df.label_revised.values

In [15]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

Loading BERT tokenizer...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=109540.0, style=ProgressStyle(descripti…




In [16]:
new_sent = []
for sent in sentences:
    if len(sent) > 510:
        new_sent.append(sent[0:510])
        print('too long!!')
    else:
        new_sent.append(sent)
#list to array
new_sent = np.array(new_sent)

too long!!
too long!!
too long!!


In [17]:
max_len = 0

# For every sentence...
for sent in new_sent:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  502


In [18]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in new_sent:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 510,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
#labels = torch.tensor(labels)
labels = torch.tensor(labels, dtype=torch.long)

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'only_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you may want to check this is the right behavior.
Truncation was n

Original:  
序號
1
發言日期
108/11/08
發言時間
13:36:15
發言人
林江淮
發言人職稱
營業部副總經理
發言人電話
(02)2832-8918
主旨
說明媒體報導本公司前高階主管搬走FCT生產線搶訂單16人遭起訴乙案
符合條款
第
51
款
事實發生日
108/11/07
說明
1.事實發生日:108/11/07
2.公司名稱:德律科技股份有限公司
3.與公司關係(請輸入本公司或子公司):本公司
4.相互持股比例:不適用
5.傳播媒體名稱:聯合報等
6.報導內容:德律科技前高階主管搬走FTC生產線搶訂單 16人遭起訴
7.發生緣由:媒體報導本公司前高階主管搬走FCT生產線搶訂單16人遭起訴一案，
本公司特此說明，本公司前接獲舉報有員工涉有不誠信行為，
經內部調查確認後，依法於105年8月中向臺灣士林地方法院檢察署提出告訴，
頃經檢察官偵查終結，
對被告依違反證券交易法背信罪、營業秘密法及著作權法提起公訴。
8.因應措施:本公司當配合後續司法審判進行必要作為。
9.其他應敘明事項:無

Token IDs: tensor([  101,  2415,  5998,   122,  4634,  6241,  3189,  3309,  8692,   120,
         8111,   120,  8142,  4634,  6241,  3229,  7279,  8124,   131,  8216,
          131,  8115,  4634,  6241,   782,  3360,  3736,  3917,  4634,  6241,
          782,  5480,  4935,  4245,  3511,  6956,  1199,  5244,  5195,  4415,
         4634,  6241,   782,  7442,  6282,   113,  8150,   114, 11152,  8144,
          118,  8426,  8662,   712,  3192,  6303,  3209,  2054,  7768,  1841,
         2206,  3315,  1062,  1385,  1184,  7770,  7389,

In [19]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 90-10 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.6 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print('{:>5,} testing samples'.format(test_size))

  413 training samples
   68 validation samples
  208 testing samples


In [20]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
#choose batch size = 16 ,otherwise GPU will run out of memory.
#try 8 or GPU died
batch_size = 8

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )
#test
test_dataloader = DataLoader(
            test_dataset, # The validation samples.
            sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [21]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-chinese", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 3, # The number of output labels--3 
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=624.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411577189.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [22]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (21128, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [23]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [24]:
from transformers import get_linear_schedule_with_warmup

#try epoch = 4, val loss keep increasing
#try epoch = 2
epochs = 4

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [25]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    #change threshold in this line ex:0.6
    #pred_flat = []
    #for ~:
    #    int(tf.nn.softmax(logits[0])[1] > 0.6).append(pred_flat)
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [26]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [27]:
import random

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # It returns different numbers of parameters depending on what arguments
        # arge given and what flags are set. For our useage here, it returns
        # the loss (because we provided labels) and the "logits"--the model
        # outputs prior to activation.
        loss, logits = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using 
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            (loss, logits) = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    40  of     52.    Elapsed: 0:00:35.

  Average training loss: 0.77
  Training epcoh took: 0:00:46

Running Validation...
  Accuracy: 0.78
  Validation Loss: 0.63
  Validation took: 0:00:03

Training...
  Batch    40  of     52.    Elapsed: 0:00:34.

  Average training loss: 0.63
  Training epcoh took: 0:00:44

Running Validation...
  Accuracy: 0.83
  Validation Loss: 0.50
  Validation took: 0:00:03

Training...
  Batch    40  of     52.    Elapsed: 0:00:34.

  Average training loss: 0.47
  Training epcoh took: 0:00:44

Running Validation...
  Accuracy: 0.79
  Validation Loss: 0.45
  Validation took: 0:00:03

Training...
  Batch    40  of     52.    Elapsed: 0:00:34.

  Average training loss: 0.37
  Training epcoh took: 0:00:44

Running Validation...
  Accuracy: 0.82
  Validation Loss: 0.47
  Validation took: 0:00:03

Training complete!
Total training took 0:03:08 (h:mm:ss)


In [29]:
#get the predicted value
def get_predictions(model, dataloader):
    model.eval()
    #save predictions
    predictions, true_labels = [], []

    #Predict
    for batch in dataloader:
        #add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]
        #move to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        #store predictions
        predictions.extend(np.argmax(logits, axis=1).flatten().tolist())
        true_labels.extend(label_ids.tolist())
    #create data frame
    df_pred = pd.DataFrame({'pred': predictions, 'true': true_labels})
    acc = np.sum(df_pred['pred'] == df_pred['true']) / len(df_pred)
    return df_pred, acc

In [30]:
df_pred1, acc1 = get_predictions(model, train_dataloader)

In [31]:
df_pred2, acc2 = get_predictions(model, test_dataloader)

In [32]:
acc1

0.9128329297820823

In [33]:
acc2

0.7692307692307693

In [34]:
from sklearn.metrics import confusion_matrix
confusion_matrix(df_pred2['true'], df_pred2['pred'])

array([[ 24,   4,   7],
       [  5,   8,  22],
       [  5,   5, 128]])

In [35]:
689*0.6

413.4

In [36]:
413/3

137.66666666666666