In [9]:
#NewsDataset class：將url中的Data轉成Bert input型態，包含tokens、masks、labels
#tokens：可以理解為字元向量，因為有作padding，也就是讓各向量長度相同，補0之動作(padding主要是讓向量們可以使用矩陣運算，加快訓練速度)
#masks ：用於區分是否為padding之element，tokens為1、paddings為0
#labels：資料一開始的標籤，也就是用於訓練的答案

import torch
import pandas as pd
from torch.utils.data import Dataset
from keras.preprocessing.sequence import pad_sequences


class NewsDataset(Dataset):
    def __init__(self, mode, url, tokenizer):
        assert mode in ["train", "predict"]  # 一般訓練你會需要 dev set
        self.mode = mode
        self.df = pd.read_csv(url, delimiter='\t', header=None)
        self.len = len(self.df)
        self.tokenizer = tokenizer  # 我們將使用 BERT tokenizer
        
#-------you might adjust codes here!!!!!!!------reset indices-----With different input data, codes here should be modified correspondingly
        self.df = self.df.reset_index()
        self.df = self.df.loc[:, [0, 1]]
        self.df.columns = ['text', 'label']
        self.text =self.df['text'].values
        self.labels = self.df['label'].values
        
        #-------------convert to Bert-pretrained tokens-----------------
        self.tokens = []
        for sent in self.text:
        # `encode` will:(1) Tokenize the sentence.(2) Prepend the `[CLS]` token to the start. (3) Append the `[SEP]` token to the end.(4) Map tokens to their IDs.
           encoded_sent = tokenizer.encode(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                   )
           self.tokens.append(encoded_sent)
        
        #-------------padding-------------------------------------------
        MAX_LEN = max(len(self.tokens[i]) for i in range(len(self.tokens)))
        self.tokens = pad_sequences(self.tokens, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

        #-------------Create attention masks----------------------------
        attention_masks = []
        # Create a mask of 1s for each token followed by 0s for padding
        for seq in self.tokens:
           mask = [int(i>0) for i in seq]
           attention_masks.append(mask) 

        #-------------Convert to tensors--------------------------------
        self.tokens = torch.tensor(self.tokens)
        self.masks = torch.tensor(attention_masks)
        self.labels = torch.tensor(self.labels)       


    #---------inherit from Dataset needing define __len__ and __getitem__ methods--------------
    def __len__(self):
        return self.len
  
    def __getitem__(self, idx):
       if self.mode == "predict":
            token = self.tokens[idx]
            mask =self.masks[idx]
            label_tensor = None
       else:
            token = self.tokens[idx]
            mask =self.masks[idx]
            label = self.labels[idx]
       return (token, mask, label)


Using TensorFlow backend.


In [10]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

def data_split(data_set,test_size=0.1,batch_size=20):
    #-------------------Use 90% for training and 10% for validation.
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(data_set.tokens, data_set.labels, random_state=2020, test_size=test_size)
    #-------------------Do the same for the masks.
    train_masks, validation_masks, _, _ = train_test_split(data_set.masks, data_set.labels, random_state=2020, test_size=test_size)

    # Create the DataLoader for our training set.
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    
    # Create the DataLoader for our validation set.
    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
    return train_dataloader, validation_dataloader
    
#data_split[0] is train_dataloader while data_split[1] is validation_dataloader


In [11]:
import torch
import numpy as np
import random
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

def trainprocess(model, train_dataloader, validation_dataloader):
    #---------------If there's a GPU available...
    if torch.cuda.is_available():    
        # Tell PyTorch to use the GPU.    
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    #---------------If not...
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    # Tell pytorch to run this model on the GPU.
    model.cuda()


    # Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
    # I believe the 'W' stands for 'Weight Decay fix"
    optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

    # Number of training epochs (authors recommend between 2 and 4)
    epochs = 4
    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)



    # This training code is based on the `run_glue.py` script here:
    # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128


    # Set the seed value all over the place to make this reproducible.
    # with the same seed value, one can produce same random list in every execution
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # Store the average loss after each epoch so we can plot them.
    loss_values = []

    # For each epoch...
    for epoch_i in range(0, epochs):
    
        # ========================================
        #               Training
        # ========================================
    
        # Perform one full pass over the training set.

        print("")
        print('================== Epoch {:} / {:} =================='.format(epoch_i + 1, epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_loss = 0

        # Put the model into training mode. 
        # Don't be mislead--the call to "train" just changes the *mode*, it doesn't *perform* the training.
        # "dropout" and "batchnorm" layers behave differently during training
        # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
        model.train()
    
        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):

            # Progress update every 40 batches and show execution time for each step
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            

            # Unpack this training batch from our dataloader. 
            # As we unpack the batch, we'll also copy each tensor to the GPU using the "to" method.
            # batch contains three pytorch tensors:
            #   [0]: input tokens 
            #   [1]: attention masks
            #   [2]: labels 
            b_input_ids = batch[0].type(torch.LongTensor)
            b_input_mask = batch[1].type(torch.LongTensor)
            b_labels = batch[2].type(torch.LongTensor)
        
                    
            b_input_ids =  b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)
            b_labels = b_labels.to(device)
        
        
        
            # Always clear any previously calculated gradients before performing a
            # backward pass. PyTorch doesn't do this automatically because accumulating the gradients is "convenient while training RNNs". 
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            model.zero_grad()

        
            # Perform a forward pass (evaluate the model on this training batch).
            # This will return the loss (rather than the model output) because we have provided the "labels".
            # The documentation for this "model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        
               
            # The call to "model" always returns a tuple, so we need to pull the loss value out of the tuple.
            loss = outputs[0]

            # Accumulate the training loss over all of the batches so that we can calculate the average loss at the end. 
            # "loss" is a Tensor containing a single value; the ".item()" function just returns the Python value from the tensor.
            total_loss += loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are modified based on their gradients, the learning rate, etc.
            optimizer.step()

            # Update the learning rate.
            scheduler.step()

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)            
    
        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our performance on our validation set.

        print("")
        print("Running Validation...")

        t0 = time.time()

        # Put the model in evaluation mode--the dropout layers behave differently during evaluation.
        model.eval()

        # Tracking variables 
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        # Evaluate data for one epoch
        for batch in validation_dataloader:
        
            # Add batch to GPU
            #batch = tuple(t.to(device) for t in batch)
        
            # Unpack the inputs from our dataloader
            #b_input_ids, b_input_mask, b_labels = batch
            b_input_ids = batch[0].type(torch.LongTensor)
            b_input_mask = batch[1].type(torch.LongTensor)
            b_labels = batch[2].type(torch.LongTensor)
        
        
            b_input_ids =  b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)
            b_labels = b_labels.to(device)

            # Telling the model not to compute or store gradients, saving memory and speeding up validation
            with torch.no_grad():        

                # Forward pass, calculate logit predictions.
                # This will return the logits rather than the loss because we have not provided labels.
                # token_type_ids is the same as the "segment ids", which differentiates sentence 1 and 2 in 2-sentence tasks.
                # The documentation for this "model" function is here: 
                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        
            # Get the "logits" output by the model. The "logits" are the outputvalues prior to applying an activation function like the softmax.
            logits = outputs[0]

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
        
            # Calculate the accuracy for this batch of test sentences.
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
            # Accumulate the total accuracy.
            eval_accuracy += tmp_eval_accuracy

            # Track the number of batches
            nb_eval_steps += 1

        # Report the final accuracy for this validation run.
        print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
        print("  Validation took: {:}".format(format_time(time.time() - t0)))

    print("")
    print("Training complete!")
    print("memory used:{:}".format(torch.cuda.memory_allocated(device=0)))
    print("cache used:{:}".format(torch.cuda.memory_cached(device=0)))
    del optimizer, scheduler
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    print("memory used:{:}".format(torch.cuda.memory_allocated(device=0)))
    print("cache used:{:}".format(torch.cuda.memory_cached(device=0)))
    return model


In [12]:
import torch
import gc, os
import jupyter_client
print("memory used:{:}".format(torch.cuda.memory_allocated(device=0)))
print("cache used:{:}".format(torch.cuda.memory_cached(device=0)))

#del model
torch.cuda.reset_max_memory_cached(device=0)

torch.cuda.init()
torch.cuda.ipc_collect()
torch.cuda.empty_cache()
#a=torch.cuda.memory_stats(device=0)
torch.cuda.reset_max_memory_allocated(device=0)

jupyter_client.KernelManager.shutdown_kernel
os._exit
print("memory:{:}".format(torch.cuda.memory_allocated(device=0)))
print("cache:{:}".format(torch.cuda.memory_cached(device=0)))


AttributeError: module 'torch._C' has no attribute '_cuda_memoryStats'

In [13]:
def model_prediction(model, tokenizer, sentences):
    #---------------If there's a GPU available...
    if torch.cuda.is_available():    
        # Tell PyTorch to use the GPU.    
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    #---------------If not...
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

   
    encoded_token=[]
    for sent in sentences:
      encoded_sent = tokenizer.encode(sent,add_special_tokens = True)
      encoded_token.append(encoded_sent)
 
    MAX_LEN = max(len(sent) for sent in encoded_token)
    encoded_id = pad_sequences(encoded_token, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    encoded_id=torch.tensor(encoded_id)  
    
      
    model.eval()   
    b_input_ids = encoded_id.type(torch.LongTensor)
    b_input_ids = b_input_ids.to(device)
    
    
    with torch.no_grad():             
        outputs = model(b_input_ids)
        
        logits = outputs[0]
        print(outputs[0])
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        result=np.argmax(logits, axis=1).flatten()
        print(result)

import os
#Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
def save_model(model, tokenizer, output_dir):
    # Create output directory if needed
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    print("Saving model to %s" % output_dir)
    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`
    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

In [14]:
from transformers import BertTokenizer, BertForSequenceClassification
model_version = 'C:/Users/vtteam/Documents/bert-base-uncased'   #此處可以更改訓練語言，中文pretrain model為'bert-base-chinese'
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    model_version,          # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2,               # The number of output labels--2 for binary classification and you can increase this for multi-class tasks.   
    output_attentions = False,    # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
tokenizer = BertTokenizer.from_pretrained(model_version)

#-------------------you can use either data url or it's path as input-----------------------------------------------
data_set_url='https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv'
#----you might need to adjust the codes in NewsDataSet class depending on which columns in CSV are training contents and labels
data_set = NewsDataset("train", data_set_url, tokenizer=tokenizer)


#----split your data as training set and validation set-------------------------------------------------------------
#----data_split(data_set,test_size=0.1,batch_size=20) as you can see test_size and batch_size have default values---
dataloader = data_split(data_set)
train_dataloader =dataloader[0]
validation_dataloader=dataloader[1]


#----train!!!!!------------------------------------------------------------------------------------------------------
model=trainprocess(model,train_dataloader,validation_dataloader)


#----use model to predict--------------------------------------------------------------------------------------------
sentences =['South Korea’s Kospi led losses among the region’s major markets as it dropped 6.86% while the Kosdaq index fell 7.5%.',
            'Stocks tumbled on Wednesday, reaching a new coronavirus crisis low as investors worried about the economic damage from the pandemic.',
            'That is a terrible movie',
            'Dow rebounds more than 1,000 points as Trump seeks $1 trillion in stimulus for coronavirus fight']
model_prediction(model, tokenizer, sentences)


#----save trained-model-----------------------------------------------------------------------------------------------
output_dir = './model_save/'
#save_model(model, tokenizer, output_dir)

OSError: Can't load config for 'C:/Users/vtteam/Documents/bert-base-uncased'. Make sure that:

- 'C:/Users/vtteam/Documents/bert-base-uncased' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'C:/Users/vtteam/Documents/bert-base-uncased' is the correct path to a directory containing a config.json file

