In [3]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
import pandas as pd
import math
import numpy as np
from sklearn.metrics import classification_report
import torch.nn.functional as F

In [5]:
import torch
import os
from tqdm import tqdm,trange
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [52]:
from pytorch_transformers import (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)

# Introduction

In this notebook, will introduce how to do text classification with XLNet, including:

- Load and preprocess data
- Parser data
- Make training data
- Train model
- Evaluate result
- Predict result

Tips:

**Also this notebook come with a post [NER with BERT in Action](https://medium.com/@yingbiao/ner-with-bert-in-action-936ff275bc73)**<br>
**Feel free to check it, hope that it could help you.**

## Load data

**Load CSV data**

In [28]:
data_path = "data/" 

In [29]:
data_file_address = "data/text_classification_dataset.csv"

In [35]:
# Fillna method can make same sentence with same sentence name
df_data = pd.read_csv(data_file_address,sep=",",encoding="utf-8",names=['labels','texts'])

In [36]:
df_data.columns

Index(['labels', 'texts'], dtype='object')

In [37]:
df_data.head(n=20)

Unnamed: 0,labels,texts
0,0,"god is great , the movie's not ."
1,0,. . . the whole thing succeeded only in making...
2,1,"light the candles , bring out the cake and don..."
3,1,"the story may not be new , but australian dire..."
4,1,you live the mood rather than savour the story .
5,1,". . . "" bowling for columbine "" remains a disq..."
6,1,occasionally amateurishly made but a winsome c...
7,0,"by the time you reach the finale , you're like..."
8,0,the best way to hope for any chance of enjoyin...
9,0,something must have been lost in the translati...


**Have a look labels**

In [38]:
df_data.labels.unique()

array([0, 1])

In [39]:
# Analyse the labels distribution
df_data.labels.value_counts()

1    5331
0    5331
Name: labels, dtype: int64

## Parser data

**Parser data into document structure**

In [40]:
# Get sentence data
sentences = df_data.texts.to_list()
sentences[0]

"god is great , the movie's not ."

In [41]:
# Get tag labels data
labels = df_data.labels.to_list()
print(labels[0])

0


**Make TAG name into index for training**

In [42]:
# Set a dict for mapping id to tag name
#tag2idx = {t: i for i, t in enumerate(tags_vals)}

# Recommend to set it by manual define, good for reusing
tag2idx={'negative': 0,
 'positive': 1}

In [43]:
tag2idx

{'negative': 0, 'positive': 1}

In [44]:
# Mapping index to name
tag2name={tag2idx[key] : key for key in tag2idx.keys()}

In [45]:
tag2name

{0: 'negative', 1: 'positive'}

## Make tranning data

Make raw data into trainable data for XLNet, including:

- Set gpu environment
- Load tokenizer and tokenize
- Set 3 embedding, token embedding, mask word embedding, segmentation embedding
- Split data set into train and validate, then send them to dataloader

**Set up gpu environment**

In [46]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [47]:
n_gpu

0

### Load tokenizer

Remember to install sentencepiece with  'pip install sentencepiece'

In [114]:
# Manual define vocabulary address, if you download the model in local
# The vocabulary can download from "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model"
vocabulary = 'data/xlnet-base-cased/xlnet-base-cased-spiece.model'

In [115]:
# Len of the sentence must be the same as the training model
# See model's 'max_position_embeddings' = 512
max_len  = 3

In [116]:
# With cased model, set do_lower_case = False
tokenizer = XLNetTokenizer(vocab_file=vocabulary,do_lower_case=False)

**Encode text into input ids embedding**

In [121]:
# Need to set <cls> at front and <sep> at the end, as what the paper do.

In [92]:
max_len  = 20
input_ids = []
for i,sentence in enumerate(sentences):
    # Tokenize sentence to token id list
    tokens_a = tokenizer.encode(sentence)
    
    # Trim the len of text
    if(len(tokens_a)>max_len-2):
        tokens_a = tokens_a[:max_len-2]
        
    # Add [CLS] at the front 
    # Add [SEP] at the end
    tempt_token = tokenizer.encode('<cls>')+tokens_a+tokenizer.encode('<sep>')
    
    
    # Pad the embedding for 
    
    input_ids.append(" ".join(tempt_token))
    
    if 5 > i:
        print("No.:%d, sentence: %s,token result:%s"%(i,sentence," ".join(tempt_token)))
    
    
    
    

No.:0, sentence: god is great , the movie's not .,token result:[CLS] ▁god ▁is ▁great ▁ , ▁the ▁movie ' s ▁not ▁ . [SEP]
No.:1, sentence: . . . the whole thing succeeded only in making me groggy .,token result:[CLS] ▁ . ▁ . ▁ . ▁the ▁whole ▁thing ▁succeeded ▁only ▁in ▁making ▁me ▁ gro ggy ▁ [SEP]
No.:2, sentence: light the candles , bring out the cake and don't fret about the calories because there's precious little substance in birthday girl -- it's simply , and surprisingly , a nice , light treat .,token result:[CLS] ▁light ▁the ▁candles ▁ , ▁bring ▁out ▁the ▁cake ▁and ▁don ' t ▁fret ▁about ▁the ▁calories ▁because [SEP]
No.:3, sentence: the story may not be new , but australian director john polson , making his american feature debut , jazzes it up adroitly .,token result:[CLS] ▁the ▁story ▁may ▁not ▁be ▁new ▁ , ▁but ▁ australia n ▁director ▁ john ▁ pol son [SEP]
No.:4, sentence: you live the mood rather than savour the story .,token result:[CLS] ▁you ▁live ▁the ▁mood ▁rather ▁than ▁ 

In [117]:
tokenizer.encode("Hello, my dog is cute")

[17, 11368, 19, 94, 2288, 27, 10920]

In [118]:
tokenizer.encode("i have a dream")

[17, 150, 47, 24, 2986]

In [120]:
tokenizer.encode("<cls>")

[3]

In [109]:
type(tokenizer.encode("<CLS> god is great , the movie's not . <SEP>"))

list

### Set token embedding

Pad or trim the text and label to fit the need for max len

In [101]:
# Make text token into id
input_ids = [ tokenizer.encode(token_text) + ([0] * (max_len - len(tokenizer.encode(token_text))))for token_text in tokenized_texts ]
print(input_ids[0])

[4145, 7416, 83, 3158, 7290, 27, 312, 17, 19, 18, 1432, 17, 26, 17, 23, 50, 17, 9, 4145, 83, 8186, 3158]


In [33]:
# Make label into id
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in word_piece_labels],
                     maxlen=max_len,  padding="post",
                     dtype="long", truncating="post")
print(tags[0])

[18  9  9  9 17 17  9  9  9  0  9  9  9  9  9  0  9  9  9  9  9 13  9  9
  9  9  9 19  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9]


### Set mask word embedding

In [34]:
# For fine tune of predict, with token mask is 1,pad token is 0
attention_masks = [[int(i>0) for i in ii] for ii in input_ids]
attention_masks[0];

### Set segment embedding(Seem like for sequance tagging task, it's not necessary to make this embedding)

In [36]:
# Since only one sentence, all the segment set to 0
segment_ids = [[0] * len(input_id) for input_id in input_ids]
segment_ids[0];

## Split data into train and validate

70% for training, 30% for validation

**Split all data**

In [37]:
tr_inputs, val_inputs, tr_tags, val_tags,tr_masks, val_masks,tr_segs, val_segs = train_test_split(input_ids, tags,attention_masks,segment_ids, 
                                                            random_state=4, test_size=0.3)

In [38]:
len(tr_inputs),len(val_inputs),len(tr_segs),len(val_segs)

(33571, 14388, 33571, 14388)

**Set data into tensor**

Not recommend tensor.to(device) at this process, since it will run out of GPU memory

In [39]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
tr_segs = torch.tensor(tr_segs)
val_segs = torch.tensor(val_segs)

**Put data into data loader**

In [40]:
# Set batch num
batch_num = 32

In [41]:
# Only set token embedding, attention embedding, no segment embedding
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
# Drop last can make batch training better for the last one
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_num,drop_last=True)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_num)

## Train model

**Load BERT model**

In [42]:
# In this document, contain confg(txt) and weight(bin) files
model_file_address = 'data/bert-base-cased'

In [44]:
# Will load config and weight with from_pretrained()
model = BertForTokenClassification.from_pretrained(model_file_address,num_labels=len(tag2idx))

In [45]:
model;

In [47]:
# Set model to GPU,if you are using GPU machine
model.cuda();

In [48]:
# Add multi GPU support
if n_gpu >1:
    model = torch.nn.DataParallel(model)

In [49]:
# Set epoch and grad max num
epochs = 5
max_grad_norm = 1.0

In [50]:
# Cacluate train optimiazaion num
num_train_optimization_steps = int( math.ceil(len(tr_inputs) / batch_num) / 1) * epochs

### Set fine tuning method

**Manual optimizer**

In [52]:
# True: fine tuning all the layers 
# False: only fine tuning the classifier layers
FULL_FINETUNING = True

In [53]:
if FULL_FINETUNING:
    # Fine tune model all layer parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    # Only fine tune classifier parameters
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

### Fine-tuing model

In [54]:
# TRAIN loop
model.train();

In [None]:
print("***** Running training *****")
print("  Num examples = %d"%(len(tr_inputs)))
print("  Batch size = %d"%(batch_num))
print("  Num steps = %d"%(num_train_optimization_steps))
for _ in trange(epochs,desc="Epoch"):
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
        attention_mask=b_input_mask, labels=b_labels)
        if n_gpu>1:
            # When multi gpu, average it
            loss = loss.mean()
        
        # backward pass
        loss.backward()
        
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        
        # update parameters
        optimizer.step()
        optimizer.zero_grad()
        
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
        

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 33571
  Batch size = 32
  Num steps = 5250


Epoch:  20%|██        | 1/5 [10:21<41:26, 621.73s/it]

Train loss: 0.135118007581784


## Save model 

In [54]:
bert_out_address = 'data/bert_out_model/en09'

In [55]:
# Make dir if not exits
if not os.path.exists(bert_out_address):
        os.makedirs(bert_out_address)

In [56]:
# Save a trained model, configuration and tokenizer
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

In [57]:
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(bert_out_address, "pytorch_model.bin")
output_config_file = os.path.join(bert_out_address, "bert_config.json")

In [58]:
# Save model into file
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(bert_out_address)

'data/bert_out_model/en09/vocab.txt'

## Load model

In [59]:
model = BertForTokenClassification.from_pretrained(bert_out_address,num_labels=len(tag2idx))

In [60]:
# Set model to GPU
model.cuda();

In [61]:
if n_gpu >1:
    model = torch.nn.DataParallel(model)

## Eval model

In [62]:
# Evalue loop
model.eval();

In [63]:
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
y_true = []
y_pred = []

print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(batch_num))
for step, batch in enumerate(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch
    
#     if step > 2:
#         break
    
    with torch.no_grad():
        logits = model(input_ids, token_type_ids=None,
        attention_mask=input_mask,)
    
    # Get NER predict result
    logits = torch.argmax(F.log_softmax(logits,dim=2),dim=2)
    logits = logits.detach().cpu().numpy()
    
    
    # Get NER true result
    label_ids = label_ids.to('cpu').numpy()
    
    
    # Only predict the real word, mark=0, will not calculate
    input_mask = input_mask.to('cpu').numpy()
    
    # Compare the valuable predict result
    for i,mask in enumerate(input_mask):
        # Real one
        temp_1 = []
        # Predict one
        temp_2 = []
        
        for j, m in enumerate(mask):
            # Mark=0, meaning its a pad word, dont compare
            if m:
                if tag2name[label_ids[i][j]] != "X" and tag2name[label_ids[i][j]] != "[CLS]" and tag2name[label_ids[i][j]] != "[SEP]" : # Exclude the X label
                    temp_1.append(tag2name[label_ids[i][j]])
                    temp_2.append(tag2name[logits[i][j]])
            else:
                break
        
            
        y_true.append(temp_1)
        y_pred.append(temp_2)

        

print("f1 socre: %f"%(f1_score(y_true, y_pred)))
print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))

# Get acc , recall, F1 result report
report = classification_report(y_true, y_pred,digits=4)

# Save the report into file
output_eval_file = os.path.join(bert_out_address, "eval_results.txt")
with open(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    print("\n%s"%(report))
    print("f1 socre: %f"%(f1_score(y_true, y_pred)))
    print("Accuracy score: %f"%(accuracy_score(y_true, y_pred)))
    
    writer.write("f1 socre:\n")
    writer.write(str(f1_score(y_true, y_pred)))
    writer.write("\n\nAccuracy score:\n")
    writer.write(str(accuracy_score(y_true, y_pred)))
    writer.write("\n\n")  
    writer.write(report)

***** Running evaluation *****
  Num examples =9592
  Batch size = 32
f1 socre: 0.835420
Accuracy score: 0.971943
***** Eval results *****

           precision    recall  f1-score   support

      org     0.7392    0.6954    0.7166      3897
      geo     0.8490    0.9043    0.8758      7339
      eve     0.3333    0.4412    0.3797        68
      per     0.7513    0.8004    0.7750      3366
      tim     0.8535    0.8742    0.8637      4031
      art     0.1746    0.2651    0.2105        83
      gpe     0.9590    0.9396    0.9492      3213
      nat     0.5517    0.4571    0.5000        35

micro avg     0.8245    0.8466    0.8354     22032
macro avg     0.8269    0.8466    0.8361     22032

f1 socre: 0.835420
Accuracy score: 0.971943
