In [1]:
import torch

# Pick the compute device once. Later cells move tensors with `.to(device)`,
# so `device` must be defined even when no CUDA GPU is present.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    # Fall back to the CPU instead of leaving `device` undefined.
    device = torch.device('cpu')
    print('No GPU available, using the CPU instead.')

There are 2 GPU(s) available.
We will use the GPU: GeForce RTX 2080 Ti


## Transformersのインストール

In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/4c/a0/32e3a4501ef480f7ea01aac329a716132f32f7911ef1c2fac228acc57ca7/transformers-2.6.0-py3-none-any.whl (540kB)
[K     |████████████████████████████████| 542kB 7.4MB/s eta 0:00:01
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 12.9MB/s eta 0:00:01
Collecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d6/e3/5e49e9a83fb605aaa34a1c1173e607302fecae529428c28696fb18f1c2c9/tokenizers-0.5.2-cp37-cp37m-manylinux1_x86_64.whl (5.6MB)
[K     |████████████████████████████████| 5.6MB 11.4MB/s eta 0:00:01
Collecting filelock
  Downloading https://files.pythonhosted.org/packages/93/83/71a2ee6158bb9f39a90c0dea1637f81d5eef866e188e1971a1b1ab01a35a/filelock-3.0.12-py3-none-any.whl
Building wheels for 

## Loading CoLA Dataset

- 文章が文法的に正しくないか正しいかのデータセット
- 0=unacceptable, 1=acceptable

In [3]:
import wget
import os

url = 'https://nyu-mll.github.io/CoLA/cola_public_1.1.zip'
zip_path = './cola_public_1.1.zip'

# Download only when the archive is missing: calling wget.download again on an
# existing path stores a second, numbered copy instead of reusing the file.
if not os.path.exists(zip_path):
    wget.download(url, zip_path)

# Keep showing the archive path as the cell's result.
zip_path

'./cola_public_1.1.zip'

In [4]:
!unzip cola_public_1.1.zip

Archive:  cola_public_1.1.zip
   creating: cola_public/
  inflating: cola_public/README      
   creating: cola_public/tokenized/
  inflating: cola_public/tokenized/in_domain_dev.tsv  
  inflating: cola_public/tokenized/in_domain_train.tsv  
  inflating: cola_public/tokenized/out_of_domain_dev.tsv  
   creating: cola_public/raw/
  inflating: cola_public/raw/in_domain_dev.tsv  
  inflating: cola_public/raw/in_domain_train.tsv  
  inflating: cola_public/raw/out_of_domain_dev.tsv  


In [5]:
import pandas as pd

# The TSV is read without a header row (header=None), so the four column
# names are supplied explicitly.
column_names = ['sentence_source', 'label', 'label_notes', 'sentence']
df = pd.read_csv(
    './cola_public/raw/in_domain_train.tsv',
    sep='\t',
    header=None,
    names=column_names,
)

print('Number of training sentences: {:,}\n'.format(len(df)))
# Show ten random rows as the cell output.
df.sample(10)

Number of training sentences: 8,551



Unnamed: 0,sentence_source,label,label_notes,sentence
4029,ks08,1,,There arose a great storm.
2632,l-93,0,*,Doug cleared at the table.
5350,b_73,0,*,A tangerine isn't as much different from an or...
5621,c_13,1,,Everyone should be able to defend himself.
4116,ks08,1,,We made them take the money.
1649,r-67,0,*,Here's a knife which for you to cut up the oni...
2253,l-93,0,*,Monica moved at the cat.
1777,r-67,1,,Willy is taller than Bill by that much.
5724,c_13,1,,That Dan smokes in the office really bothers A...
7790,ad03,0,*,Any boy saw no one.


In [6]:
# Five random sentences labelled 0, showing only the text and the label.
unacceptable_rows = df[df['label'] == 0]
unacceptable_rows.sample(5)[['sentence', 'label']]

Unnamed: 0,sentence,label
1356,The boy Bill and who I watched was vain.,0
725,The room was left angry.,0
3111,Cynthia devoured .,0
1784,Willy is taller than Bill by as much as I watc...,0
3028,I hunted the woods for game.,0


In [7]:
# Pull the sentence text and its label out as two parallel NumPy arrays.
sentences, labels = df['sentence'].values, df['label'].values

## Tokenization & Input Formatting

In [8]:
from transformers import BertTokenizer

# Load the tokenizer for the 'bert-base-uncased' checkpoint with
# lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [9]:
# Tokenize the first sentence once and reuse the tokens for the ID lookup.
first_sentence = sentences[0]
first_tokens = tokenizer.tokenize(first_sentence)

print('Original:', first_sentence)
print('Tokenized:', first_tokens)
print('Token IDs:', tokenizer.convert_tokens_to_ids(first_tokens))

Original: Our friends won't buy this analysis, let alone the next one we propose.
Tokenized: ['our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.']
Token IDs: [2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012]


In [10]:
# Encode every sentence to token IDs; add_special_tokens=True adds the
# [CLS] and [SEP] markers.
input_ids = [
    tokenizer.encode(sentence, add_special_tokens=True)
    for sentence in sentences
]

print('Original:', sentences[0])
print('Token IDs:', input_ids[0])

Original: Our friends won't buy this analysis, let alone the next one we propose.
Token IDs: [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102]


In [11]:
# Longest encoded sentence, in tokens (used to choose the padding length).
longest = max(len(ids) for ids in input_ids)
print('Max sentence length:', longest)

Max sentence length: 47


In [12]:
!pip install keras

Collecting keras
[?25l  Downloading https://files.pythonhosted.org/packages/ad/fd/6bfe87920d7f4fd475acd28500a42482b6b84479832bdc0fe9e589a60ceb/Keras-2.3.1-py2.py3-none-any.whl (377kB)
[K     |████████████████████████████████| 378kB 9.9MB/s eta 0:00:01
Installing collected packages: keras
Successfully installed keras-2.3.1


## Padding & Attention Masks

In [13]:
from keras.preprocessing.sequence import pad_sequences

# Fixed sequence length fed to the model.
MAX_LEN = 64

print('Padding/truncating all sentences to %d values...' % MAX_LEN)
print('Padding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Pad (with ID 0) or cut every sequence at the end so each row has MAX_LEN entries.
input_ids = pad_sequences(
    input_ids,
    maxlen=MAX_LEN,
    dtype='long',
    value=0,
    padding='post',
    truncating='post',
)

Padding/truncating all sentences to 64 values...
Padding token: "[PAD]", ID: 0


Using TensorFlow backend.


In [14]:
# Sanity check: after padding, the IDs form a (num_sentences, MAX_LEN) array.
input_ids.shape

(8551, 64)

In [15]:
# Padding positions (token ID 0) get mask 0; every other token gets mask 1.
# The comparison runs over the whole padded array at once and is converted
# back to a list of lists of plain ints.
attention_masks = (input_ids > 0).astype(int).tolist()

In [16]:
# Show the mask of the first sentence: 1s for its tokens, then 0s for padding.
print(attention_masks[0])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## Training & Validation Split

In [17]:
# Install scikit-learn, which provides train_test_split used in the next cell.
!pip install scikit-learn



In [18]:
from sklearn.model_selection import train_test_split

# Split IDs, masks and labels in a single call (90% train / 10% validation,
# fixed seed) so all three share the same row partition.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels,
    random_state=2018, test_size=0.1)

print(train_inputs.shape, validation_inputs.shape)
print(train_labels.shape, validation_labels.shape)
print(len(train_masks), len(validation_masks))

(7695, 64) (856, 64)
(7695,) (856,)
7695 856


## Converting to PyTorch Tensors

In [19]:
# Convert each split to a torch tensor, handling train/validation pairs together.
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)

In [20]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

# Bundle (input IDs, attention mask, label) triples for each split.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order; validation batches in order.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

train_dataloader = DataLoader(
    train_data, sampler=train_sampler, batch_size=batch_size)
validation_dataloader = DataLoader(
    validation_data, sampler=validation_sampler, batch_size=batch_size)

In [21]:
# Display the training DataLoader object to confirm it was created.
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f772d10c610>

In [22]:
# Display the validation DataLoader object to confirm it was created.
validation_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f772d2c3cd0>

## BertForSequenceClassification

In [38]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load the pretrained 'bert-base-uncased' weights with a 2-label
# classification head; attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

# Move the model to the same `device` the training loop sends its batches to,
# rather than hard-coding CUDA. `.to()` returns the model, so the cell still
# displays its architecture.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [39]:
# Materialise the (name, tensor) pairs so they can be sliced by position.
params = [named_param for named_param in model.named_parameters()]
len(params)

201

In [40]:
# The first five named parameters and their shapes.
print('Embedding Layer')
for name, weight in params[:5]:
    shape = tuple(weight.size())
    print(name, str(shape))

Embedding Layer
bert.embeddings.word_embeddings.weight (30522, 768)
bert.embeddings.position_embeddings.weight (512, 768)
bert.embeddings.token_type_embeddings.weight (2, 768)
bert.embeddings.LayerNorm.weight (768,)
bert.embeddings.LayerNorm.bias (768,)


In [41]:
# Named parameters 5 through 20 and their shapes.
print('First Transformer')
for name, weight in params[5:21]:
    shape = tuple(weight.size())
    print(name, str(shape))

First Transformer
bert.encoder.layer.0.attention.self.query.weight (768, 768)
bert.encoder.layer.0.attention.self.query.bias (768,)
bert.encoder.layer.0.attention.self.key.weight (768, 768)
bert.encoder.layer.0.attention.self.key.bias (768,)
bert.encoder.layer.0.attention.self.value.weight (768, 768)
bert.encoder.layer.0.attention.self.value.bias (768,)
bert.encoder.layer.0.attention.output.dense.weight (768, 768)
bert.encoder.layer.0.attention.output.dense.bias (768,)
bert.encoder.layer.0.attention.output.LayerNorm.weight (768,)
bert.encoder.layer.0.attention.output.LayerNorm.bias (768,)
bert.encoder.layer.0.intermediate.dense.weight (3072, 768)
bert.encoder.layer.0.intermediate.dense.bias (3072,)
bert.encoder.layer.0.output.dense.weight (768, 3072)
bert.encoder.layer.0.output.dense.bias (768,)
bert.encoder.layer.0.output.LayerNorm.weight (768,)
bert.encoder.layer.0.output.LayerNorm.bias (768,)


In [42]:
# The last four named parameters and their shapes.
print('Output Layer')
for name, weight in params[-4:]:
    shape = tuple(weight.size())
    print(name, str(shape))

Output Layer
bert.pooler.dense.weight (768, 768)
bert.pooler.dense.bias (768,)
classifier.weight (2, 768)
classifier.bias (2,)


## Optimizer & Learning Rate Scheduler

In [43]:
# AdamW over all model parameters: learning rate 2e-5, epsilon 1e-8.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

In [44]:
from transformers import get_linear_schedule_with_warmup

epochs = 4

# Total number of training batches across all epochs; passed to the
# scheduler as its number of training steps.
total_steps = epochs * len(train_dataloader)

# Linear schedule with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)
print(scheduler)

<torch.optim.lr_scheduler.LambdaLR object at 0x7f7724ba1090>


## Training Loop

In [45]:
import numpy as np

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array-like of per-class scores, one row per example.
        labels: NumPy array of integer class labels; flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

In [46]:
import time
import datetime

def format_time(elapsed):
    """Format a duration in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: Duration in seconds (int or float); rounded to whole seconds.

    Returns:
        ``str(datetime.timedelta)`` of the rounded duration, e.g. ``'0:00:39'``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [47]:
import random


# Seed every RNG in use (Python, NumPy, PyTorch CPU and all GPUs) to make the
# run as repeatable as possible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average training loss of each epoch.
loss_values = []

for epoch_i in range(0, epochs):
    # ---------------- Training ----------------
    print('')
    print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))
    print('Training')

    t0 = time.time()
    total_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):
        # Report progress every 40 batches (skipping batch 0).
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('Batch {:>5,} of {:>5,}.  Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Each batch is (input IDs, attention mask, labels).
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # When labels are supplied, the first output element is the loss.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)

    print('')
    print('  Average training loss: {0:.2f}'.format(avg_train_loss))
    print('  Training epoch took: {:}'.format(format_time(time.time() - t0)))

    # ---------------- Validation ----------------
    print('')
    print('Running Validation...')

    t0 = time.time()

    model.eval()

    # Accumulate accuracy weighted by batch size so that a smaller final batch
    # does not count as much as a full one.
    eval_accuracy = 0
    nb_eval_examples = 0

    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            # Without labels, the first output element is the logits.
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        logits = outputs[0].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        n_examples = len(label_ids)
        eval_accuracy += flat_accuracy(logits, label_ids) * n_examples
        nb_eval_examples += n_examples

    print('  Accuracy: {0:.2f}'.format(eval_accuracy / nb_eval_examples))
    print('  Validation took: {:}'.format(format_time(time.time() - t0)))

print('')
print('Training complete!')

Epoch 1 / 4
Training
Batch    40 of   241.  Elapsed: 0:00:06.
Batch    80 of   241.  Elapsed: 0:00:13.
Batch   120 of   241.  Elapsed: 0:00:19.
Batch   160 of   241.  Elapsed: 0:00:26.
Batch   200 of   241.  Elapsed: 0:00:32.
Batch   240 of   241.  Elapsed: 0:00:39.

  Average training loss: 0.50
  Training epoch took: 0:00:39

Running Validation...
  Accuracy: 0.80
  Validation took: 0:00:01
Epoch 2 / 4
Training
Batch    40 of   241.  Elapsed: 0:00:07.
Batch    80 of   241.  Elapsed: 0:00:13.
Batch   120 of   241.  Elapsed: 0:00:20.
Batch   160 of   241.  Elapsed: 0:00:27.
Batch   200 of   241.  Elapsed: 0:00:34.
Batch   240 of   241.  Elapsed: 0:00:40.

  Average training loss: 0.30
  Training epoch took: 0:00:40

Running Validation...
  Accuracy: 0.82
  Validation took: 0:00:01
Epoch 3 / 4
Training
Batch    40 of   241.  Elapsed: 0:00:07.
Batch    80 of   241.  Elapsed: 0:00:14.
Batch   120 of   241.  Elapsed: 0:00:20.
Batch   160 of   241.  Elapsed: 0:00:27.
Batch   200 of   241.  