In [0]:
# Install the pretrained BERT package, pinned to the version this notebook was
# originally run with so a re-run gets the same API.
!pip3 install pytorch_pretrained_bert==0.6.1

Collecting pytorch_pretrained_bert
[?25l  Downloading https://files.pythonhosted.org/packages/5d/3c/d5fa084dd3a82ffc645aba78c417e6072ff48552e3301b1fa3bd711e03d4/pytorch_pretrained_bert-0.6.1-py3-none-any.whl (114kB)
[K    100% |████████████████████████████████| 122kB 4.8MB/s 
Installing collected packages: pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.6.1


In [0]:
'''
  code by Minho Ryu @bzantium
  
'''

# Sentiment Analysis

from copy import deepcopy

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from pytorch_pretrained_bert import BertModel, BertTokenizer
from torch import LongTensor as LT

# Toy sentiment dataset: six three-word sentences and one label per sentence.
sentences = [
    "i love you",
    "he loves me",
    "she likes baseball",
    "i hate you",
    "sorry for that",
    "this is awful",
]
targets = [1, 1, 1, 0, 0, 0]  # 1 is good, 0 is bad.

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Surround each tokenized sentence with the [CLS] / [SEP] markers and map the
# tokens to vocabulary ids.
# NOTE: no padding is applied here, so building the LongTensor below relies on
# every sentence yielding the same number of tokens.
inputs = [
    tokenizer.convert_tokens_to_ids(['[CLS]'] + tokenizer.tokenize(text) + ['[SEP]'])
    for text in sentences
]

input_batch = LT(inputs)
target_batch = LT(targets)

# Size of the feature vector handed to the classifier head, and number of classes.
input_dims = 768
output_dims = 2

BERT_model = BertModel.from_pretrained('bert-base-uncased')


class BERTEncoder(nn.Module):
    """Holds a private deep copy of the module-level ``BERT_model`` and exposes
    the second value it returns as the sentence feature."""

    def __init__(self):
        super().__init__()
        # Copy rather than share, so this module owns its own set of weights.
        self.encoder = deepcopy(BERT_model)

    def forward(self, x, token_id=None, mask=None):
        """Run the wrapped model on ``x``.

        Args:
            x: input passed positionally to the wrapped model.
            token_id: forwarded as ``token_type_ids`` (optional).
            mask: forwarded as ``attention_mask`` (optional).

        Returns:
            The second element of the wrapped model's two-element output
            (presumably BERT's pooled output -- confirm against the library docs).
        """
        bert_kwargs = dict(
            token_type_ids=token_id,
            attention_mask=mask,
            output_all_encoded_layers=False,
        )
        _sequence_output, pooled_feature = self.encoder(x, **bert_kwargs)
        return pooled_feature


class BERTClassifier(nn.Module):
    """Classification head: dropout (p=0.1) followed by one linear layer."""

    def __init__(self, input_dims, output_dims):
        """
        Args:
            input_dims: size of the incoming feature vector.
            output_dims: number of output scores (one per class).
        """
        super().__init__()
        self.dropout = nn.Dropout(p=0.1)
        self.classifier = nn.Linear(input_dims, output_dims)

    def forward(self, x):
        """Apply dropout to ``x`` and return the linear layer's output."""
        return self.classifier(self.dropout(x))

class BERTForSequenceClassifier(nn.Module):
    """BERT encoder followed by a classification head.

    The encoder is a ``BERTEncoder`` and the head is a ``BERTClassifier`` built
    with the given input/output sizes.
    """

    def __init__(self, input_dims, output_dims):
        """
        Args:
            input_dims: feature size produced by the encoder and consumed by the head.
            output_dims: number of classes scored by the head.
        """
        super(BERTForSequenceClassifier, self).__init__()
        self.encoder = BERTEncoder()
        self.classifier = BERTClassifier(input_dims, output_dims)

    def forward(self, x, token_id=None, mask=None):
        """Encode ``x`` and return the classifier's scores.

        Args:
            x: input batch handed to the encoder.
            token_id: optional segment ids, passed through to the encoder's
                ``token_id`` argument. Defaults to ``None`` (previous behaviour).
            mask: optional attention mask, passed through to the encoder's
                ``mask`` argument. Defaults to ``None`` (previous behaviour).

        Returns:
            The output of the classification head.
        """
        # Pass the optional inputs through so padded or two-segment batches can
        # be used; with the defaults this is identical to calling encoder(x).
        features = self.encoder(x, token_id=token_id, mask=mask)
        return self.classifier(features)
        
        
# Build the classifier, the loss, and an Adam optimizer over all of its parameters.
model = BERTForSequenceClassifier(input_dims, output_dims)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training: 30 full-batch updates; the loss is reported on every 5th epoch.
for epoch in range(30):
    optimizer.zero_grad()

    # output : [batch_size, num_classes]
    # target_batch : [batch_size], class indices as a LongTensor (not one-hot)
    output = model(input_batch)
    loss = criterion(output, target_batch)

    epoch_no = epoch + 1
    if epoch_no % 5 == 0:
        print('Epoch:', '%02d' % epoch_no, 'cost =', '{:.6f}'.format(loss))

    loss.backward()
    optimizer.step()
    
# Test
test_text = ['sorry hate you', 'you love me']

# Encode each test sentence as [CLS] + word pieces + [SEP], mapped to vocabulary ids.
# NOTE: as with the training batch, no padding is applied, so the sentences
# must all produce the same number of tokens.
tests = [
    tokenizer.convert_tokens_to_ids(['[CLS]'] + tokenizer.tokenize(sent) + ['[SEP]'])
    for sent in test_text
]

test_batch = LT(tests)

# Predict
model.eval()  # switch dropout to inference behaviour
with torch.no_grad():  # inference only: skip autograd bookkeeping
    # Index of the highest score per row is the predicted class.
    result = model(test_batch).argmax(dim=1)

for text, label in zip(test_text, result.tolist()):
    if label == 1:
        print("\'" + text + "\'", "is good :)")
    else:
        print("\'" + text + "\'", "is bad :(")

Epoch: 05 cost = 0.294268
Epoch: 10 cost = 0.034565
Epoch: 15 cost = 0.005079
Epoch: 20 cost = 0.001509
Epoch: 25 cost = 0.000890
Epoch: 30 cost = 0.000525
'sorry hate you' is bad :(
'you love me' is good :)


In [0]:
# Next Sentence Prediction

import torch.nn as nn
import torch.optim as optim
import numpy as np
from termcolor import colored
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification

from torch import LongTensor as LT


# Consecutive sentences of the story; each sentence is later paired with the
# one that follows it.
next_sentences = [
    "Once upon a time, there was a kind girl named Cinderella.",
    "All of the animals loved her, especially two mice named Gus and Jaq.",
    "They'd do anything for the girl they called Cinderelly.",
    "Cinderella lived with her stepmother and her two stepsisters, Anastasia and Drizella.",
    "They were very mean to Cinderella, making her work all day cleaning, sewing, and cooking.",
    "She tried her best to make them happy.",
    "Cinderella's stepmother, Lady Tremaine, was cold, cruel, and jealous of Cinderella’s charm and beauty.",
]

# Sentences used to build the "not next" examples.
not_next_sentences = [
    "One day, a messenger arrived with a special invitation.",
    "Lady Tremaine didn't want Cinderella to go to the ball.",
    "She wanted the Prince to meet Anastasia and Drizella.",
    "It was a bit old-fashioned, but Cinderella could make it beautiful!",
    "Cinderella was overjoyed when she saw the dress.",
    "They ripped the dress and pulled off the beads.",
    "Suddenly, her fairy godmother appeared.",
    "Lady Tremaine didn't stop them.",
    "Cinderella's dream of going to the ball was through.",
    "At the ball, Prince Charming couldn't take his eyes off Cinderella.",
    "The orchestra played, and the Prince began to dance with the wonderful girl whose name he still didn't know.",
    "When the clock struck midnight, the magic spell would wear off!",
]

# Six "next" labels followed by six "not next" labels.
targets = [1] * 6 + [0] * 6  # 1 is Next, 0 is Not Next.

import torch  # for torch.no_grad(); imported here so this cell stays self-contained

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

maxlen = 50


def encode_pair(sent_a, sent_b, max_len=maxlen):
    """Encode two sentences as one fixed-length input.

    The token layout is ``[CLS] A [SEP] B [SEP]`` followed by zero padding up
    to ``max_len``.

    Args:
        sent_a: first sentence (segment 0).
        sent_b: second sentence (segment 1).
        max_len: total length every encoded pair is padded to.

    Returns:
        A ``(sequence, attn_mask, segment_id)`` tuple of equal-length lists:
        token ids, 1/0 flags marking real tokens vs. padding, and 0/1 segment
        ids (padding positions are 0).

    Raises:
        ValueError: if the pair needs more than ``max_len`` tokens. Without this
            check the padding would silently be empty and the batch ragged.
    """
    token_a = ['[CLS]'] + tokenizer.tokenize(sent_a) + ['[SEP]']
    token_b = tokenizer.tokenize(sent_b) + ['[SEP]']
    seqlen = len(token_a) + len(token_b)
    if seqlen > max_len:
        raise ValueError(
            'sentence pair needs %d tokens but maxlen is %d' % (seqlen, max_len))
    padding = [0] * (max_len - seqlen)
    sequence = tokenizer.convert_tokens_to_ids(token_a + token_b) + padding
    attn_mask = [1] * seqlen + padding
    segment_id = [0] * len(token_a) + [1] * len(token_b) + padding
    return sequence, attn_mask, segment_id


def encode_pairs(pairs):
    """Encode an iterable of ``(sent_a, sent_b)`` pairs.

    Returns:
        Three parallel lists ``(sequences, attn_masks, segment_ids)`` with one
        entry per pair, each produced by ``encode_pair``.
    """
    sequences, masks, segments = [], [], []
    for sent_a, sent_b in pairs:
        sequence, attn_mask, segment_id = encode_pair(sent_a, sent_b)
        sequences.append(sequence)
        masks.append(attn_mask)
        segments.append(segment_id)
    return sequences, masks, segments


# "Next" examples: every sentence paired with the one that follows it.
next_pairs = list(zip(next_sentences[:-1], next_sentences[1:]))
# "Not next" examples: disjoint pairs (0, 1), (2, 3), ... of not_next_sentences.
not_next_pairs = list(zip(not_next_sentences[0::2], not_next_sentences[1::2]))

inputs, attn_masks, segment_ids = encode_pairs(next_pairs + not_next_pairs)
assert len(inputs) == len(targets), 'number of encoded pairs must match number of labels'

input_batch = LT(inputs)
target_batch = LT(targets)
attn_masks = LT(attn_masks)
segment_ids = LT(segment_ids)

input_dims = 768
output_dims = 2


model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = output_dims)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training
model.train()  # make training mode explicit before optimizing
for epoch in range(30):
    optimizer.zero_grad()
    output = model(input_batch, token_type_ids=segment_ids, attention_mask=attn_masks)

    # output : [batch_size, num_classes], target_batch : [batch_size] (LongTensor, not one-hot)
    loss = criterion(output, target_batch)
    if (epoch + 1) % 5 == 0:
        print('Epoch:', '%02d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()))

    loss.backward()
    optimizer.step()

# Test
test_text = ["Once upon a time, there was a kind girl named Cinderella.",
             "All of the animals loved her, especially two mice named Gus and Jaq.",
             "The orchestra played, and the Prince began to dance with the wonderful girl whose name he still didn't know."]

# Each test sentence is paired with the sentence that follows it in test_text.
test_pairs = list(zip(test_text[:-1], test_text[1:]))
tests, test_attn_masks, test_segment_ids = encode_pairs(test_pairs)

test_batch = LT(tests)
test_segment_ids = LT(test_segment_ids)
test_attn_masks = LT(test_attn_masks)

# Predict
model.eval()  # switch dropout to inference behaviour
with torch.no_grad():  # inference only: skip autograd bookkeeping
    # Index of the highest score per row is the predicted class.
    result = model(test_batch, token_type_ids=test_segment_ids, attention_mask=test_attn_masks).argmax(dim=1)

for (first, second), label in zip(test_pairs, result.tolist()):
    if label == 1:
        print("\'" + second + "\' is", colored('next sentence', 'blue'), "of \'" + first + "\'")
    else:
        print("\'" + second + "\' is", colored('not next sentence', 'red'), "of \'" + first + "\'")

Epoch: 05 cost = 0.281067
Epoch: 10 cost = 0.063344
Epoch: 15 cost = 0.013235
Epoch: 20 cost = 0.005955
Epoch: 25 cost = 0.002468
Epoch: 30 cost = 0.001741
'All of the animals loved her, especially two mice named Gus and Jaq.' is [34mnext sentence[0m of 'Once upon a time, there was a kind girl named Cinderella.'
'The orchestra played, and the Prince began to dance with the wonderful girl whose name he still didn't know.' is [31mnot next sentence[0m of 'All of the animals loved her, especially two mice named Gus and Jaq.'
