<a href="https://colab.research.google.com/github/YoshiyukiKono/gen-ai-works/blob/main/Transformers_101/Transformers_101_56_72.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 56本目

In [None]:
!pip install transformers==4.7.0



# 57本目

In [None]:
# Toy corpus: every row is [first sentence, second sentence, label].
# The label is later fed to the model as `next_sentence_label`.
dataset = [
    ["What music do you like?", "I like Rock music.", 1],
    ["What is your favorite food?", "I like sushi the best", 1],
    ["What is your favorite color?", "I'm going to be a doctor", 0],
    ["What is your favorite song?", "Tokyo olympic game in 2020 was postponed", 0],
    ["Do you like watching TV shows?", "Yeah, I often watch it in my spare time", 1],
]

# 58本目

In [None]:
import torch
from torch import nn
from transformers import BertPreTrainedModel, BertConfig, BertModel, BertTokenizer, AdamW

class BertEnsembleForNextSentencePrediction(BertPreTrainedModel):
  """Two-encoder BERT classifier over a sentence pair presented in both orders.

  ``bert_model_1`` encodes the pair in question->answer order and
  ``bert_model_2`` encodes it in answer->question order. The pooled output of
  each encoder is concatenated and passed through one linear layer that emits
  two logits per example.
  """

  def __init__(self, config, *args, **kwargs):
    """Create both encoders and the classification head.

    Args:
      config: configuration object used to build both ``BertModel`` encoders.
      *args, **kwargs: accepted for signature compatibility; not used.
    """
    super().__init__(config)

    # QA BERT model
    self.bert_model_1 = BertModel(config)
    # AQ BERT model
    self.bert_model_2 = BertModel(config)

    # The head sees both pooled vectors side by side, hence 2 * hidden_size.
    self.cls = nn.Linear(2 * self.config.hidden_size, 2)
    self.init_weights()

  @staticmethod
  def _pick(pair, index):
    """Return ``pair[index]``, or None when the optional pair was not given."""
    return None if pair is None else pair[index]

  def forward(
          self,
          input_ids=None,
          attention_mask=None,
          token_type_ids=None,
          position_ids=None,
          head_mask=None,
          inputs_embeds=None,
          next_sentence_label=None,
  ):
    """Score a batch of sentence pairs.

    Args:
      input_ids: two-element sequence ``[qa_ids, aq_ids]``; element 0 goes to
        ``bert_model_1`` and element 1 to ``bert_model_2``.
      attention_mask: optional two-element sequence laid out like
        ``input_ids``. When omitted, each encoder falls back to its default.
      token_type_ids: optional two-element sequence laid out like
        ``input_ids``; forwarded to the matching encoder when supplied.
      position_ids, head_mask, inputs_embeds: accepted for signature
        compatibility; not used by this model.
      next_sentence_label: optional label tensor. Entries equal to -1 are
        ignored by the loss.

    Returns:
      ``logits`` when no label is given, otherwise ``(loss, logits)``.

    Raises:
      ValueError: if ``input_ids`` is None.
    """
    if input_ids is None:
      raise ValueError(
          "input_ids must be a pair [qa_input_ids, aq_input_ids], got None")

    pooled_outputs = []
    for index, encoder in enumerate((self.bert_model_1, self.bert_model_2)):
      encoded = encoder(
          input_ids[index],
          attention_mask=self._pick(attention_mask, index),
          token_type_ids=self._pick(token_type_ids, index),
      )
      # Index 1 of the BertModel output is the pooled sequence representation.
      pooled_outputs.append(encoded[1])

    features = torch.cat(pooled_outputs, dim=1)
    logits = self.cls(features)

    if next_sentence_label is None:
      return logits

    # crossentropyloss: https://pytorch.org/docs/stable/nn.html#crossentropyloss
    loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
    next_sentence_loss = loss_fct(logits.view(-1, 2), next_sentence_label.view(-1))
    return next_sentence_loss, logits

# 59本目

In [None]:
import torch
from transformers import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): the model is built from a bare BertConfig(), i.e. it is not
# loaded with from_pretrained(); only the tokenizer comes from a checkpoint.
config = BertConfig()
model = BertEnsembleForNextSentencePrediction(config)
model.to(device)
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
learning_rate = 1e-5

# Parameters whose names contain one of these fragments are exempt from weight
# decay. They must still be given to the optimizer: previously only the
# complementary group was registered, so biases and LayerNorm weights were
# never updated during training.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
  {
    # Uses the optimizer's default weight decay, exactly as before.
    "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
  },
  {
    "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
    "weight_decay": 0.0,
  },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# 60本目

In [None]:
def prepare_data(dataset, qa=True, text_tokenizer=None, max_length=128):
  """Tokenize sentence pairs into fixed-length id and attention-mask tensors.

  Args:
    dataset: iterable of ``[question, answer, label]`` rows.
    qa: when truthy each row is encoded as (question, answer); otherwise the
      two sentences are swapped and encoded as (answer, question).
    text_tokenizer: object exposing ``encode_plus``. Defaults to the
      notebook-level ``tokenizer`` looked up at call time, which is what the
      function always used before this parameter existed.
    max_length: length every encoded pair is padded or truncated to.

  Returns:
    ``(input_ids, attention_masks, labels)`` where the first two are tensors
    with one row per example and ``labels`` is a plain list of the last
    element of every row. An empty ``dataset`` yields two ``(0, max_length)``
    tensors and an empty list.
  """
  if text_tokenizer is None:
    # Resolved lazily so the function keeps working with the global tokenizer.
    text_tokenizer = tokenizer

  input_ids, attention_masks, labels = [], [], []
  for point in dataset:
    question, answer, label = point
    first, second = (question, answer) if qa else (answer, question)
    encoded_dict = text_tokenizer.encode_plus(
      first,  # Sentence 1 to encode.
      second,  # Sentence 2 to encode.
      add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
      max_length=max_length,
      padding="max_length",  # replaces the deprecated pad_to_max_length=True
      truncation=True,
      return_attention_mask=True,  # Construct attn. masks.
      return_tensors='pt',  # Return pytorch tensors.
    )
    input_ids.append(encoded_dict["input_ids"])
    attention_masks.append(encoded_dict["attention_mask"])
    labels.append(label)

  if not input_ids:
    # torch.cat rejects an empty list, so build the empty result explicitly.
    empty = torch.empty((0, max_length), dtype=torch.long)
    return empty, empty.clone(), labels

  return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0), labels

# 61本目

In [None]:
import numpy as np
from torch.utils.data import DataLoader, RandomSampler, Dataset, SequentialSampler


class QADataset(Dataset):
  """Map-style dataset over pre-tokenized sentence pairs.

  Indexing yields ``(input_ids, attention_mask, label)`` for one example, or
  ``(input_ids, attention_mask)`` when the dataset was built without labels.
  """

  def __init__(self, input_ids, attention_masks, labels=None):
    """Store the encoded inputs.

    Args:
      input_ids: array-like with one row of token ids per example.
      attention_masks: array-like with one row of mask values per example.
      labels: optional sequence of integer labels. The signature always
        advertised ``None`` as the default, but it used to crash inside
        ``torch.tensor(None)``; it now produces an unlabeled dataset.
    """
    self.input_ids = np.array(input_ids)
    self.attention_masks = np.array(attention_masks)
    if labels is None:
      self.labels = None
    else:
      self.labels = torch.tensor(labels, dtype=torch.long)

  def __getitem__(self, index):
    """Return the example at ``index`` (with its label when labels exist)."""
    if self.labels is None:
      return self.input_ids[index], self.attention_masks[index]
    return self.input_ids[index], self.attention_masks[index], self.labels[index]

  def __len__(self):
    """Number of examples, taken from the first axis of ``input_ids``."""
    return self.input_ids.shape[0]

# 62本目

In [None]:

# Encode every pair in question->answer order ...
input_ids_qa, attention_masks_qa, labels_qa = prepare_data(dataset, qa=True)
train_dataset_qa = QADataset(input_ids_qa, attention_masks_qa, labels_qa)

# ... and once more in answer->question order.
input_ids_aq, attention_masks_aq, labels_aq = prepare_data(dataset, qa=False)
train_dataset_aq = QADataset(input_ids_aq, attention_masks_aq, labels_aq)

# Both loaders use a sequential sampler and the same batch size, so batch k of
# one loader holds the same examples as batch k of the other.
dataloader_qa = DataLoader(
    train_dataset_qa,
    sampler=SequentialSampler(train_dataset_qa),
    batch_size=5,
)
dataloader_aq = DataLoader(
    train_dataset_aq,
    sampler=SequentialSampler(train_dataset_aq),
    batch_size=5,
)



# 63本目

In [None]:
epochs = 30
for epoch in range(epochs):
  # Step through the QA-ordered and AQ-ordered loaders in lockstep.
  for step, combined_batch in enumerate(zip(dataloader_qa, dataloader_aq)):
    # Training mode, so dropout is active.
    model.train()

    # Move every tensor of both batches to the selected device.
    batch_1, batch_2 = (
        tuple(tensor.to(device) for tensor in part) for part in combined_batch
    )

    # Each batch is (input_ids, attention_mask, labels); the labels are taken
    # from the first batch only.
    inputs = dict(
        input_ids=[batch_1[0], batch_2[0]],
        attention_mask=[batch_1[1], batch_2[1]],
        next_sentence_label=batch_1[2],
    )
    outputs = model(**inputs)

    # With labels supplied, the loss is the first element of the output.
    loss = outputs[0]
    # Backward pass.
    loss.backward()
    print(f"epoch:{epoch}, loss:{loss}")

    # Apply the parameter update, then clear the gradients for the next step.
    optimizer.step()
    model.zero_grad()


epoch:0, loss:0.7073392868041992
epoch:1, loss:0.8433060646057129
epoch:2, loss:0.5848076939582825
epoch:3, loss:0.6016868352890015
epoch:4, loss:0.6548543572425842
epoch:5, loss:0.5747431516647339
epoch:6, loss:0.5610212087631226
epoch:7, loss:0.43395811319351196
epoch:8, loss:0.5671840310096741
epoch:9, loss:0.36815375089645386
epoch:10, loss:0.31081685423851013
epoch:11, loss:0.4070233702659607
epoch:12, loss:0.3102841377258301
epoch:13, loss:0.25686076283454895
epoch:14, loss:0.21522541344165802
epoch:15, loss:0.291034996509552
epoch:16, loss:0.2017749845981598
epoch:17, loss:0.1630515158176422
epoch:18, loss:0.1411523073911667
epoch:19, loss:0.11451943218708038
epoch:20, loss:0.1354881078004837
epoch:21, loss:0.08884181082248688
epoch:22, loss:0.06300369650125504
epoch:23, loss:0.050922900438308716
epoch:24, loss:0.04049862548708916
epoch:25, loss:0.0456589050590992
epoch:26, loss:0.032207001000642776
epoch:27, loss:0.03241448476910591
epoch:28, loss:0.016544159501791
epoch:29, lo

# 64本目

In [None]:
# Re-encode the same pairs in both orders and wrap them for evaluation.
input_ids_qa, attention_masks_qa, labels_qa = prepare_data(dataset, qa=True)
test_dataset_qa = QADataset(input_ids_qa, attention_masks_qa, labels_qa)

input_ids_aq, attention_masks_aq, labels_aq = prepare_data(dataset, qa=False)
test_dataset_aq = QADataset(input_ids_aq, attention_masks_aq, labels_aq)

dataloader_qa = DataLoader(
    test_dataset_qa,
    sampler=SequentialSampler(test_dataset_qa),
    batch_size=16,
)
dataloader_aq = DataLoader(
    test_dataset_aq,
    sampler=SequentialSampler(test_dataset_aq),
    batch_size=16,
)

complete_outputs = []
complete_label_ids = []

# Iterate over the QA and AQ inputs in lockstep.
for step, combined_batch in enumerate(zip(dataloader_qa, dataloader_aq)):
  # Forward pass only, so dropout is switched off.
  model.eval()

  # Move both batches to the selected device.
  batch_1, batch_2 = (
      tuple(tensor.to(device) for tensor in part) for part in combined_batch
  )

  # Forward pass only, so no gradients are needed.
  with torch.no_grad():
    inputs = dict(
        input_ids=[batch_1[0], batch_2[0]],
        attention_mask=[batch_1[1], batch_2[1]],
        next_sentence_label=batch_1[2],
    )
    outputs = model(**inputs)
    tmp_eval_loss, logits = outputs[:2]
    logits = logits.detach().cpu().numpy()
    # Predicted class = index of the larger of the two logits.
    outputs = logits.argmax(axis=1)
    label_ids = inputs["next_sentence_label"].detach().cpu().numpy()
  complete_outputs.extend(outputs)
  complete_label_ids.extend(label_ids)

print(complete_outputs, complete_label_ids)

[1, 1, 0, 0, 1] [1, 1, 0, 0, 1]




# 65本目

In [None]:
# Use a dedicated name for the one-pair probe. Rebinding `dataset` here would
# clobber the training corpus, so the earlier cells could no longer be re-run
# with the data they were written for.
single_pair_dataset = [["What music do you like?", "I like Rock music.", 1]]
input_ids_qa, attention_masks_qa, labels_qa = prepare_data(single_pair_dataset)
test_dataset_qa = QADataset(input_ids_qa, attention_masks_qa, labels_qa)

input_ids_aq, attention_masks_aq, labels_aq = prepare_data(single_pair_dataset, qa=False)
test_dataset_aq = QADataset(input_ids_aq, attention_masks_aq, labels_aq)

dataloader_qa =  DataLoader(dataset=test_dataset_qa,
                            batch_size=16,
                            sampler=SequentialSampler(test_dataset_qa))
dataloader_aq =  DataLoader(dataset=test_dataset_aq,
                            batch_size=16,
                            sampler=SequentialSampler(test_dataset_aq))

complete_outputs, complete_label_ids = [], []

# Forward pass only, so dropout is switched off. Setting the mode once is
# enough; nothing inside the loop changes it.
model.eval()

# Iterate over the QA and AQ inputs in lockstep.
for step, combined_batch in enumerate(zip(dataloader_qa, dataloader_aq)):
  batch_1, batch_2 = combined_batch

  # Move both batches to the selected device.
  batch_1 = tuple(t.to(device) for t in batch_1)
  batch_2 = tuple(t.to(device) for t in batch_2)

  # Forward pass only, so no gradients are needed.
  with torch.no_grad():
    inputs = {
        "input_ids": [batch_1[0], batch_2[0]],
        "attention_mask": [batch_1[1], batch_2[1]],
        "next_sentence_label": batch_1[2]
    }
    outputs = model(**inputs)
    tmp_eval_loss, logits = outputs[:2]
    logits = logits.detach().cpu().numpy()
    # Predicted class = index of the larger of the two logits.
    outputs = np.argmax(logits, axis=1)
    label_ids = inputs["next_sentence_label"].detach().cpu().numpy()
  complete_outputs.extend(outputs)
  complete_label_ids.extend(label_ids)

print(complete_outputs, complete_label_ids)

[1] [1]




# おまけ

## イメージからテキスト取得

In [None]:
!apt install tesseract-ocr libtesseract-dev tesseract-ocr-jpn
!pip install pyocr

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libleptonica-dev tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  libleptonica-dev libtesseract-dev tesseract-ocr tesseract-ocr-eng
  tesseract-ocr-jpn tesseract-ocr-osd
0 upgraded, 6 newly installed, 0 to remove and 39 not upgraded.
Need to get 8,939 kB of archives.
After this operation, 32.0 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libleptonica-dev amd64 1.75.3-3 [1,308 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libtesseract-dev amd64 4.00~git2288-10f4998a-2 [1,447 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-eng all 4.00~git24-0e00fe6-1.2 [1,588 kB]
Get:4 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-osd all 4.00~git24-0e00fe6-1.2 [2,989 kB]
Get:5 http://archive

In [None]:
pip install opencv-python



In [None]:
pip install pytesseract



In [None]:
import cv2
import pytesseract
# Tell pytesseract which executable to launch. A bare command name is given,
# so it is presumably resolved through PATH (the binary is installed by the
# apt cell above).
pytesseract.pytesseract.tesseract_cmd = "tesseract"

In [None]:
img = cv2.imread("text.png")
if img is None:
  # cv2.imread reports a missing or unreadable file by returning None instead
  # of raising, which would otherwise surface as an obscure error inside
  # pytesseract.
  raise FileNotFoundError("text.png could not be read by cv2.imread")
text = pytesseract.image_to_string(img)
print(text)

Function
- TEXT -



## All in one

In [None]:
pip install pyforest

Collecting pyforest
  Downloading https://files.pythonhosted.org/packages/e5/ae/418f9bbcfb442bb6775f294451c97e134f84c8c4f47d75208419e4af7e13/pyforest-1.1.0.tar.gz
Building wheels for collected packages: pyforest
  Building wheel for pyforest (setup.py) ... [?25l[?25hdone
  Created wheel for pyforest: filename=pyforest-1.1.0-py2.py3-none-any.whl size=14606 sha256=f5ee632d0362d143de1572d7dfd1d23592e910f7e21c1c0b229e23b697004814
  Stored in directory: /root/.cache/pip/wheels/40/c6/da/43562aeea85b37f1a2b3d326f0f602f865000d2ada8a43625f
Successfully built pyforest
Installing collected packages: pyforest
Successfully installed pyforest-1.1.0


In [None]:
import pyforest
# Small two-row demo frame, built column by column.
df = pd.DataFrame({"Product": ["Apple", "Melon"], "Price": [100, 500]})
df.head()

Unnamed: 0,Product,Price
0,Apple,100
1,Melon,500


In [None]:
pip install pandas_profiling



In [None]:
!pip uninstall pandas_profiling

Uninstalling pandas-profiling-1.4.1:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/pandas_profiling-1.4.1.dist-info/*
    /usr/local/lib/python3.7/dist-packages/pandas_profiling/*
Proceed (y/n)? y
  Successfully uninstalled pandas-profiling-1.4.1


In [None]:
!pip install -U pandas_profiling

Collecting pandas_profiling
[?25l  Downloading https://files.pythonhosted.org/packages/3b/a3/34519d16e5ebe69bad30c5526deea2c3912634ced7f9b5e6e0bb9dbbd567/pandas_profiling-3.0.0-py2.py3-none-any.whl (248kB)
[K     |████████████████████████████████| 256kB 8.4MB/s 
[?25hCollecting PyYAML>=5.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636kB)
[K     |████████████████████████████████| 645kB 49.8MB/s 
[?25hCollecting pydantic>=1.8.1
[?25l  Downloading https://files.pythonhosted.org/packages/9f/f2/2d5425efe57f6c4e06cbe5e587c1fd16929dcf0eb90bd4d3d1e1c97d1151/pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1MB)
[K     |████████████████████████████████| 10.1MB 1.9MB/s 
[?25hCollecting phik>=0.11.1
[?25l  Downloading https://files.pythonhosted.org/packages/b7/ce/193e8ddf62d4be643b9b4b20e8e9c63b2f6a20f92778c0410c629f89bdaa/phik-0.11.2.tar.gz (1.1MB)


In [None]:
import pandas as pd
import pandas_profiling


# Keep the report object itself: to_file() only writes the HTML and returns
# None, so chaining it onto the constructor left this variable set to None.
# NOTE(review): the variable name is misspelled ("hourse"); kept unchanged so
# nothing that refers to it breaks.
hourse_price_report = pandas_profiling.ProfileReport(df)
hourse_price_report.to_file('report.html')

Summarize dataset:   0%|          | 0/15 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Update Dictionary

In [None]:
# Merge dic2 into dic1 in place; dic2 itself is left untouched.
dic1 = dict(Panda=3)
dic2 = dict(Lion=5)
dic1.update(dic2)
print(dic1)

{'Panda': 3, 'Lion': 5}


## Convert a list of strings to integers

In [None]:
list(map(int, ['1', '2', '3']))

[1, 2, 3]

## Scraping data from the website

In [None]:
import pandas as pd
# Fetches the page over the network and parses its HTML tables. The result is
# sliced like a list in the next cell.
data = pd.read_html("https://en.wikipedia.org/wiki/Bitcoin")

In [None]:
data[:3]

[                                              Bitcoin                                          Bitcoin.1
 0                             Prevailing bitcoin logo                            Prevailing bitcoin logo
 1                                       Denominations                                      Denominations
 2                                              Plural                                           bitcoins
 3                                              Symbol  ₿ (Unicode: .mw-parser-output .monospaced{font...
 4                                       Ticker symbol                                        BTC, XBT[b]
 5                                           Precision                                               10−8
 6                                            Subunits                                                NaN
 7   .mw-parser-output .nobold{font-weight:normal} ...                                       millibitcoin
 8                                         1⁄1

## Speed up your pandas

In [None]:
pip install modin[dask]

Collecting distributed<=2.19.0,>=2.12.0; extra == "dask"
[?25l  Downloading https://files.pythonhosted.org/packages/ed/5b/c489f407e1b48981c6c17b1e3458910882430f9fc4576b4bf9efef9140c6/distributed-2.19.0-py3-none-any.whl (643kB)
[K     |████████████████████████████████| 645kB 8.3MB/s 
Installing collected packages: distributed
  Found existing installation: distributed 1.25.3
    Uninstalling distributed-1.25.3:
      Successfully uninstalled distributed-1.25.3
Successfully installed distributed-2.19.0


In [None]:
import modin.pandas as pd

In [None]:
pd.DataFrame([10,30,30])


    from distributed import Client

    client = Client()



Unnamed: 0,0
0,10
1,30
2,30


# 66本目

In [None]:
!pip install transformers==4.8.1
!pip install sentencepiece

from transformers import BigBirdTokenizer, BigBirdForMaskedLM
import torch

# Load the tokenizer first, then the masked-LM model, from the same checkpoint.
tokenizer, model = (
    loader.from_pretrained('google/bigbird-roberta-base')
    for loader in (BigBirdTokenizer, BigBirdForMaskedLM)
)

Collecting transformers==4.8.1
  Downloading transformers-4.8.1-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 13.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 40.8 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 43.6 MB/s 
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.0.12 sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.8.1
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 12.4 MB/s 
[

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=845731.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=775.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1017.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512568261.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BigBirdForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
model

BigBirdForMaskedLM(
  (bert): BigBirdModel(
    (embeddings): BigBirdEmbeddings(
      (word_embeddings): Embedding(50358, 768, padding_idx=0)
      (position_embeddings): Embedding(4096, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BigBirdEncoder(
      (layer): ModuleList(
        (0): BigBirdLayer(
          (attention): BigBirdAttention(
            (self): BigBirdSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BigBirdSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-

In [None]:
# Masked prompts, and the full sentences used as labels for them (same order).
inputs = [
    "I like reading [MASK].",
    "I like driving a [MASK].",
    "The world is facing with a [MASK] [MASK] crisis. We are all suffering from infectious diseases.",
]
answers = [
    "I like reading book.",
    "I like driving a car.",
    "The world is facing with a pandemic crisis. We are all suffering from infectious diseases.",
]

In [None]:
# Pair each masked prompt with its reference sentence; zip stops at the
# shorter of the two lists.
sentence_pairs = list(zip(inputs, answers))

# Full tokenizer output for every prompt; only the token ids for every label.
encoded_inputs = [tokenizer(prompt, return_tensors="pt") for prompt, _ in sentence_pairs]
encoded_labels = [tokenizer(reference, return_tensors="pt")["input_ids"] for _, reference in sentence_pairs]

In [None]:
  # Scoring only: no gradients are needed.
  with torch.no_grad():
    # `encoded_input` rather than `input`, to avoid shadowing the builtin.
    for encoded_input, label in zip(encoded_inputs, encoded_labels):
      outputs = model(**encoded_input, labels=label)
      loss = outputs.loss
      logits = outputs.logits
      print(f"損失：{loss.item()}")

      # Skip the first AND last positions, exactly like the reference below
      # does with label[0][1:-1]. Previously the last position was kept for
      # the prediction only, which printed a spurious trailing token.
      predicted_ids = logits[0, 1:-1].argmax(-1)
      print(f"予測：{' '.join([tokenizer.decode(token_id) for token_id in predicted_ids])}")

      print(f"正解：{tokenizer.decode(label[0][1:-1])}")
      print('\n')

損失：11.18355655670166
予測：i like reading it . i
正解：I like reading book.


損失：8.30691146850586
予測：its like driving a car . a
正解：I like driving a car.


損失：4.2960591316223145
予測：the world is facing with a global health crisis . we are all suffering from infectious diseases . .
正解：The world is facing with a pandemic crisis. We are all suffering from infectious diseases.




# 70本目

In [None]:
!pip install transformers==4.6.1
!pip install sentencepiece
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
# Checkpoint shared by the tokenizer and the model.
model_name = 'google/pegasus-xsum'

# Prefer the GPU when one is visible to torch.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

Collecting transformers==4.6.1
  Downloading transformers-4.6.1-py3-none-any.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 7.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 58.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 15.6 MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading huggingface_hub-0.0.8-py3-none-any.whl (34 kB)
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.0.8 sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.6.1
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 7.4 MB/s 


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1912529.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=65.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=87.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3520083.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1362.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2275329241.0, style=ProgressStyle(descr…




# 71本目

In [None]:
# One-document batch for the Pegasus model loaded above.
# NOTE: the triple-quoted literal keeps its leading/trailing newlines and
# indentation, so that whitespace is part of what the tokenizer receives.
inputs = [
          """
          Pretraining large neural language models, such as BERT, has led to impressive gains on many natural language processing (NLP) tasks. However, most pretraining efforts focus on general domain corpora, such as newswire and Web. A prevailing assumption is that even domain-specific pretraining can benefit by starting from general-domain language models. Recent work shows that for domains with abundant unlabeled text, such as biomedicine, pretraining language models from scratch results in substantial gains over continual pretraining of general-domain language models.
          """
]
# Tokenize with truncation, pad to the longest item in the batch, and move the
# resulting tensors to `device`.
batch = tokenizer(inputs, truncation=True, padding='longest', return_tensors="pt").to(device)

# 72本目

In [None]:
# Generate output token ids for the batch. The variable is called
# `translated`, but the checkpoint loaded above is 'google/pegasus-xsum', so
# this is presumably a summary rather than a translation.
translated = model.generate(**batch)
# Turn the generated ids back into strings, dropping special tokens.
generated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
# Only one document was submitted, so show the first (and only) result.
print(generated_text[0])

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


Pretraining large neural language models can lead to substantial gains over continual pretraining of general-domain language models.
