In [2]:
from transformers import BertTokenizer

In [3]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [4]:
inputs = tokenizer("I love cats.")
inputs

{'input_ids': [101, 1045, 2293, 8870, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [5]:
tokenizer.decode(inputs["input_ids"])

'[CLS] i love cats. [SEP]'

In [6]:
inputs = tokenizer("I love cats.", "He don't like dogs.")
inputs

{'input_ids': [101, 1045, 2293, 8870, 1012, 102, 2002, 2123, 1005, 1056, 2066, 6077, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
inputs = tokenizer(["I love cats.", "He don't like dogs."])
inputs

{'input_ids': [[101, 1045, 2293, 8870, 1012, 102], [101, 2002, 2123, 1005, 1056, 2066, 6077, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [9]:
inputs = tokenizer(
    ["I love cats.", "He don't like dogs."], 
    ["It's Greek to me.", "I can't understand it."]
)
inputs

{'input_ids': [[101, 1045, 2293, 8870, 1012, 102, 2009, 1005, 1055, 3306, 2000, 2033, 1012, 102], [101, 2002, 2123, 1005, 1056, 2066, 6077, 1012, 102, 1045, 2064, 1005, 1056, 3305, 2009, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [10]:
tokenizer.decode(inputs['input_ids'][0])

"[CLS] i love cats. [SEP] it's greek to me. [SEP]"

In [11]:
inputs = tokenizer(
    ["I love cats.", "He don't like dogs."], 
    ["It's Greek to me.", "I can't understand it."],
    padding=True,
)
inputs

{'input_ids': [[101, 1045, 2293, 8870, 1012, 102, 2009, 1005, 1055, 3306, 2000, 2033, 1012, 102, 0, 0, 0], [101, 2002, 2123, 1005, 1056, 2066, 6077, 1012, 102, 1045, 2064, 1005, 1056, 3305, 2009, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [12]:
inputs = tokenizer(
    ["I love cats.", "He don't like dogs."], 
    ["It's Greek to me.", "I can't understand it."],
    padding=True,
    return_tensors="pt",
)
inputs

{'input_ids': tensor([[ 101, 1045, 2293, 8870, 1012,  102, 2009, 1005, 1055, 3306, 2000, 2033,
         1012,  102,    0,    0,    0],
        [ 101, 2002, 2123, 1005, 1056, 2066, 6077, 1012,  102, 1045, 2064, 1005,
         1056, 3305, 2009, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [23]:
from transformers import BertModel
model = BertModel.from_pretrained('bert-base-uncased')

In [24]:
inputs = tokenizer("It's Greek to me.", return_tensors='pt', padding=True)
inputs.input_ids

tensor([[ 101, 2009, 1005, 1055, 3306, 2000, 2033, 1012,  102]])

In [29]:
outputs = model(**inputs)
outputs

(tensor([[[ 0.0156,  0.2564, -0.3399,  ..., -0.2428,  0.2830,  0.7544],
          [ 0.4662,  0.1092, -0.7984,  ..., -0.2918,  0.2954,  0.4456],
          [ 0.5022, -0.1174, -0.0162,  ..., -0.3971,  0.0185,  0.2980],
          ...,
          [-0.0512,  0.5962, -0.2460,  ...,  0.0019,  0.0570,  1.0659],
          [ 0.7567,  0.0346, -0.2691,  ...,  0.1350, -0.5766, -0.4198],
          [ 0.8299,  0.1093,  0.0729,  ...,  0.1760, -0.7133, -0.2550]]],
        grad_fn=<NativeLayerNormBackward>),
 tensor([[-0.9133, -0.3905, -0.7854,  0.8402,  0.6667, -0.1750,  0.8700,  0.2264,
          -0.7219, -1.0000, -0.3437,  0.9403,  0.9784,  0.3174,  0.9509, -0.6883,
          -0.2414, -0.5943,  0.2957, -0.6338,  0.6895,  0.9999,  0.1177,  0.2967,
           0.4354,  0.9831, -0.7207,  0.9372,  0.9545,  0.7353, -0.7154,  0.1956,
          -0.9896, -0.0025, -0.8013, -0.9888,  0.3763, -0.6817,  0.1052,  0.1373,
          -0.9130,  0.2164,  1.0000, -0.5338,  0.1740, -0.2897, -1.0000,  0.3349,
          -0.90