In [1]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sn
from pprint import pprint
import textwrap

# Appearance of the Notebook
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# PyTorch
import torch

# Hugging Face 
from transformers import pipeline, set_seed, AutoTokenizer
from transformers import AutoModelForSequenceClassification

# Scikit-learn performance metrics
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, f1_score 
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import auc as calculate_auc

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import transformermodels as tm
print(f'Package version: {tm.__version__}')
print(f'PyTorch version: {torch.__version__}')

Package version: 0.0.post1.dev8+gd983d5a.d20240720
PyTorch version: 2.3.1+cu121


In [2]:
# GPU checks
is_cuda = torch.cuda.is_available()
print(f'CUDA available: {is_cuda}')
print(f'Number of GPUs found:  {torch.cuda.device_count()}')

if is_cuda:
    print(f'Current device ID:     {torch.cuda.current_device()}')
    print(f'GPU device name:       {torch.cuda.get_device_name(0)}')
    print(f'CUDNN version:         {torch.backends.cudnn.version()}')
    device_str = 'cuda:0'
    torch.cuda.empty_cache() 
else:
    device_str = 'cpu'
device = torch.device(device_str)
print()
print(f'Device for model training/inference: {device}')

CUDA available: True
Number of GPUs found:  1
Current device ID:     0
GPU device name:       NVIDIA GeForce GTX 1080 with Max-Q Design
CUDNN version:         8902

Device for model training/inference: cuda:0


In [3]:
# Helper functions and parameters
def wrap(x):
    return textwrap.fill(x, replace_whitespace=False, fix_sentence_endings=True)

# Directories
data_dir = os.path.join(os.environ.get('HOME'), 'data', 'transformers')

# Model checkpoint
checkpoint = 'bert-base-uncased'

In [4]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint, device=device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
display(tokenizer)

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [6]:
display(tokenizer('hello world'))

{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [7]:
input_str = 'hello world'
# Create tokens from input
tokens = tokenizer.tokenize(input_str)
print(tokens)
# Convert tokens to integet IDs
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
# We can also convert the ids back to a list of tokens
print(tokenizer.convert_ids_to_tokens(ids))
# We can also create the text from the ids
print(tokenizer.decode(ids))
# Encode
ids = tokenizer.encode(input_str)
print(ids)
print(tokenizer.decode(ids))

['hello', 'world']
[7592, 2088]
['hello', 'world']
hello world
[101, 7592, 2088, 102]
[CLS] hello world [SEP]


In [8]:
# Create model inputs
model_inputs = tokenizer(input_str)
print(model_inputs)

# Tokenize multiple sentences at the same time
data = ['I like cats.',
        'Do you like cats too?']
print()
display(tokenizer(data))

{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}



{'input_ids': [[101, 1045, 2066, 8870, 1012, 102], [101, 2079, 2017, 2066, 8870, 2205, 1029, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [9]:
# Instantiate the model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Create some outputs
model_inputs = tokenizer(input_str, return_tensors='pt')
display(model_inputs)

{'input_ids': tensor([[ 101, 7592, 2088,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [11]:
outputs = model(**model_inputs)
display(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.3404,  0.2037]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [12]:
# Create a new model with three outputs
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
outputs = model(**model_inputs)
display(outputs)
print()
print(outputs.get('logits').detach().cpu().numpy())

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.2144, -0.4391, -0.7003]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


[[ 0.21442679 -0.4390887  -0.7002971 ]]


In [14]:
# Tokenize multiple sentences at the same time
#data = ['I like cats.',
#        'Do you like cats too?']
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint, device=device)

data = ['I like cats.',
        'Do you like cats too?']

model_inputs = tokenizer(data, padding=True, truncation=True, return_tensors='pt')

In [15]:
display(model_inputs.get('input_ids'))
display(model_inputs.get('attention_mask'))

tensor([[ 101, 1045, 2066, 8870, 1012,  102,    0,    0],
        [ 101, 2079, 2017, 2066, 8870, 2205, 1029,  102]])

tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])

In [16]:
outputs = model(**model_inputs)
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0928, -0.4638, -0.7647],
        [ 0.0417, -0.4346, -0.7457]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
