In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import numpy as np
import json
from collections import Counter, defaultdict
from matplotlib import pyplot as plt

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import warnings
warnings.filterwarnings("ignore")


In [9]:
def print_encoding(model_inputs, indent=4):
    """
    Print the model inputs in a formatted way
    
    Args:
        model_inputs (dict): Dictionary containing model inputs
        indent (int, optional): Number of spaces for indentation. Defaults to 4.
    """
    indent_str = " " * indent
    print("{")
    for k, v in model_inputs.items():
        # Print each key-value pair with proper indentation
        print(indent_str + k + ":")
        print(indent_str + indent_str + str(v))
    print("}")

In [11]:
#Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('siebert/sentiment-roberta-large-english')
# Initialize the model
model = AutoModelForSequenceClassification.from_pretrained('siebert/sentiment-roberta-large-english')


In [22]:
input_text = "I am excited to use Hugging Face's transformers library! that's awesome"
tokenized_input = tokenizer(input_text, return_tensors = 'pt')
print_encoding(tokenized_input)
output = model(**tokenized_input)
print(output)

labels = ['Negative','Positive']
pred = torch.argmax(output.logits, dim = -1)

print('tokenized input: ', tokenized_input)

print(f'Predicted label: {labels[pred]}')



{
    input_ids:
        tensor([[    0,   100,   524,  2283,     7,   304, 30581,  3923, 12346,    18,
          7891,   268,  5560,   328,    14,    18,  6344,     2]])
    attention_mask:
        tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
}
SequenceClassifierOutput(loss=None, logits=tensor([[-3.7820,  2.9508]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tokenized input:  {'input_ids': tensor([[    0,   100,   524,  2283,     7,   304, 30581,  3923, 12346,    18,
          7891,   268,  5560,   328,    14,    18,  6344,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Predicted label: Positive


In [24]:
name = "distilbert/distilbert-base-cased"


# name = "user/name" when loading from
# name = local_path when using save_pretrained() method

tokenizer = AutoTokenizer.from_pretrained(name) # convenient! Defaults to Fast
print(tokenizer)

DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


In [32]:
# This is how you call the tokenizer
input_str = "I love to play cricket"
tokenized_inputs = tokenizer(input_str) # https://huggingface.co/learn/nlp-course/en/chapter6/6


print("Vanilla Tokenization")
print_encoding(tokenized_inputs)
print()

# Two ways to access:
print(tokenized_inputs.input_ids)
print(tokenized_inputs["input_ids"])

Vanilla Tokenization
{
    input_ids:
        [101, 146, 1567, 1106, 1505, 5428, 102]
    attention_mask:
        [1, 1, 1, 1, 1, 1, 1]
}

[101, 146, 1567, 1106, 1505, 5428, 102]
[101, 146, 1567, 1106, 1505, 5428, 102]


In [33]:
cls = tokenizer.cls_token_id
sep = tokenizer.sep_token_id
pad = tokenizer.pad_token_id
print(cls, sep, pad)

#Tokenizer happen in a few steps
input_token = tokenizer.tokenize(input_str)
print(input_token)
input_ids = tokenizer.convert_tokens_to_ids(input_token)
print(input_ids)
input_ids_special_tokens = [cls] + input_ids + [sep]
print(input_ids_special_tokens)

decoded_str = tokenizer.decode(input_ids_special_tokens, skip_special_tokens=True)
print(decoded_str)

print("start:                ", input_str)
print("tokenize:             ", input_token)
print("convert_tokens_to_ids:", input_ids)
print("add special tokens:   ", input_ids_special_tokens)
print("--------")
print("decode:               ", decoded_str)

# NOTE that these steps don't create the attention mask or add the special characters

101 102 0
['I', 'love', 'to', 'play', 'cricket']
[146, 1567, 1106, 1505, 5428]
[101, 146, 1567, 1106, 1505, 5428, 102]
I love to play cricket
start:                 I love to play cricket
tokenize:              ['I', 'love', 'to', 'play', 'cricket']
convert_tokens_to_ids: [146, 1567, 1106, 1505, 5428]
add special tokens:    [101, 146, 1567, 1106, 1505, 5428, 102]
--------
decode:                I love to play cricket
