## HuggingFace Transformers

In [1]:
# Import the pipeline function from HuggingFace Transformers library
# Pipeline provides a simple API to use pre-trained models for various NLP tasks
from transformers import pipeline

In [3]:
# Create a sentiment analysis pipeline using the default pre-trained model
# This will automatically download and load the model for classifying text sentiment (positive/negative)
sentiment_classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


In [4]:
# Use the sentiment classifier to analyze the sentiment of the input text
# Returns a label (POSITIVE/NEGATIVE) and a confidence score
sentiment_classifier("I'm so excited to be learning about large language models")

[{'label': 'POSITIVE', 'score': 0.9997096657752991}]

In [5]:
# Create a Named Entity Recognition (NER) pipeline with a specific BERT model
# NER identifies and classifies named entities (people, organizations, locations) in text
ner = pipeline("ner", model = "dslim/bert-base-NER")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


In [6]:
# Apply NER to extract named entities from the text
# Will identify 'Anna' (person), 'New York City' (location), and 'Morgan Stanley' (organization)
ner("Her name is Anna and she works in New York City for Morgan Stanley")

[{'entity': 'B-PER',
  'score': 0.9954881,
  'index': 4,
  'word': 'Anna',
  'start': 12,
  'end': 16},
 {'entity': 'B-LOC',
  'score': 0.99960667,
  'index': 9,
  'word': 'New',
  'start': 34,
  'end': 37},
 {'entity': 'I-LOC',
  'score': 0.9993955,
  'index': 10,
  'word': 'York',
  'start': 38,
  'end': 42},
 {'entity': 'I-LOC',
  'score': 0.9995803,
  'index': 11,
  'word': 'City',
  'start': 43,
  'end': 47},
 {'entity': 'B-ORG',
  'score': 0.9957462,
  'index': 13,
  'word': 'Morgan',
  'start': 52,
  'end': 58},
 {'entity': 'I-ORG',
  'score': 0.9979346,
  'index': 14,
  'word': 'Stanley',
  'start': 59,
  'end': 66}]

In [7]:
# Create a zero-shot classification pipeline using Facebook's BART model
# Zero-shot classification can categorize text without being explicitly trained on those categories
zeroshot_classifier = pipeline("zero-shot-classification", model = "facebook/bart-large-mnli")

Device set to use mps:0


In [8]:
# Define the text to classify and the candidate labels
# The model will determine which label best matches the text's meaning
sequence_to_classify = "one day I will see the world"
candidate_labels = ['travel', 'cooking', 'dancing']

In [9]:
# Classify the sequence against the candidate labels
# Returns scores for each label showing how well they match the text
zeroshot_classifier(sequence_to_classify, candidate_labels)

{'sequence': 'one day I will see the world',
 'labels': ['travel', 'dancing', 'cooking'],
 'scores': [0.9938651323318481, 0.0032737720757722855, 0.002861043205484748]}

## Pre-trained Tokenizers

In [10]:
# Import AutoTokenizer to automatically load the appropriate tokenizer for any model
# Tokenizers convert text into numerical representations that models can process
from transformers import AutoTokenizer

In [11]:
# Specify the model name - using BERT base model (uncased means it doesn't distinguish uppercase/lowercase)
model = "bert-base-uncased"

In [12]:
# Load the pre-trained tokenizer for the specified BERT model
# This downloads the tokenizer configuration and vocabulary from HuggingFace
tokenizer = AutoTokenizer.from_pretrained(model)

In [13]:
# Define a sample sentence to demonstrate tokenization
sentence = "I'm so excited to be learning about large language models"

In [14]:
# Tokenize the sentence - converts text to token IDs with attention masks
# Returns a dictionary with 'input_ids' and 'attention_mask'
input_ids = tokenizer(sentence)
print(input_ids)

{'input_ids': [101, 1045, 1005, 1049, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653, 4275, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [15]:
# Break the sentence into individual tokens (subword units)
# BERT uses WordPiece tokenization which splits words into subwords when needed
tokens = tokenizer.tokenize(sentence)

In [16]:
# Display the list of tokens to see how the text was split
print(tokens)

['i', "'", 'm', 'so', 'excited', 'to', 'be', 'learning', 'about', 'large', 'language', 'models']


In [17]:
# Convert each token to its corresponding numerical ID from the vocabulary
# These IDs are what the model actually processes
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [18]:
# Display the numerical token IDs
print(token_ids)

[1045, 1005, 1049, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653, 4275]


In [19]:
# Decode the token IDs back to text to verify the tokenization process
# This should reconstruct something close to the original sentence
decoded_ids = tokenizer.decode(token_ids)
print(decoded_ids)

i ' m so excited to be learning about large language models


In [20]:
# Decode token ID 101 - this is the [CLS] (classification) special token in BERT
# [CLS] is added at the start of every sequence
tokenizer.decode(101)

'[CLS]'

In [21]:
# Decode token ID 102 - this is the [SEP] (separator) special token in BERT
# [SEP] is added at the end of sequences and between sentence pairs
tokenizer.decode(102)

'[SEP]'

In [22]:
# Try a different model - XLNet (cased version preserves uppercase/lowercase)
# Different models use different tokenization strategies and vocabularies
model2 = "xlnet-base-cased"

In [23]:
# Load the XLNet tokenizer to compare with BERT's tokenization
tokenizer2 = AutoTokenizer.from_pretrained(model2)

In [24]:
# Tokenize the same sentence with XLNet tokenizer
input_ids = tokenizer2(sentence)

In [25]:
# Display XLNet tokenization output to compare with BERT
print(input_ids)

{'input_ids': [35, 26, 98, 102, 5564, 22, 39, 1899, 75, 392, 1243, 2626, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [26]:
# Get individual tokens from XLNet tokenizer
# Notice how XLNet's SentencePiece tokenization differs from BERT's WordPiece
tokens = tokenizer2.tokenize(sentence)
print(tokens)

['▁I', "'", 'm', '▁so', '▁excited', '▁to', '▁be', '▁learning', '▁about', '▁large', '▁language', '▁models']


In [27]:
# Convert XLNet tokens to their numerical IDs
# XLNet uses a different vocabulary than BERT, so IDs will be different
token_ids = tokenizer2.convert_tokens_to_ids(tokens)
print(token_ids)

[35, 26, 98, 102, 5564, 22, 39, 1899, 75, 392, 1243, 2626]


In [28]:
# Decode a special token ID from XLNet's vocabulary
tokenizer2.decode(4)

'<sep>'

In [29]:
# Decode another special token ID from XLNet's vocabulary
tokenizer2.decode(3)

'<cls>'

## Huggingface and Pytorch/Tensorflow

In [30]:
# Import AutoModelForSequenceClassification to load pre-trained classification models
# Also import PyTorch for tensor operations and model inference
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [31]:
# Review the sentence and input IDs we'll use for classification
print(sentence)
print(input_ids)

I'm so excited to be learning about large language models
{'input_ids': [35, 26, 98, 102, 5564, 22, 39, 1899, 75, 392, 1243, 2626, 4, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [32]:
# Load a DistilBERT tokenizer that's fine-tuned for sentiment analysis on SST-2 dataset
# DistilBERT is a smaller, faster version of BERT with similar performance
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

In [33]:
# Tokenize and convert to PyTorch tensors (return_tensors="pt")
# PyTorch tensors are required for feeding data into PyTorch models
input_ids_pt = tokenizer(sentence, return_tensors ="pt")
print(input_ids_pt)

{'input_ids': tensor([[ 101, 1045, 1005, 1049, 2061, 7568, 2000, 2022, 4083, 2055, 2312, 2653,
         4275,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [34]:
# Load the pre-trained DistilBERT model for sequence classification
# This model is specifically fine-tuned for binary sentiment classification (positive/negative)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

In [35]:
# Run inference without gradient computation (saves memory and speeds up prediction)
# Get logits (raw prediction scores) from the model, then find the class with the highest score
with torch.no_grad():
    logits = model(**input_ids_pt).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'POSITIVE'

## Saving and loading models

In [36]:
# Define directory path where we'll save the model and tokenizer locally
model_directory = "my_saved_models"

In [37]:
# Save the tokenizer configuration and vocabulary to the specified directory
# This allows you to use the tokenizer offline without re-downloading
tokenizer.save_pretrained(model_directory)

('my_saved_models/tokenizer_config.json',
 'my_saved_models/special_tokens_map.json',
 'my_saved_models/vocab.txt',
 'my_saved_models/added_tokens.json',
 'my_saved_models/tokenizer.json')

In [38]:
# Save the model weights and configuration to the specified directory
# This creates a local copy of the model for offline use or deployment
model.save_pretrained(model_directory)

In [39]:
# Load the tokenizer from the local directory instead of HuggingFace hub
# This demonstrates how to load saved models for offline use
my_tokenizer = AutoTokenizer.from_pretrained(model_directory)

In [40]:
# Load the model from the local directory
# Now you can use this model without an internet connection
my_model = AutoModelForSequenceClassification.from_pretrained(model_directory)