### Hugging Face Transformers: Tasks ###
Use Hugging Face models for NLP tasks
- Text classification
- Named entity recognition
- Question - answering
- Translation
- Text generation
- Image generation, modification
- Audio generation
- Video generation

In [1]:
import os
import glob
import numpy as np
import pandas as pd

# PyTorch packages
import torch
import torch.nn as nn

# Hugging Face
from transformers import pipeline
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Appearance of the Notebook
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
np.set_printoptions(linewidth=110)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import nlptools as nlpt
print(f'NLP Tools package version:  {nlpt.__version__}')
print(f'PyTorch version:            {torch.__version__}')

NLP Tools package version:  0.0.1b0.post1.dev8+g5ecd10c.d20241227
PyTorch version:            2.6.0a0+df5bbc09d1.nv24.11


In [2]:
# GPU checks
is_cuda = torch.cuda.is_available()
print(f'CUDA available: {is_cuda}')
print(f'Number of GPUs found:  {torch.cuda.device_count()}')

if is_cuda:
    print(f'Current device ID:     {torch.cuda.current_device()}')
    print(f'GPU device name:       {torch.cuda.get_device_name(0)}')
    print(f'CUDNN version:         {torch.backends.cudnn.version()}')
    device_str = 'cuda:0'
    torch.cuda.empty_cache() 
else:
    device_str = 'cpu'
device = torch.device(device_str)
print()
print(f'Device for model training/inference: {device}')

CUDA available: True
Number of GPUs found:  1
Current device ID:     0
GPU device name:       NVIDIA GeForce RTX 3060 Laptop GPU
CUDNN version:         90501

Device for model training/inference: cuda:0


### Text classification ###

In [9]:
# Default model: distilbert-base-uncased-finetuned-sst-2-english 
classifier = pipeline(task='text-classification')
text1 = 'I love you'
text2 = 'I hate you'
outputs = classifier(text2)
display(pd.DataFrame(outputs))

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


Unnamed: 0,label,score
0,NEGATIVE,0.999113


In [22]:
# Use Hugging Face Instructions
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

inputs = tokenizer(text1, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

sm = nn.Softmax(dim=1)
softmax_logits = sm(logits).cpu().numpy()
print(softmax_logits)
print(model.config.id2label)

[[1.3436274e-04 9.9986565e-01]]
{0: 'NEGATIVE', 1: 'POSITIVE'}


In [34]:
# Use a different model with the pipeline
# tabularisai/multilingual-sentiment-analysis
# Number of Classes: 5 (Very Negative, Negative, Neutral, Positive, Very Positive)
model_name = 'tabularisai/multilingual-sentiment-analysis'
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline(model = model_name)
text1 = 'das ist sehr gut'
text2 = 'ich mag diesen Film gar nicht'
outputs = classifier(text2)
display(pd.DataFrame(outputs))

Device set to use cuda:0


Unnamed: 0,label,score
0,LABEL_0,0.487179


In [32]:
print(model.config)

DistilBertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "tabularisai/multilingual-sentiment-analysis",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.47.1",
  "vocab_size": 119547
}

