### Hugging Face Transformers: Text Classification ###

In [5]:
import os
import glob
import numpy as np
import pandas as pd

# PyTorch packages
import torch

# Hugging Face
from transformers import pipeline

# Appearance of the Notebook
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
np.set_printoptions(linewidth=110)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import nlptools as nlpt
print(f'NLP Tools package version:  {nlpt.__version__}')
print(f'PyTorch version:            {torch.__version__}')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
NLP Tools package version:  0.0.1b0.post1.dev8+g5ecd10c.d20241227
PyTorch version:            2.6.0a0+df5bbc09d1.nv24.11


In [3]:
# GPU checks
is_cuda = torch.cuda.is_available()
print(f'CUDA available: {is_cuda}')
print(f'Number of GPUs found:  {torch.cuda.device_count()}')

if is_cuda:
    print(f'Current device ID:     {torch.cuda.current_device()}')
    print(f'GPU device name:       {torch.cuda.get_device_name(0)}')
    print(f'CUDNN version:         {torch.backends.cudnn.version()}')
    device_str = 'cuda:0'
    torch.cuda.empty_cache() 
else:
    device_str = 'cpu'
device = torch.device(device_str)
print()
print(f'Device for model training/inference: {device}')

CUDA available: True
Number of GPUs found:  1
Current device ID:     0
GPU device name:       NVIDIA GeForce RTX 3060 Laptop GPU
CUDNN version:         90501

Device for model training/inference: cuda:0


In [8]:
classifier = pipeline('text-classification')

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cuda:0


In [13]:
text = 'i love you'
outputs = classifier(text)
display(pd.DataFrame(outputs))

Unnamed: 0,label,score
0,POSITIVE,0.999866


In [17]:
# Let's pick a random new model
model = 'lxyuan/distilbert-base-multilingual-cased-sentiments-student'
classifier = pipeline(model=model, top_k=None)

Device set to use cuda:0


In [19]:
classifier(text)

[[{'label': 'positive', 'score': 0.9754830598831177},
  {'label': 'neutral', 'score': 0.01646627113223076},
  {'label': 'negative', 'score': 0.008050617761909962}]]