In [28]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sn
from pprint import pprint
import textwrap

# Appearance of the Notebook
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# PyTorch
import torch

# Hugging Face 
from transformers import pipeline, set_seed

# Scikit-learn performance metrics
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, f1_score 
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import auc as calculate_auc

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import transformermodels as tm
print(f'Package version: {tm.__version__}')
print(f'PyTorch version: {torch.__version__}')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Package version: 0.0.post1.dev3+g8362a6c.d20240713
PyTorch version: 2.3.1+cu121


In [29]:
# GPU checks
is_cuda = torch.cuda.is_available()
print(f'CUDA available: {is_cuda}')
print(f'Number of GPUs found:  {torch.cuda.device_count()}')

if is_cuda:
    print(f'Current device ID:     {torch.cuda.current_device()}')
    print(f'GPU device name:       {torch.cuda.get_device_name(0)}')
    print(f'CUDNN version:         {torch.backends.cudnn.version()}')
    device_str = 'cuda:0'
    torch.cuda.empty_cache() 
else:
    device_str = 'cpu'
device = torch.device(device_str)
print()
print(f'Device for model training/inference: {device}')

CUDA available: True
Number of GPUs found:  1
Current device ID:     0
GPU device name:       NVIDIA GeForce RTX 3070 Laptop GPU
CUDNN version:         8902

Device for model training/inference: cuda:0


In [30]:
# Helper functions
def wrap(x):
    return textwrap.fill(x, replace_whitespace=False, fix_sentence_endings=True)

# Directories
data_dir = os.path.join(os.environ.get('HOME'), 'data', 'transformers')
textdata = os.path.join(data_dir, 'bbc_text_cls.csv')

In [31]:
classifier = pipeline('zero-shot-classification', device=device)

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [38]:
classifier('This is movie is ok', candidate_labels=['positive', 'kk', 'll', 'negative'])

{'sequence': 'This is movie is ok',
 'labels': ['positive', 'kk', 'll', 'negative'],
 'scores': [0.7718658447265625,
  0.15950368344783783,
  0.06362400203943253,
  0.0050064194947481155]}

In [39]:
# https://en.wikipedia.org/wiki/AMP-activated_protein_kinase
text = "Due to the presence of isoforms of its components, there are 12 " + \
  "versions of AMPK in mammals, each of which can have different tissue " + \
  "localizations, and different functions under different conditions. " + \
  "AMPK is regulated allosterically and by post-translational " + \
  "modification, which work together."
classifier(text, candidate_labels=["biology", "math", "geology"])

{'sequence': 'Due to the presence of isoforms of its components, there are 12 versions of AMPK in mammals, each of which can have different tissue localizations, and different functions under different conditions. AMPK is regulated allosterically and by post-translational modification, which work together.',
 'labels': ['biology', 'math', 'geology'],
 'scores': [0.8908601403236389, 0.06606564670801163, 0.043074119836091995]}