In [1]:
import transformers
import pandas as pd
import tensorflow as tf
from huggingface_hub import notebook_login
import os
transformers.logging.set_verbosity_error()

2022-04-24 08:52:28.271273: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-24 08:52:28.271292: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [None]:
notebook_login()

<h1 style="text-align:center;">A Whirlwind Tour of the 🤗 Hugging Face Ecosystem</h1>

<br><br><br><br>

<h4 style="text-align:center;"><b>Christopher Akiki</b></h4>

<br><br>
<p style="text-align:center;font-style: italic">Figures in these slides reproduced under the Apache License from <a href="https://www.oreilly.com/library/view/natural-language-processing/9781098103231/">Natural Language Processing with Transformers</a> published by O'Reilly Media, Inc.</p>

<center><a href="https://huggingface.co"><img src="images/chapter01_hf-ecosystem.png" width=800></a></center>

<center><img src="images/chapter02_hf-libraries.png" width=1800></center>

<h1 style="text-align:center;">🤗 Pipelines</h1>

<br><br>

In [None]:
from transformers import pipeline
from transformers.pipelines import get_supported_tasks

In [None]:
print(get_supported_tasks())


<center><img src="images/gewandhaus_review.png" width=900></center>

In [None]:
# Sample review fed to every pipeline demo below (classification, NER, QA, translation).
# The wording, spelling mistakes included, is presumably copied verbatim from the
# review screenshot -- TODO confirm before "correcting" it.
# The trailing backslash joins the first two lines; the remaining newlines and the
# leading indentation of the following lines are part of the string.
text = """One of the best orchestra in the world. I came to Leipzig\
            mainly to have one experience with Gewanhaus Leipzig Orchestra. 
            Under the baton of Maestro Andris Nelsons, Bruckner symphony #8 was so affection. 
            The acustic and layout of the concert hall is nice."""

# Sentiment Analysis

In [None]:
p = pipeline("text-classification", model='distilbert-base-uncased-finetuned-sst-2-english')

In [None]:
outputs = p(text)
outputs[0]

# Named-Entity Recognition

In [None]:
p = pipeline("ner", aggregation_strategy="simple", model="dbmdz/bert-large-cased-finetuned-conll03-english")

In [None]:
outputs = p(text)
pd.DataFrame(outputs)

# Question Answering

In [None]:
p = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

In [None]:
question = "Why did I visit Leipzig?"
outputs = p(question=question, context=text)
outputs

In [None]:
question = "What music did the orchestra play?"
outputs = p(question=question, context=text)
outputs

# Translation

In [None]:
p = pipeline("translation_en_to_de", model="Helsinki-NLP/opus-mt-en-de")

In [None]:
outputs = p(text, clean_up_tokenization_spaces=True)
print(outputs[0]['translation_text'])

<h1 style="text-align:center;">🤗 Tokenizers</h1>

<center><img src="images/tokenization_pipeline.svg" width=1200></center>

In [24]:
import nltk
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /home/jovyan/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [3]:
print(nltk.corpus.gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [4]:
moby_dick_raw = nltk.corpus.gutenberg.raw('melville-moby_dick.txt')

In [5]:
# Size of the raw corpus once UTF-8 encoded, reported in mebibytes.
size = len(moby_dick_raw.encode("utf-8"))
print(f"{size / (1024 * 1024):.2f} MiB")

1.19 MiB


In [6]:
from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer

In [7]:
# Special tokens, in the order they are handed to the trainer.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
unk_token, pad_token, cls_token, sep_token, mask_token = special_tokens

# Target vocabulary size for the WordPiece trainer configured below.
vocab_size = 6000

# WordPiece Tokenizer

In [8]:
custom_tokenizer = Tokenizer(WordPiece(unk_token=unk_token))

# Sequence of Normalizers

In [9]:
# Normalizers run in the order listed: NFKD, then Lowercase, then StripAccents.
custom_normalizer = normalizers.Sequence([
    normalizers.NFKD(),
    normalizers.Lowercase(),
    normalizers.StripAccents(),
])

# Sequence of Pretokenizers

In [10]:
# Pre-tokenizers run in the order listed: WhitespaceSplit first, then Punctuation.
custom_pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.WhitespaceSplit(),
    pre_tokenizers.Punctuation(),
])

# WordPiece Trainer

In [11]:
# Trainer configured with the vocabulary size and special tokens defined above;
# the progress bar is switched off.
custom_trainer = WordPieceTrainer(
    special_tokens=special_tokens,
    vocab_size=vocab_size,
    show_progress=False,
)

In [12]:
custom_tokenizer.normalizer = custom_normalizer
custom_tokenizer.pre_tokenizer = custom_pre_tokenizer

In [13]:
# The whole novel is handed over as a single-element list, i.e. the iterator
# yields exactly one (very long) string.
custom_tokenizer.train_from_iterator([moby_dick_raw], trainer=custom_trainer)

In [14]:
custom_tokenizer.get_vocab_size()

6000

In [None]:
encoding = custom_tokenizer.encode("Let us test this tokenizer")
print(encoding.tokens)

In [15]:
# Resolve the vocabulary ids of the two tokens referenced by the templates.
cls_token_id, sep_token_id = (
    custom_tokenizer.token_to_id(token) for token in (cls_token, sep_token)
)

# Template entries read "<piece>:<type id>": everything up to and including the
# first separator is tagged 0, the second sequence and its separator are tagged 1.
custom_post_processor = processors.TemplateProcessing(
    single=f"{cls_token}:0 $A:0 {sep_token}:0",
    pair=f"{cls_token}:0 $A:0 {sep_token}:0 $B:1 {sep_token}:1",
    special_tokens=list(zip((cls_token, sep_token), (cls_token_id, sep_token_id))),
)
custom_tokenizer.post_processor = custom_post_processor

In [None]:
encoding = custom_tokenizer.encode("Let us test this tokenizer")
print(encoding.tokens)

In [None]:
encoding = custom_tokenizer.encode("This is the first sentence", "This is sentence number 2")
print(encoding.tokens)
print(encoding.ids)
print(encoding.type_ids)

# Using our custom tokenizer with a model

In [16]:
from transformers import PreTrainedTokenizerFast

model_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=custom_tokenizer,
    unk_token=unk_token,
    pad_token=pad_token,
    cls_token=cls_token,
    sep_token=sep_token,
    mask_token=mask_token,
)

In [None]:
text_batch = ["To be or not to be.", "It was the best of times.", "Call me Ishmael."]

In [None]:
model_tokenizer(text_batch, padding=True, return_tensors="tf")

In [2]:
# model_tokenizer.push_to_hub(repo_path_or_name="my_custom_tokenizer", 
#                             use_auth_token=True, use_temp_dir=True,
#                            commit_message="6_000")

<h1 style="text-align:center;">🤗 Datasets</h1>

In [8]:
from datasets import list_datasets, load_dataset

In [4]:
all_datasets = list_datasets()

In [6]:
len(all_datasets)

4393

In [7]:
[d for d in all_datasets if "emotion" in d]

['emotion',
 'go_emotions',
 'Mansooreh/sharif-emotional-speech-dataset',
 'Pyjay/emotion_nl',
 'SetFit/go_emotions',
 'SetFit/emotion',
 'jakeazcona/short-text-labeled-emotion-classification',
 'jakeazcona/short-text-multi-labeled-emotion-classification',
 'mrm8488/goemotions',
 'pariajm/sharif_emotional_speech_dataset',
 'rubrix/go_emotions_training',
 'rubrix/go_emotions_multi-label',
 'lewtun/autoevaluate__emotion',
 'stepp1/tweet_emotion_intensity',
 'lewtun/autoevaluate__go_emotions']

In [10]:
emotions = load_dataset("emotion")

Using custom data configuration default
Reusing dataset emotion (/home/jovyan/work/cache/HF/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [87]:
emotions['train'].info.description

'Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. For more detailed information please refer to the paper.\n'

In [89]:
print(emotions['train'].citation)

@inproceedings{saravia-etal-2018-carer,
    title = "{CARER}: Contextualized Affect Representations for Emotion Recognition",
    author = "Saravia, Elvis  and
      Liu, Hsien-Chi Toby  and
      Huang, Yen-Hao  and
      Wu, Junlin  and
      Chen, Yi-Shin",
    booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing",
    month = oct # "-" # nov,
    year = "2018",
    address = "Brussels, Belgium",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/D18-1404",
    doi = "10.18653/v1/D18-1404",
    pages = "3687--3697",
    abstract = "Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich struc

In [12]:
train_ds = emotions["train"]
train_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 16000
})

In [34]:
train_ds.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=6, names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}

In [37]:
train_ds.features['label'].int2str(0)

'sadness'

In [13]:
len(train_ds)

16000

In [14]:
train_ds[0]

{'text': 'i didnt feel humiliated', 'label': 0}

In [15]:
train_ds[:10]

{'text': ['i didnt feel humiliated',
  'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
  'im grabbing a minute to post i feel greedy wrong',
  'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
  'i am feeling grouchy',
  'ive been feeling a little burdened lately wasnt sure why that was',
  'ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny',
  'i feel as confused about life as a teenager or as jaded as a year old man',
  'i have been with petronas for years i feel that petronas has performed well and made a huge profit',
  'i feel romantic too'],
 'label': [0, 0, 3, 2, 3, 0, 5, 4, 1, 2]}

In [16]:
train_ds[:10]['text']

['i didnt feel humiliated',
 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
 'im grabbing a minute to post i feel greedy wrong',
 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
 'i am feeling grouchy',
 'ive been feeling a little burdened lately wasnt sure why that was',
 'ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny',
 'i feel as confused about life as a teenager or as jaded as a year old man',
 'i have been with petronas for years i feel that petronas has performed well and made a huge profit',
 'i feel romantic too']

In [58]:
def compute_tweet_length(row):
    return {"tweet_length": len(row['text'].split())}

In [95]:
# Rebinds train_ds to the mapped dataset, which now carries a 'tweet_length' column.
# load_from_cache_file=False presumably forces recomputation instead of reusing a
# cached result -- confirm against the datasets documentation.
train_ds = train_ds.map(compute_tweet_length, load_from_cache_file=False)

  0%|          | 0/16000 [00:00<?, ?ex/s]

In [64]:
train_ds.filter(lambda row: row['tweet_length'] < 25)

  0%|          | 0/16 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'label', 'tweet_length'],
    num_rows: 11668
})

In [66]:
train_ds.sort("tweet_length")[:5]

Loading cached sorted indices for dataset at /home/jovyan/work/cache/HF/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705/cache-1ab627e5522b8733.arrow


{'text': ['no response', 'earth crake', 'one day', 'one night', 'at school'],
 'label': [3, 4, 0, 1, 3],
 'tweet_length': [2, 2, 2, 2, 2]}

In [69]:
def batched_compute_tweet_length(batch_of_rows):
    """Count the whitespace-separated tokens of every text in a batch.

    Args:
        batch_of_rows: Mapping whose ``'text'`` entry is an iterable of strings.

    Returns:
        dict: ``{"tweet_length": [...]}`` with one count per input text, in
        the same order as the input.
    """
    lengths = []
    for text in batch_of_rows['text']:
        lengths.append(len(text.split()))
    return {"tweet_length": lengths}

In [72]:
train_ds.map(batched_compute_tweet_length, batched=True, batch_size=2000, load_from_cache_file=False)

  0%|          | 0/8 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'label', 'tweet_length'],
    num_rows: 16000
})

In [80]:
%time train_ds.map(compute_tweet_length, load_from_cache_file=False)

  0%|          | 0/16000 [00:00<?, ?ex/s]

CPU times: user 1.01 s, sys: 134 ms, total: 1.15 s
Wall time: 998 ms


Dataset({
    features: ['text', 'label', 'tweet_length'],
    num_rows: 16000
})

In [79]:
%time train_ds.map(batched_compute_tweet_length, batched=True, batch_size=2000, load_from_cache_file=False, )

  0%|          | 0/8 [00:00<?, ?ba/s]

CPU times: user 109 ms, sys: 316 µs, total: 109 ms
Wall time: 104 ms


Dataset({
    features: ['text', 'label', 'tweet_length'],
    num_rows: 16000
})

In [91]:
train_ds.column_names

['text', 'label', 'tweet_length']

In [96]:
train_ds = train_ds.remove_columns('tweet_length')
train_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 16000
})

<table><thead><tr><th align="center">Data format</th> <th align="center">Loading script</th> <th align="center">Example</th></tr></thead> <tbody><tr><td align="center">CSV &amp; TSV</td> <td align="center"><code>csv</code></td> <td align="center"><code>load_dataset("csv", data_files="my_file.csv")</code></td></tr> <tr><td align="center">Text files</td> <td align="center"><code>text</code></td> <td align="center"><code>load_dataset("text", data_files="my_file.txt")</code></td></tr> <tr><td align="center">JSON &amp; JSON Lines</td> <td align="center"><code>json</code></td> <td align="center"><code>load_dataset("json", data_files="my_file.jsonl")</code></td></tr> <tr><td align="center">Pickled DataFrames</td> <td align="center"><code>pandas</code></td> <td align="center"><code>load_dataset("pandas", data_files="my_dataframe.pkl")</code></td></tr></tbody></table>

In [17]:
import pandas as pd

In [38]:
# Switch the DatasetDict to pandas output so that slicing a split yields a DataFrame.
# This changes `emotions` in place; the reset_format() call further down undoes it.
emotions.set_format(type="pandas")
emotions_df = emotions['train'][:]

In [44]:
# Translate each integer label into its class name using the label feature's int2str.
emotions_df['label_name'] = emotions_df['label'].apply(train_ds.features['label'].int2str)

In [45]:
emotions_df

Unnamed: 0,text,label,label_name
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned...,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,anger
...,...,...,...
15995,i just had a very brief time in the beanbag an...,0,sadness
15996,i am now turning and i feel pathetic that i am...,0,sadness
15997,i feel strong and good overall,1,joy
15998,i feel like this was such a rude comment and i...,3,anger


In [55]:
emotions_df['label_name'].value_counts()

joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: label_name, dtype: int64

In [51]:
emotions_df['text'].str.split().apply(len).describe()

count    16000.000000
mean        19.166313
std         10.986905
min          2.000000
25%         11.000000
50%         17.000000
75%         25.000000
max         66.000000
Name: text, dtype: float64

In [56]:
emotions.reset_format()

In [57]:
train_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 16000
})

<h1 style="text-align:center;">🤗 Transformers</h1>

<center><img src="images/chapter04_bert-body-head.png" width=700></center>

In [97]:
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

<center><img src="images/chapter02_encoder-fine-tuning.png" width=700></center>

<center><img src="images/chapter02_encoder-feature-based.png" width=700></center>

<h1 style="text-align:center;">(Re)sources</h1>

- https://github.com/nlp-with-transformers/notebooks

- https://github.com/huggingface/course


<center><a href="https://www.oreilly.com/library/view/natural-language-processing/9781098103231/"><img src="images/book_cover.png" width=500></a></center>