In [None]:
import transformers
import pandas as pd
import tensorflow as tf
from huggingface_hub import notebook_login
import os
transformers.logging.set_verbosity_error()

In [None]:
notebook_login()

<h1 style="text-align:center;">A Whirlwind Tour of the 🤗 Hugging Face Ecosystem</h1>

<br><br><br><br>

<h4 style="text-align:center;"><b>Christopher Akiki</b></h4>

<br><br>
<p style="text-align:center;font-style: italic">Figures in these slides are reproduced under the Apache License from <a href="https://www.oreilly.com/library/view/natural-language-processing/9781098103231/">Natural Language Processing with Transformers</a>, published by O'Reilly Media, Inc.</p>

<center><a href="https://huggingface.co"><img src="images/chapter01_hf-ecosystem.png" width=800></a></center>

<center><img src="images/chapter02_hf-libraries.png" width=1800></center>

<h1 style="text-align:center;">🤗 Pipelines</h1>

<br><br>

In [None]:
from transformers import pipeline
from transformers.pipelines import get_supported_tasks

In [None]:
print(get_supported_tasks())


<center><img src="images/gewandhaus_review.png" width=900></center>

In [None]:
# Review text used as the shared input for the pipeline demos that follow.
# Only the first line break is escaped with a backslash; the remaining newlines and
# all leading indentation are part of the string that the models receive.
# NOTE(review): the spelling presumably mirrors the original review shown in the image
# above -- confirm before "correcting" it, since that would change the model input.
text = """One of the best orchestra in the world. I came to Leipzig\
            mainly to have one experience with Gewanhaus Leipzig Orchestra. 
            Under the baton of Maestro Andris Nelsons, Bruckner symphony #8 was so affection. 
            The acustic and layout of the concert hall is nice."""

# Sentiment Analysis

In [None]:
# Sentiment classifier: DistilBERT fine-tuned on SST-2; device=-1 keeps inference on the CPU.
p = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=-1,
)

In [None]:
outputs = p(text)
outputs[0]

# Named-Entity Recognition

In [None]:
# Named-entity tagger; the "simple" aggregation strategy groups tokens into entity spans.
p = pipeline(
    "ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    aggregation_strategy="simple",
    device=-1,
)

In [None]:
outputs = p(text)
pd.DataFrame(outputs)

# Question Answering

In [None]:
# Extractive question-answering model, run on the CPU (device=-1).
p = pipeline(
    "question-answering",
    model="distilbert-base-cased-distilled-squad",
    device=-1,
)

In [None]:
# Questions asked against the review text; each one is answered from the same context.
questions = [
    'What city did I visit?',
    'Why did I visit Leipzig?',
    'What music did the orchestra play?',
    'Who lead the orchestra?',
]

In [None]:
# Run every question against the same review text; one result dict per question.
outputs = p(question=questions, context=text)
answers = [o['answer'] for o in outputs]
# max_colwidth=None disables column truncation. The legacy -1 sentinel has been
# deprecated since pandas 1.0 and is no longer accepted by recent pandas releases.
with pd.option_context('display.max_colwidth', None):
    display(pd.DataFrame(zip(questions, answers), columns=['Question', 'Answer']))

# Translation

In [None]:
# English-to-German translation with the Helsinki-NLP OPUS-MT model, on the CPU.
p = pipeline(
    "translation_en_to_de",
    model="Helsinki-NLP/opus-mt-en-de",
    device=-1,
)

In [None]:
outputs = p(text, clean_up_tokenization_spaces=True)
print(outputs[0]['translation_text'])

<h1 style="text-align:center;">🤗 Tokenizers</h1>

<center><img src="images/tokenization_pipeline.svg" width=1200></center>

In [None]:
import nltk
nltk.download('gutenberg')

In [None]:
print(nltk.corpus.gutenberg.fileids())

In [None]:
moby_dick_raw = nltk.corpus.gutenberg.raw('melville-moby_dick.txt')

In [None]:
size = len(moby_dick_raw.encode())
print(f"{size/1024**2:.2f} MiB")

In [None]:
from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer

In [None]:
# Special tokens used by the custom tokenizer, listed once and then unpacked by role.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
unk_token, pad_token, cls_token, sep_token, mask_token = special_tokens
# Vocabulary size passed to the WordPiece trainer.
vocab_size = 6000

# WordPiece Tokenizer

In [None]:
custom_tokenizer = Tokenizer(WordPiece(unk_token=unk_token))

# Sequence of Normalizers

In [None]:
# Normalization steps, applied in order: NFKD decomposition, lowercasing, accent stripping.
normalization_steps = [
    normalizers.NFKD(),
    normalizers.Lowercase(),
    normalizers.StripAccents(),
]
custom_normalizer = normalizers.Sequence(normalization_steps)

# Sequence of Pre-tokenizers

In [None]:
# Pre-tokenization steps, applied in order: split on whitespace, then isolate punctuation.
pre_tokenization_steps = [
    pre_tokenizers.WhitespaceSplit(),
    pre_tokenizers.Punctuation(),
]
custom_pre_tokenizer = pre_tokenizers.Sequence(pre_tokenization_steps)

# WordPiece Trainer

In [None]:
# Trainer configured with the target vocabulary size and the special tokens;
# the progress bar is disabled to keep the cell output clean.
custom_trainer = WordPieceTrainer(
    vocab_size=vocab_size,
    special_tokens=special_tokens,
    show_progress=False,
)

In [None]:
custom_tokenizer.normalizer = custom_normalizer
custom_tokenizer.pre_tokenizer = custom_pre_tokenizer

In [None]:
%%time
custom_tokenizer.train_from_iterator([moby_dick_raw], trainer=custom_trainer)

In [None]:
custom_tokenizer.get_vocab_size()

In [None]:
encoding = custom_tokenizer.encode("Let us test this tokenizer")
print(encoding.tokens)

In [None]:
# Look up the ids the trained vocabulary assigned to [CLS] and [SEP].
cls_token_id, sep_token_id = (
    custom_tokenizer.token_to_id(token) for token in (cls_token, sep_token)
)

# Template post-processor: wraps a single sequence as [CLS] A [SEP] and a pair as
# [CLS] A [SEP] B [SEP]; the ":0"/":1" suffixes set the type id of each segment.
template_kwargs = {
    "single": f"{cls_token}:0 $A:0 {sep_token}:0",
    "pair": f"{cls_token}:0 $A:0 {sep_token}:0 $B:1 {sep_token}:1",
    "special_tokens": [(cls_token, cls_token_id), (sep_token, sep_token_id)],
}
custom_post_processor = processors.TemplateProcessing(**template_kwargs)

custom_tokenizer.post_processor = custom_post_processor

In [None]:
encoding = custom_tokenizer.encode("Let us test this tokenizer")
print(encoding.tokens)

In [None]:
encoding = custom_tokenizer.encode("This is the first sentence", "This is sentence number 2")
print(encoding.tokens)
print(encoding.ids)
print(encoding.type_ids)

# Using our custom tokenizer with a model

In [None]:
from transformers import PreTrainedTokenizerFast

# Wrap the trained `tokenizers` object in the transformers fast-tokenizer class,
# registering each special token under its role.
special_token_roles = {
    "unk_token": unk_token,
    "pad_token": pad_token,
    "cls_token": cls_token,
    "sep_token": sep_token,
    "mask_token": mask_token,
}
model_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=custom_tokenizer,
    **special_token_roles,
)

In [None]:
text_batch = ["To be or not to be.", "It was the best of times.", "Call me Ishmael."]

In [None]:
model_tokenizer(text_batch, padding=True, return_tensors="tf")

<h1 style="text-align:center;">🤗 Datasets</h1>

# Apache Arrow Backend ➡️ Low RAM Use

<br>

```python
import os; import psutil; import timeit
from datasets import load_dataset

mem_before = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)
wiki = load_dataset("wikipedia", "20220301.en", split="train")
mem_after = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)

print(f"RAM memory used: {(mem_after - mem_before)} MB")

*****RAM memory used: 50 MB*****
```


# Apache Arrow Backend ➡️ Fast Iteration
<br>

```python
s = """batch_size = 1000
for i in range(0, len(wiki), batch_size):
    batch = wiki[i:i + batch_size]
"""
time = timeit.timeit(stmt=s, number=1, globals=globals())
print(f"Time to iterate over the {wiki.dataset_size >> 30}GB dataset: {time:.1f} sec, "
      f"ie. {float(wiki.dataset_size >> 27)/time:.1f} Gb/s")

*****Time to iterate over the 18 GB dataset: 70.5 sec, ie. 2.1 Gb/s*****
```

In [None]:
from datasets import list_datasets, load_dataset

In [None]:
all_datasets = list_datasets()

In [None]:
len(all_datasets)

In [None]:
[d for d in all_datasets if "emotion" in d]

In [None]:
emotions = load_dataset("emotion")
emotions

In [None]:
print(emotions['train'].info.description)
print(125*"*")
print(emotions['train'].citation)

In [None]:
train_ds = emotions["train"]
train_ds

In [None]:
train_ds.features['label']

In [None]:
train_ds.features['label'].int2str(5)

In [None]:
len(train_ds)

In [None]:
train_ds[11]

In [None]:
train_ds[:10]

In [None]:
train_ds[:10]['text']

In [None]:
def compute_tweet_length(row):
    """Count the whitespace-separated words in a single row's ``text``.

    Args:
        row: Mapping with a ``'text'`` string entry.

    Returns:
        A dict with one key, ``"tweet_length"``, holding the word count.
    """
    word_count = len(row['text'].split())
    return {"tweet_length": word_count}

In [None]:
train_ds = train_ds.map(compute_tweet_length, load_from_cache_file=False)

In [None]:
train_ds.push_to_hub('emotion-with-length')

In [None]:
train_ds.sort("tweet_length")[:10]

In [None]:
def batched_compute_tweet_length(batch_of_rows):
    """Count the whitespace-separated words of every text in a batch.

    Args:
        batch_of_rows: Mapping whose ``'text'`` entry is an iterable of strings.

    Returns:
        A dict with one key, ``"tweet_length"``, holding a list of word counts
        in the same order as the input texts.
    """
    word_counts = []
    for tweet in batch_of_rows['text']:
        word_counts.append(len(tweet.split()))
    return {"tweet_length": word_counts}

In [None]:
# train_ds.map(batched_compute_tweet_length, batched=True, batch_size=2000, load_from_cache_file=False)

In [None]:
%time train_ds.map(compute_tweet_length, load_from_cache_file=False)

In [None]:
%time train_ds.map(batched_compute_tweet_length, batched=True, batch_size=2000, load_from_cache_file=False)

In [None]:
train_ds.column_names

In [None]:
train_ds = train_ds.remove_columns('tweet_length')
train_ds

# Loading your own files

<table><thead><tr><th align="center">Data format</th> <th align="center">Loading script</th> <th align="center">Example</th></tr></thead> <tbody><tr><td align="center">CSV &amp; TSV</td> <td align="center"><code>csv</code></td> <td align="center"><code>load_dataset("csv", data_files="my_file.csv")</code></td></tr> <tr><td align="center">Text files</td> <td align="center"><code>text</code></td> <td align="center"><code>load_dataset("text", data_files="my_file.txt")</code></td></tr> <tr><td align="center">JSON &amp; JSON Lines</td> <td align="center"><code>json</code></td> <td align="center"><code>load_dataset("json", data_files="my_file.jsonl")</code></td></tr> <tr><td align="center">Pickled DataFrames</td> <td align="center"><code>pandas</code></td> <td align="center"><code>load_dataset("pandas", data_files="my_dataframe.pkl")</code></td></tr></tbody></table>

In [None]:
import pandas as pd

In [None]:
emotions.set_format(type="pandas")
emotions_df = emotions['train'][:]

In [None]:
# Translate integer label ids into their names using the label feature's int2str.
emotions_df['label_name'] = emotions_df['label'].apply(
    lambda label_id: train_ds.features['label'].int2str(label_id)
)

In [None]:
emotions_df['text'].str.split().apply(len).describe()

In [None]:
emotions.reset_format()

In [None]:
train_ds

<h1 style="text-align:center;">🤗 Transformers</h1>

<center><img src="images/chapter03_transformers-compact.png" width=500></center>

In [None]:
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, DataCollatorWithPadding

model_checkpoint = "distilbert-base-uncased"

<br><center><img src="images/chapter04_bert-body-head.png" width=600></center>

# Transfer Learning via Feature Extraction (Homework)

<center><img src="images/chapter02_encoder-feature-based.png" width=1000></center>

# Transfer Learning via Finetuning

<center><img src="images/chapter02_encoder-fine-tuning.png" width=1000></center>

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)

In [None]:
tokenizer(["This is a test", "This is another test", "cat"], return_tensors="tf", padding=True, truncation=True)

In [None]:
def tokenize(batch):
    """Tokenize the ``text`` column of a batch of rows.

    Sequences are truncated to the tokenizer's maximum length but deliberately
    left unpadded. The ``DataCollatorWithPadding`` used when building the
    ``tf.data`` datasets pads each batch to its own longest sequence. Padding
    here, with ``map(..., batch_size=None)``, would pad every example to the
    longest text of the entire split and defeat that dynamic padding.

    Args:
        batch: Mapping whose ``'text'`` entry is a list of strings.

    Returns:
        The tokenizer output for the batch (model inputs such as ``input_ids``
        and ``attention_mask``), one variable-length entry per text.
    """
    return tokenizer(batch['text'], truncation=True)

In [None]:
# Tokenize each split in a single pass (batch_size=None hands the whole split to
# `tokenize` as one batch). Splits are processed in train / validation / test order.
tokenized_train_ds, tokenized_val_ds, tokenizer_test_ds = (
    split.map(tokenize, batched=True, batch_size=None)
    for split in (train_ds, emotions['validation'], emotions['test'])
)

In [None]:
tokenizer.model_input_names

In [None]:
BATCH_SIZE = 64
# Collator that pads the examples of a batch to a common length and returns TF tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")


def _to_tf_dataset(tokenized_ds, shuffle):
    """Convert a tokenized split into a batched ``tf.data.Dataset``.

    All three splits share the same input columns, label column, batch size and
    collator; only the shuffling differs.

    Args:
        tokenized_ds: Tokenized dataset split to convert.
        shuffle: Whether to shuffle the examples (True for training only).

    Returns:
        The dataset produced by ``to_tf_dataset`` with ``'label'`` as the label column.
    """
    return tokenized_ds.to_tf_dataset(
        columns=tokenizer.model_input_names,
        label_cols=['label'],
        shuffle=shuffle,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
    )


tf_train_dataset = _to_tf_dataset(tokenized_train_ds, shuffle=True)
tf_val_dataset = _to_tf_dataset(tokenized_val_ds, shuffle=False)
tf_test_dataset = _to_tf_dataset(tokenizer_test_ds, shuffle=False)

In [None]:
for i in tf_train_dataset.take(1):
    print(i)

In [None]:
# Load the checkpoint with a sequence-classification head sized to the number of label classes.
num_labels = train_ds.features['label'].num_classes
model = TFDistilBertForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

In [None]:
%load_ext tensorboard
%tensorboard --logdir /tf/model/logs --host 0.0.0.0

In [None]:
from transformers.keras_callbacks import PushToHubCallback
from tensorflow.keras.callbacks import TensorBoard

# Keras callbacks used during training: TensorBoard logging and pushing the model
# (together with the tokenizer) to the Hugging Face Hub.
hub_model_id = f"{model_checkpoint}-finetuned-tweet-sentiment"
callbacks = [
    TensorBoard(log_dir="./model/logs"),
    PushToHubCallback(output_dir="./model", tokenizer=tokenizer, hub_model_id=hub_model_id),
]
tensorboard_callback, push_to_hub_callback = callbacks

In [None]:
# Sparse loss/metric because labels are integer class ids; from_logits=True because
# the loss is applied to the model's unnormalised outputs.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.metrics.SparseCategoricalAccuracy()
model.compile(optimizer=optimizer, loss=loss, metrics=accuracy_metric)

In [None]:
history = model.fit(tf_train_dataset, validation_data=tf_val_dataset, epochs=3, callbacks=callbacks)

In [None]:
 _, accuracy = model.evaluate(tf_test_dataset)
print(f"Test accuracy: {round(accuracy * 100, 2)}%")

In [2]:
from transformers import pipeline

In [3]:
p = pipeline("text-classification", model='cakiki/distilbert-base-uncased-finetuned-tweet-sentiment', device=-1)

KeyboardInterrupt: 

In [None]:
import gradio as gr

# Build a demo UI for the fine-tuned model hosted on the Hub and launch it.
# NOTE: share=True requests a temporary public URL from Gradio, so the demo is
# reachable from outside this machine. The trailing ';' suppresses the cell output.
gr.Interface.load("huggingface/cakiki/distilbert-base-uncased-finetuned-tweet-sentiment").launch(share=True);

<h1 style="text-align:center;">(Re)sources</h1>

- https://github.com/nlp-with-transformers/notebooks

- https://huggingface.co/docs

- https://github.com/huggingface/course / https://github.com/huggingface/notebooks

- https://github.com/NielsRogge/Transformers-Tutorials

<center><a href="https://www.oreilly.com/library/view/natural-language-processing/9781098103231/"><img src="images/book_cover.png" width=400></a></center>