In [None]:
import transformers

# Restrict transformers' logging to errors so the cells below stay free of
# warning/info chatter.
transformers.logging.set_verbosity_error()

<h1 style="text-align:center;">A Whirlwind Tour of the 🤗 Hugging Face Ecosystem</h1>

<br><br><br><br>

<h3 style="text-align:center;"><b>Christopher Akiki</b></h3>

<center><img src="images/chapter01_hf-ecosystem.png" width=800></center>

<center><img src="images/chapter02_hf-libraries.png" width=1800></center>

<h1 style="text-align:center;">🤗 Pipelines</h1>


<center><img src="images/gewandhaus_review.png" width=900></center>

In [None]:
# Sample review text that is passed to the classifier further down.
# The trailing backslash on the first line joins it with the next line without
# inserting a newline; the other line breaks and the leading spaces are part of
# the string.
text = """One of the best orchestra in the world. I came to Leipzig\
            mainly to have one experience with Gewanhaus Leipzig Orchestra. 
            Under the baton of Maestro Andris Nelsons, Bruckner symphony #8 was so affection. 
            The acustic and layout of the concert hall is nice."""

In [None]:
from transformers import pipeline

# Build a text-classification pipeline. No model is named here, so the choice
# of checkpoint is left to the library.
classifier = pipeline(task="text-classification")

# Sentiment Analysis

In [None]:
# Classify the review text.
outputs = classifier(text)
# Display the first element of the result.
outputs[0]

# Named-Entity Recognition

In [None]:
import pandas as pd

# NER pipeline; aggregation_strategy="simple" is forwarded to the pipeline
# (presumably to merge word pieces into whole entities -- see transformers docs).
ner_tagger = pipeline("ner", aggregation_strategy="simple")

# The review is written as adjacent string literals, which Python concatenates
# into one single-line string.
outputs = ner_tagger(
    "One of the best orchestra in the world. "
    "I came to Leipzig mainly to have one experience with Gewanhaus Leipzig Orchestra. "
    "Under the baton of Maestro Andris Nelsons, Bruckner symphony #8 was so affection. "
    "The acustic and layout of the concert hall is nice."
)

# Render the pipeline output as a table.
pd.DataFrame(outputs)

# Question Answering

# Summarization

# Translation

# Text Generation

<h1 style="text-align:center;">🤗 Tokenizers</h1>

<center><img src="images/tokenization_pipeline.svg" width=1200></center>

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
# Download the Gutenberg corpus (source of the Moby Dick text used below).
nltk.download('gutenberg')
# Download the 'punkt' resource, presumably the data sent_tokenize relies on.
# NOTE(review): newer NLTK releases appear to look for 'punkt_tab' instead --
# confirm against the installed NLTK version.
nltk.download('punkt')

In [None]:
# Show the file ids available in the Gutenberg corpus.
print(nltk.corpus.gutenberg.fileids())

In [None]:
# Raw text of Moby Dick from the Gutenberg corpus.
moby_dick_raw = nltk.corpus.gutenberg.raw('melville-moby_dick.txt')
# Split the raw text into sentences; these are later used as the training
# samples for the custom tokenizer.
moby_dick_sentences = sent_tokenize(moby_dick_raw, language='english')

In [None]:
# Number of sentences obtained from the text.
len(moby_dick_sentences)

In [None]:
from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer

In [None]:
# Special tokens for the tokenizer, listed once and then unpacked into
# individually named variables for use in later cells.
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
unk_token, pad_token, cls_token, sep_token, mask_token = special_tokens

# Vocabulary size passed to the trainer.
vocab_size = 20_000

In [None]:
# Start from an untrained WordPiece model configured with our unknown token,
# and wrap it in a Tokenizer.
wordpiece_model = WordPiece(unk_token=unk_token)
custom_tokenizer = Tokenizer(wordpiece_model)

In [None]:
# Normalization steps, applied in the order listed:
# NFKD, then Lowercase, then StripAccents.
custom_normalizer = normalizers.Sequence([
    normalizers.NFKD(),
    normalizers.Lowercase(),
    normalizers.StripAccents(),
])

In [None]:
# Pre-tokenization steps, applied in the order listed:
# WhitespaceSplit, then Punctuation.
custom_pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.WhitespaceSplit(),
    pre_tokenizers.Punctuation(),
])

In [None]:
# Trainer for the WordPiece model, configured with the vocabulary size and
# special tokens defined above; the progress bar is disabled.
custom_trainer = WordPieceTrainer(
    vocab_size=vocab_size,
    special_tokens=special_tokens,
    show_progress=False,
)

In [None]:
# Attach the pre-tokenizer and the normalizer to the tokenizer.
# The two assignments are independent of each other.
custom_tokenizer.pre_tokenizer = custom_pre_tokenizer
custom_tokenizer.normalizer = custom_normalizer

In [None]:
# Train the tokenizer on the Moby Dick sentences using the WordPiece trainer.
custom_tokenizer.train_from_iterator(
    moby_dick_sentences,
    trainer=custom_trainer,
)

In [None]:
# Vocabulary size of the trained tokenizer.
custom_tokenizer.get_vocab_size()

In [None]:
# Encode a sample sentence. At this point in the notebook no post-processor
# has been attached to the tokenizer yet.
encoding = custom_tokenizer.encode("Let us test this tokenizer")
print(encoding.tokens)

In [None]:
# Look up the ids that the trained tokenizer assigned to [CLS] and [SEP];
# the template processor needs (token, id) pairs for its special tokens.
cls_token_id = custom_tokenizer.token_to_id(cls_token)
sep_token_id = custom_tokenizer.token_to_id(sep_token)

# Template for the final sequence layout:
#   single sentence: [CLS] A [SEP]          (all with type id 0)
#   sentence pair:   [CLS] A [SEP] B [SEP]  (B and its [SEP] with type id 1)
custom_post_processor = processors.TemplateProcessing(
    single=f"{cls_token}:0 $A:0 {sep_token}:0",
    pair=f"{cls_token}:0 $A:0 {sep_token}:0 $B:1 {sep_token}:1",
    special_tokens=[(cls_token, cls_token_id), (sep_token, sep_token_id)],
)

custom_tokenizer.post_processor = custom_post_processor

In [None]:
# Encode the same sample sentence again and print the resulting tokens.
encoding = custom_tokenizer.encode("Let us test this tokenizer")
print(encoding.tokens)

In [None]:
# Encode a pair of sentences in a single call.
encoding = custom_tokenizer.encode("This is the first sentence", "This is sentence number 2")
# Print the tokens, their ids, and the type ids of the encoded pair.
print(encoding.tokens)
print(encoding.ids)
print(encoding.type_ids)

# Using our custom tokenizer with 🤗 Transformers

In [None]:
from transformers import PreTrainedTokenizerFast

# NOTE(review): this cell only displays the class itself. Presumably the intent
# is to wrap `custom_tokenizer` in a PreTrainedTokenizerFast -- TODO confirm and
# complete.
PreTrainedTokenizerFast


<h1 style="text-align:center;">🤗 Datasets</h1>

<table><thead><tr><th align="center">Data format</th> <th align="center">Loading script</th> <th align="center">Example</th></tr></thead> <tbody><tr><td align="center">CSV &amp; TSV</td> <td align="center"><code>csv</code></td> <td align="center"><code>load_dataset("csv", data_files="my_file.csv")</code></td></tr> <tr><td align="center">Text files</td> <td align="center"><code>text</code></td> <td align="center"><code>load_dataset("text", data_files="my_file.txt")</code></td></tr> <tr><td align="center">JSON &amp; JSON Lines</td> <td align="center"><code>json</code></td> <td align="center"><code>load_dataset("json", data_files="my_file.jsonl")</code></td></tr> <tr><td align="center">Pickled DataFrames</td> <td align="center"><code>pandas</code></td> <td align="center"><code>load_dataset("pandas", data_files="my_dataframe.pkl")</code></td></tr></tbody></table>

<h1 style="text-align:center;">🤗 Transformers</h1>

<h1 style="text-align:center;">Case-study: 📜 Scientific Paper Retrieval</h1>

<h1 style="text-align:center;">(Re)sources</h1>

- https://github.com/nlp-with-transformers/notebooks

- https://github.com/huggingface/course


<center><a href="https://www.oreilly.com/library/view/natural-language-processing/9781098103231/"><img src="images/book_cover.png" width=500></a></center>