# Tokenizer Fertility Comparison

Resources:
* [Tokenizer Summary](https://huggingface.co/docs/transformers/tokenizer_summary)
* [BBC Articles Dataset with Extra Features](https://www.kaggle.com/datasets/jacopoferretti/bbc-articles-dataset)
* [AutoTokenizer](https://huggingface.co/docs/transformers/v4.14.1/en/model_doc/auto#transformers.AutoTokenizer)

In [33]:
# !pip install transformers
# !pip install kagglehub
# ! pip install sentencepiece
from transformers import AutoTokenizer, BertTokenizer, XLNetTokenizer, DistilBertModel, AlbertTokenizer, AlbertModel, T5Tokenizer, T5Model
import pandas as pd
import numpy as np

In [34]:
def fertility(text, 
              tokenizer_class='BertTokenizer', 
              model='google-bert/bert-base-uncased'):

    tokenizer_class_ = getattr(__import__("transformers"), tokenizer_class, None)
    tokenizer = tokenizer_class_.from_pretrained(model)

    tokenized = tokenizer.tokenize(text) # Note: Transformers typically doesn't remove stopwords 
    num_words = len(text.split())

    fertility = len(tokenized) / num_words
    return fertility

In [35]:
df = pd.read_csv('bbc_news_text_complexity_summarization.csv')  # replace with your df
texts = df['text'][:100]

In [36]:
# Tokenizer Class: BertTokenizer
# Model: google-bert/bert-base-uncased

bert_fertilities = texts.apply(lambda text: fertility(text))

In [37]:
# Tokenizer Class: XLNetTokenizer
# Model: xlnet/xlnet-base-cased

tokenizer_class='XLNetTokenizer'
model='xlnet/xlnet-base-cased'
XLNet_fertilities = texts.apply(lambda text: fertility(text, tokenizer_class, model))

In [38]:
# Tokenizer Class: DistilBertTokenizer
# Model: distilbert-base-uncased

tokenizer_class='DistilBertTokenizer'
model='distilbert-base-uncased'
DistilBert_fertilities = texts.apply(lambda text: fertility(text, tokenizer_class, model))

In [39]:
# Tokenizer Class: AlbertTokenizer
# Model: 'albert-base-v2'

tokenizer_class='AlbertTokenizer'
model='albert-base-v2'
Albert_fertilities = texts.apply(lambda text: fertility(text, tokenizer_class, model))

In [40]:
# Tokenizer Class: T5Tokenizer
# Model: 't5-small'

tokenizer_class='AlbertTokenizer'
model='albert-base-v2'
T5_fertilities = texts.apply(lambda text: fertility(text, tokenizer_class, model))

In [41]:
tokenizer_fertilities = pd.DataFrame({'BERT': pd.Series(bert_fertilities),
                                      'XLNet': pd.Series(XLNet_fertilities),
                                      'DistilBert': pd.Series(DistilBert_fertilities),
                                      'Albert': pd.Series(Albert_fertilities),
                                      'T5': pd.Series(T5_fertilities)})
tokenizer_fertilities

Unnamed: 0,BERT,XLNet,DistilBert,Albert,T5
0,1.353919,1.339667,1.353919,1.363420,1.363420
1,1.231771,1.221354,1.231771,1.250000,1.250000
2,1.382576,1.329545,1.382576,1.363636,1.363636
3,1.364532,1.401478,1.364532,1.413793,1.413793
4,1.494340,1.566038,1.494340,1.520755,1.520755
...,...,...,...,...,...
95,1.228571,1.257143,1.228571,1.250000,1.250000
96,1.306306,1.326577,1.306306,1.333333,1.333333
97,1.368664,1.327189,1.368664,1.377880,1.377880
98,1.255814,1.258140,1.255814,1.276744,1.276744


In [42]:
tokenizer_fertilities.apply(lambda row: np.mean(row), axis=0)

BERT          1.297427
XLNet         1.305913
DistilBert    1.297427
Albert        1.316069
T5            1.316069
dtype: float64