## Different Tokenizers

In [None]:
import re
import nltk
import spacy

# Download necessary NLTK data
# (the recorded cell output shows the 'punkt' package being fetched and
# unzipped; presumably it is what nltk.word_tokenize relies on further down)
nltk.download('punkt')

# Load SpaCy model
# 'en_core_web_sm' is looked up by name, so it has to be installed in the
# environment before this cell runs.
nlp = spacy.load('en_core_web_sm')

# Sample text
# Shared input for every tokenizer compared in this cell; it mixes commas,
# sentence-final punctuation and a contraction ("Let's").
text = "Hello, world! This is a test sentence. Let's see how different tokenizers handle it."

# Whitespace Tokenizer
def whitespace_tokenizer(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list.

    Punctuation is not separated, so it stays attached to the neighbouring
    word (for example ``'Hello,'``). An empty string yields an empty list.
    """
    pieces = text.split()
    return pieces

# Punctuation-based Tokenizer
def punctuation_tokenizer(text):
    """Return word runs and individual punctuation marks found in ``text``.

    A token is either a maximal run of word characters (``\\w+``) or a single
    character that is neither a word character nor whitespace. Whitespace is
    discarded, so ``"Let's"`` becomes ``['Let', "'", 's']``.
    """
    word_or_symbol = re.compile(r'\w+|[^\w\s]', re.UNICODE)
    return word_or_symbol.findall(text)

# NLTK Tokenizer
def nltk_tokenizer(text):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return its result."""
    word_tokens = nltk.word_tokenize(text)
    return word_tokens

# SpaCy Tokenizer
def spacy_tokenizer(text):
    """Run the module-level spaCy pipeline ``nlp`` on ``text``.

    Returns the ``.text`` of every token in the resulting document, in order.
    """
    return [piece.text for piece in nlp(text)]

# Comparing Tokenizers
# Display label -> tokenizer function; insertion order is the print order.
tokenizers = {
    "Whitespace Tokenizer": whitespace_tokenizer,
    "Punctuation Tokenizer": punctuation_tokenizer,
    "NLTK Tokenizer": nltk_tokenizer,
    "SpaCy Tokenizer": spacy_tokenizer
}

# Apply each tokenizer to the sample text
# (all tokenizers run before anything is printed)
tokenized_texts = {}
for label, tokenize in tokenizers.items():
    tokenized_texts[label] = tokenize(text)

# Display the results
for name, tokens in tokenized_texts.items():
    print(f"{name}:\n{tokens}\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Whitespace Tokenizer:
['Hello,', 'world!', 'This', 'is', 'a', 'test', 'sentence.', "Let's", 'see', 'how', 'different', 'tokenizers', 'handle', 'it.']

Punctuation Tokenizer:
['Hello', ',', 'world', '!', 'This', 'is', 'a', 'test', 'sentence', '.', 'Let', "'", 's', 'see', 'how', 'different', 'tokenizers', 'handle', 'it', '.']

NLTK Tokenizer:
['Hello', ',', 'world', '!', 'This', 'is', 'a', 'test', 'sentence', '.', 'Let', "'s", 'see', 'how', 'different', 'tokenizers', 'handle', 'it', '.']

SpaCy Tokenizer:
['Hello', ',', 'world', '!', 'This', 'is', 'a', 'test', 'sentence', '.', 'Let', "'s", 'see', 'how', 'different', 'tokenizers', 'handle', 'it', '.']



## Word and Subword Tokenization

BPE Tokenizer: This tokenizer uses Byte Pair Encoding, which starts from individual characters (or bytes) and iteratively merges the most frequent pair of adjacent symbols into a new vocabulary entry. It's effective for subword tokenization and handles out-of-vocabulary words by breaking them into smaller, known subwords.

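WordPiece Tokenizer: This tokenizer builds its vocabulary much like BPE, but instead of always merging the most frequent pair it chooses the merge that most increases the likelihood of the training data. It is used by models such as BERT, and marks pieces that continue a word with a `##` prefix (e.g. `'H', '##e', '##l'`).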

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece
from tokenizers.trainers import BpeTrainer, WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

# Sample text
# NOTE: `texts` is both the training corpus and the text encoded below, so
# every symbol encountered at encode time was seen during training.
texts = [
    "Hello, world! This is a test sentence.",
    "Let's see how different tokenizers handle out-of-vocabulary words like spacy, tensorflow, and huggingface."
]

# Initialize BPE Tokenizer
# An explicit unknown token is configured on the model AND registered with the
# trainer as a special token so it is guaranteed to be in the vocabulary.
# Without it, BPE silently drops symbols it has never seen.
bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
bpe_tokenizer.pre_tokenizer = Whitespace()
bpe_trainer = BpeTrainer(vocab_size=1000, min_frequency=2, special_tokens=["[UNK]"])
bpe_tokenizer.train_from_iterator(texts, trainer=bpe_trainer)

# Initialize WordPiece Tokenizer
# WordPiece needs its unknown token to exist in the vocabulary; otherwise
# encoding text containing an unseen symbol fails instead of emitting [UNK].
wordpiece_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
wordpiece_tokenizer.pre_tokenizer = Whitespace()
wordpiece_trainer = WordPieceTrainer(vocab_size=1000, min_frequency=2, special_tokens=["[UNK]"])
wordpiece_tokenizer.train_from_iterator(texts, trainer=wordpiece_trainer)

# Function to display tokenized results
def display_tokenization_results(tokenizer, texts):
    """Print each text followed by the tokens ``tokenizer`` produces for it.

    Args:
        tokenizer: Object whose ``encode(text)`` result exposes ``.tokens``.
        texts: Iterable of strings; they are numbered from 1 in the output.
    """
    for number, sample in enumerate(texts, start=1):
        encoding = tokenizer.encode(sample)
        print(f"Text {number}: {sample}")
        print(f"Tokens: {encoding.tokens}\n")

# Display BPE and WordPiece tokenization results, in that order
for heading, trained_tokenizer in (
    ("BPE Tokenization Results:", bpe_tokenizer),
    ("WordPiece Tokenization Results:", wordpiece_tokenizer),
):
    print(heading)
    display_tokenization_results(trained_tokenizer, texts)


BPE Tokenization Results:
Text 1: Hello, world! This is a test sentence.
Tokens: ['H', 'e', 'l', 'lo', ',', 'wor', 'l', 'd', '!', 'T', 'h', 'is', 'is', 'a', 't', 'e', 's', 't', 's', 'en', 'ten', 'c', 'e', '.']

Text 2: Let's see how different tokenizers handle out-of-vocabulary words like spacy, tensorflow, and huggingface.
Tokens: ['L', 'e', 't', "'", 's', 's', 'e', 'e', 'h', 'o', 'w', 'd', 'i', 'f', 'f', 'er', 'en', 't', 't', 'o', 'k', 'en', 'i', 'z', 'er', 's', 'h', 'and', 'l', 'e', 'o', 'u', 't', '-', 'o', 'f', '-', 'v', 'o', 'c', 'a', 'b', 'u', 'l', 'a', 'r', 'y', 'wor', 'd', 's', 'l', 'i', 'k', 'e', 's', 'p', 'ac', 'y', ',', 'ten', 's', 'or', 'f', 'lo', 'w', ',', 'and', 'h', 'u', 'g', 'g', 'i', 'n', 'g', 'f', 'ac', 'e', '.']

WordPiece Tokenization Results:
Text 1: Hello, world! This is a test sentence.
Tokens: ['H', '##e', '##l', '##l', '##o', ',', 'wor', '##l', '##d', '!', 'T', '##h', '##i', '##s', 'i', '##s', 'a', 't', '##e', '##s', '##t', 's', '##ent', '##en', '##c', '##e', '

## Multilingual Tokenizer

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

# Sample texts in different languages
# (English, Spanish, French, Chinese, Japanese, Korean)
texts = [
    "Hello, world! This is a test sentence.",
    "Hola, mundo! Esta es una frase de prueba.",
    "Bonjour le monde! Ceci est une phrase de test.",
    "你好，世界！这是一个测试句子。",
    "こんにちは、世界！これはテスト文です。",
    "안녕하세요, 세계! 이것은 테스트 문장입니다."
]

# Initialize WordPiece Tokenizer
# The unknown token is set explicitly and registered with the trainer below.
# With six short sentences the vocabulary is tiny, and WordPiece can only map
# an unseen character to [UNK] if that token is actually in the vocabulary.
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Train the tokenizer on multilingual text
trainer = WordPieceTrainer(vocab_size=1000, min_frequency=2, special_tokens=["[UNK]"])
tokenizer.train_from_iterator(texts, trainer=trainer)

# Function to display tokenized results
# NOTE(review): redefines the function of the same name from the
# "Word and Subword Tokenization" cell with the same behaviour.
def display_tokenization_results(tokenizer, texts):
    """Print every text with its 1-based position and its token list.

    ``tokenizer.encode(text).tokens`` supplies the tokens for each text.
    """
    position = 0
    for text in texts:
        position += 1
        tokens = tokenizer.encode(text).tokens
        print(f"Text {position}: {text}")
        print(f"Tokens: {tokens}\n")

# Display Tokenization Results
# `tokenizer` and `texts` are the multilingual WordPiece tokenizer and the
# six sample sentences defined earlier in this cell.
print("Multilingual Tokenization Results:")
display_tokenization_results(tokenizer, texts)


Multilingual Tokenization Results:
Text 1: Hello, world! This is a test sentence.
Tokens: ['H', '##e', '##l', '##l', '##o', ',', 'w', '##o', '##r', '##l', '##d', '!', 'T', '##h', '##i', '##s', 'i', '##s', 'a', 'test', 's', '##en', '##t', '##en', '##c', '##e', '.']

Text 2: Hola, mundo! Esta es una frase de prueba.
Tokens: ['H', '##o', '##l', '##a', ',', 'm', '##u', '##nd', '##o', '!', 'E', '##st', '##a', 'e', '##s', 'un', '##a', 'f', '##rase', 'de', 'p', '##r', '##u', '##e', '##b', '##a', '.']

Text 3: Bonjour le monde! Ceci est une phrase de test.
Tokens: ['B', '##o', '##n', '##j', '##o', '##u', '##r', 'l', '##e', 'm', '##o', '##nd', '##e', '!', 'C', '##e', '##c', '##i', 'e', '##st', 'un', '##e', 'p', '##h', '##rase', 'de', 'test', '.']

Text 4: 你好，世界！这是一个测试句子。
Tokens: ['你', '##好', '，', '世界', '！', '这', '##是', '##一', '##个', '##测', '##试', '##句', '##子', '。']

Text 5: こんにちは、世界！これはテスト文です。
Tokens: ['こ', '##ん', '##に', '##ち', '##は', '、', '世界', '！', 'こ', '##れ', '##は', '##テ', '##ス', '##ト', '##文

## Tokenization and NER

In [None]:
import torch
from transformers import BertTokenizer, BertForTokenClassification, pipeline
import spacy
import nltk
import re

# Load NER model and tokenizer from Hugging Face
# NOTE: `tokenizer` is rebound here; in the multilingual cell the same name
# held the custom WordPiece tokenizer.
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)
# No aggregation option is passed, and the recorded output lists one entry per
# word piece (e.g. 'El', '##on').
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# Load SpaCy model
spacy_nlp = spacy.load("en_core_web_sm")

# Sample text
# (rebinds `text` from the earlier cells)
text = "Elon Musk is the CEO of SpaceX, and he lives in Texas."

# Tokenizers
def whitespace_tokenizer(text):
    """Return the whitespace-separated pieces of ``text`` as a list."""
    parts = text.split()
    return parts

def punctuation_tokenizer(text):
    """Split ``text`` into word runs and single punctuation characters.

    Whitespace is dropped; every non-word, non-space character becomes its
    own token.
    """
    matcher = re.compile(r'\w+|[^\w\s]', re.UNICODE)
    return matcher.findall(text)

def nltk_tokenizer(text):
    """Return the tokens ``nltk.word_tokenize`` produces for ``text``."""
    word_tokens = nltk.word_tokenize(text)
    return word_tokens

def spacy_tokenizer(text):
    """Tokenize ``text`` with the module-level ``spacy_nlp`` pipeline.

    Returns the ``.text`` of each token of the processed document, in order.
    """
    return [piece.text for piece in spacy_nlp(text)]

# Function to run NER
def run_ner(text):
    """Run the module-level ``ner_pipeline`` on ``text``.

    Returns a list of ``(word, entity)`` tuples, one per pipeline result, taken
    from each result's ``'word'`` and ``'entity'`` keys.
    """
    labelled = []
    for prediction in ner_pipeline(text):
        labelled.append((prediction['word'], prediction['entity']))
    return labelled

# Function to tokenize and run NER
def tokenize_and_ner(tokenizer_func, text):
    """Tokenize ``text``, re-join the tokens with spaces, and run NER on that.

    The space-joined string is printed before it is passed to ``run_ner``,
    whose result is returned unchanged.
    """
    tokenized_text = ' '.join(tokenizer_func(text))
    print(f"Tokenized Text: {tokenized_text}")
    return run_ner(tokenized_text)

# Apply tokenizers and run NER
# Each heading is paired with the tokenizer it describes; headings after the
# first start with a newline so a blank line separates the sections.
ner_runs = (
    ("Whitespace Tokenizer NER Results:", whitespace_tokenizer),
    ("\nPunctuation Tokenizer NER Results:", punctuation_tokenizer),
    ("\nNLTK Tokenizer NER Results:", nltk_tokenizer),
    ("\nSpaCy Tokenizer NER Results:", spacy_tokenizer),
)
for heading, tokenizer_func in ner_runs:
    print(heading)
    print(tokenize_and_ner(tokenizer_func, text))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Whitespace Tokenizer NER Results:
Tokenized Text: Elon Musk is the CEO of SpaceX, and he lives in Texas.
[('El', 'I-PER'), ('##on', 'I-PER'), ('Mu', 'I-PER'), ('##sk', 'I-PER'), ('Space', 'I-ORG'), ('##X', 'I-ORG'), ('Texas', 'I-LOC')]

Punctuation Tokenizer NER Results:
Tokenized Text: Elon Musk is the CEO of SpaceX , and he lives in Texas .
[('El', 'I-PER'), ('##on', 'I-PER'), ('Mu', 'I-PER'), ('##sk', 'I-PER'), ('Space', 'I-ORG'), ('##X', 'I-ORG'), ('Texas', 'I-LOC')]

NLTK Tokenizer NER Results:
Tokenized Text: Elon Musk is the CEO of SpaceX , and he lives in Texas .
[('El', 'I-PER'), ('##on', 'I-PER'), ('Mu', 'I-PER'), ('##sk', 'I-PER'), ('Space', 'I-ORG'), ('##X', 'I-ORG'), ('Texas', 'I-LOC')]

SpaCy Tokenizer NER Results:
Tokenized Text: Elon Musk is the CEO of SpaceX , and he lives in Texas .
[('El', 'I-PER'), ('##on', 'I-PER'), ('Mu', 'I-PER'), ('##sk', 'I-PER'), ('Space', 'I-ORG'), ('##X', 'I-ORG'), ('Texas', 'I-LOC')]


## Tokenization Impact on Text Generation

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
import torch

# Initialize GPT-2 model and tokenizer from Hugging Face
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define a sample text
# (rebinds `text`; it is used both as the generation prompt and as the only
# training sentence for the custom BPE tokenizer below)
text = "The quick brown fox jumps over the lazy dog."

# Create a custom BPE tokenizer
# Rebinds `bpe_tokenizer` / `bpe_trainer` from the subword cell. It is trained
# on the single sentence above; the recorded output shows the prompt split
# almost entirely into single characters, with only 'he' merged.
bpe_tokenizer = Tokenizer(BPE())
bpe_tokenizer.pre_tokenizer = Whitespace()
bpe_trainer = BpeTrainer(vocab_size=1000, min_frequency=2)
bpe_tokenizer.train_from_iterator([text], trainer=bpe_trainer)

# Function to tokenize and generate text with GPT-2
def generate_text(tokenizer, model, prompt, max_length=50):
    """Continue ``prompt`` with ``model`` and return the decoded text.

    Args:
        tokenizer: Callable tokenizer; ``tokenizer(prompt, return_tensors="pt")``
            must return a mapping with ``'input_ids'`` (and normally
            ``'attention_mask'``). It must also provide ``eos_token_id`` and
            ``decode``.
        model: Object exposing ``generate``.
        prompt: Text to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    # Tokenize input text
    encoded = tokenizer(prompt, return_tensors="pt")
    # Generate text. The attention mask and pad token id are passed explicitly
    # so generate() does not have to guess them (it otherwise warns that
    # results may be unreliable). The end-of-sequence id is used as the pad id,
    # which is the fallback generate() announces when none is supplied.
    output = model.generate(
        encoded['input_ids'],
        attention_mask=encoded.get('attention_mask'),
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode output tokens
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Function to apply BPE tokenizer and generate text
def bpe_generate_text(prompt):
    """Generate from ``prompt`` after re-spelling it as space-joined BPE tokens.

    The prompt is encoded with the module-level ``bpe_tokenizer``, its tokens
    are joined with single spaces and printed, and that string is handed to
    ``generate_text`` with the GPT-2 tokenizer and model.
    """
    # Tokenize with BPE tokenizer
    tokenized_prompt = " ".join(bpe_tokenizer.encode(prompt).tokens)
    print(f"BPE Tokenized Prompt: {tokenized_prompt}")
    generated = generate_text(gpt2_tokenizer, gpt2_model, tokenized_prompt)
    return generated

# Generate and print results
# First run: the raw prompt goes straight to GPT-2.
print("GPT-2 Tokenizer Generated Text:")
print(generate_text(gpt2_tokenizer, gpt2_model, text))

# Second run: the prompt is first split by the custom BPE tokenizer and the
# space-joined pieces are what GPT-2 receives.
print("\nBPE Tokenizer Generated Text:")
print(bpe_generate_text(text))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


GPT-2 Tokenizer Generated Text:


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The quick brown fox jumps over the lazy dog.

"I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I

BPE Tokenizer Generated Text:
BPE Tokenized Prompt: T he q u i c k b r o w n f o x j u m p s o v e r t he l a z y d o g .
T he q u i c k b r o w n f o x j u m p s o v e r t he l a z y d o g.................


## Sentiment Analysis

In [None]:
from transformers import pipeline

# Initialize the sentiment analysis pipeline
# The checkpoint and revision are named explicitly (they are the ones the
# library reported falling back to) so the results do not change if the
# library's default model changes.
sentiment_analyzer = pipeline(
    'sentiment-analysis',
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="af0f99b",
)

# Sample texts for sentiment analysis
texts = [
    "I love using the new features in this app!",
    "The service was terrible and I am very disappointed.",
    "It's an average movie with some good moments.",
    "The product exceeded my expectations. Highly recommend!"
]

# Perform sentiment analysis
# One result dict (with 'label' and 'score') is returned per input text.
results = sentiment_analyzer(texts)

# Print the results
for text, result in zip(texts, results):
    print(f"Text: {text}")
    print(f"Sentiment: {result['label']}, Score: {result['score']:.4f}\n")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


results
[{'label': 'POSITIVE', 'score': 0.9994829893112183}, {'label': 'NEGATIVE', 'score': 0.999747097492218}, {'label': 'POSITIVE', 'score': 0.9998431205749512}, {'label': 'POSITIVE', 'score': 0.9998568296432495}]
Text: I love using the new features in this app!
Sentiment: POSITIVE, Score: 0.9995

Text: The service was terrible and I am very disappointed.
Sentiment: NEGATIVE, Score: 0.9997

Text: It's an average movie with some good moments.
Sentiment: POSITIVE, Score: 0.9998

Text: The product exceeded my expectations. Highly recommend!
Sentiment: POSITIVE, Score: 0.9999



In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
#import nltk
#nltk.download('punkt')

# Sample text
text = "The quick brown fox jumps over the lazy dog."

# Tokenize the text
tokens = word_tokenize(text.lower())

# Train a Word2Vec model
# The corpus is a single sentence, so the vectors only illustrate the API.
# A fixed seed plus a single worker thread makes the run repeatable; gensim
# documents that full determinism across interpreter launches additionally
# needs PYTHONHASHSEED to be set.
# NOTE: this rebinds `model`, which held the BERT model in the NER cell.
model = Word2Vec(
    [tokens],
    vector_size=50,
    window=5,
    min_count=1,
    sg=0,
    seed=42,
    workers=1,
)

# Retrieve word embeddings
vector_fox = model.wv['fox']
vector_dog = model.wv['dog']

print(f"Vector for 'fox': {vector_fox}")
print(f"Vector for 'dog': {vector_dog}")

# Find most similar words
similar_words = model.wv.most_similar('fox')
print(f"Words similar to 'fox': {similar_words}")


Vector for 'fox': [-0.01648536  0.01859871 -0.00039532 -0.00393455  0.00920726 -0.00819063
  0.00548623  0.01387993  0.01213085 -0.01502159  0.0187647   0.00934362
  0.00793224 -0.01248701  0.01691996 -0.00430033  0.01765038 -0.01072401
 -0.01625884  0.01364912  0.00334239 -0.00439702  0.0190272   0.01898771
 -0.01954809  0.00501046  0.01231338  0.00774491  0.00404557  0.000861
  0.00134726 -0.00764127 -0.0142805  -0.00417774  0.0078478   0.01763737
  0.0185183  -0.01195187 -0.01880534  0.01952875  0.00685957  0.01033223
  0.01256469 -0.00560853  0.01464541  0.00566054  0.00574201 -0.00476074
 -0.0062565  -0.00474028]
Vector for 'dog': [-0.01723938  0.00733148  0.01037977  0.01148388  0.01493384 -0.01233535
  0.00221123  0.01209456 -0.0056801  -0.01234705 -0.00082045 -0.0167379
 -0.01120002  0.01420908  0.00670508  0.01445134  0.01360049  0.01506148
 -0.00757831 -0.00112361  0.00469675 -0.00903806  0.01677746 -0.01971633
  0.01352928  0.00582883 -0.00986566  0.00879638 -0.00347915  0.0

In [None]:
import requests

# Download the KJV text file. The timeout stops the cell from hanging forever
# on a stalled connection, and raise_for_status() prevents an HTTP error page
# from being parsed as if it were the corpus.
response = requests.get('https://openbible.com/textfiles/kjv.txt', timeout=30)
response.raise_for_status()

# Drop the first 90 characters; the preview below shows the remaining text
# starting at "Genesis 1:1" (presumably the skipped part is a file header).
bible = response.text[90:]
bible[:200]

'Genesis 1:1\tIn the beginning God created the heaven and the earth.\nGenesis 1:2\tAnd the earth was without form, and void; and darkness [was] upon the face of the deep. And the Spirit of God moved upon '

In [None]:
# Inspect one raw line: a verse reference, a tab, then the verse text.
bible.splitlines()[1]

'Genesis 1:2\tAnd the earth was without form, and void; and darkness [was] upon the face of the deep. And the Spirit of God moved upon the face of the waters.'

In [None]:
# Keep only the verse text that follows the tab on each line.
# - Lines without a tab (e.g. blank or leftover header lines) are skipped
#   instead of raising IndexError.
# - maxsplit=1 keeps the whole verse even if its text contains a tab.
bible_verses = [
    line.split('\t', 1)[1]
    for line in bible.splitlines()
    if '\t' in line
]
bible_verses[:10]

['In the beginning God created the heaven and the earth.',
 'And the earth was without form, and void; and darkness [was] upon the face of the deep. And the Spirit of God moved upon the face of the waters.',
 'And God said, Let there be light: and there was light.',
 'And God saw the light, that [it was] good: and God divided the light from the darkness.',
 'And God called the light Day, and the darkness he called Night. And the evening and the morning were the first day.',
 'And God said, Let there be a firmament in the midst of the waters, and let it divide the waters from the waters.',
 'And God made the firmament, and divided the waters which [were] under the firmament from the waters which [were] above the firmament: and it was so.',
 'And God called the firmament Heaven. And the evening and the morning were the second day.',
 'And God said, Let the waters under the heaven be gathered together unto one place, and let the dry [land] appear: and it was so.',
 'And God called the dry

In [None]:
# Concatenate all verses into one string, separated by single spaces.
bible_text = ' '.join(bible_verses)

In [None]:
# Lower-case and tokenize the whole corpus into ONE flat token list
# (verse boundaries are not preserved), then show how many tokens it holds.
# NOTE: rebinds `tokens` from the toy Word2Vec cell above.
tokens = word_tokenize(bible_text.lower())
len(tokens)

957901

In [None]:
# Train on one token list per verse rather than on the whole corpus as a
# single "sentence". gensim's optimized training routine truncates any single
# sentence longer than 10,000 words, so passing the ~958k-token list as one
# sentence trains on the opening of Genesis only and leaves most of the
# vocabulary (e.g. 'jesus') at its random initial vectors.
verse_tokens = [word_tokenize(verse.lower()) for verse in bible_verses]
model = Word2Vec(verse_tokens, vector_size=100, window=5, min_count=1, sg=0)
model

<gensim.models.word2vec.Word2Vec at 0x7e2d723a6bf0>

In [None]:
# Look up embeddings by lower-case key: the corpus was lower-cased before
# tokenization, even though the printed labels are capitalised.
vector_god = model.wv['god']
vector_jesus = model.wv['jesus']

print(f"Vector for 'God': {vector_god}")
print(f"Vector for 'Jesus': {vector_jesus}")

Vector for 'God': [ 1.10151820e-01  2.23351717e-01 -2.50955969e-01  5.38868070e-01
  3.03258270e-01 -4.39407587e-01  4.03590463e-02  8.62978339e-01
 -3.98967355e-01 -4.35739458e-01  1.79952487e-01 -5.46692729e-01
 -2.38755733e-01  4.07779753e-01  4.37981784e-01 -1.10808857e-01
  4.71028268e-01 -1.51482616e-02 -1.45379886e-01 -5.91193259e-01
 -4.08541292e-01 -4.46228981e-02 -6.20529661e-03 -4.47401226e-01
 -2.73928586e-02 -1.37753934e-01 -2.75020063e-01  4.75306474e-02
 -3.06266546e-01  1.74939990e-01  6.27728164e-01 -3.39444757e-01
  7.31381536e-01 -6.22989058e-01  1.37449145e-01  7.57342458e-01
  1.24489866e-01  1.44603670e-01  5.32249883e-02 -4.15649593e-01
  3.28681350e-01 -1.89512119e-01 -6.19135909e-02 -1.11863025e-01
  3.77866417e-01 -3.82687330e-01 -4.64423180e-01  4.54800688e-02
  3.73258114e-01  1.83249876e-01  4.93123531e-01  2.56650094e-02
 -7.39880204e-02  1.73501283e-01  1.54798524e-02  7.36247599e-02
  2.71006301e-02 -9.23239365e-02 -3.80230278e-01  4.11156654e-01
  9.582

In [None]:
# Find most similar words
# NOTE(review): in the recorded output every neighbour is punctuation or a
# stop word with similarity of about 0.9998, which suggests the vectors are
# barely differentiated - check how the model was trained.
similar_words = model.wv.most_similar('god')
print(f"Words similar to 'God': {similar_words}")

Words similar to 'God': [(';', 0.9998259544372559), ('the', 0.999822199344635), ('and', 0.9998204708099365), (',', 0.9998149275779724), ('upon', 0.99980628490448), ('he', 0.9997936487197876), ('.', 0.9997931718826294), (':', 0.9997916221618652), ('in', 0.9997899532318115), ('a', 0.9997780323028564)]


In [None]:
# Find most similar words
# NOTE(review): the recorded neighbours are unrelated words with low
# similarity (about 0.31-0.37); presumably the vector for 'jesus' was hardly
# updated during training - confirm against the training input.
similar_words = model.wv.most_similar('jesus')
print(f"Words similar to 'Jesus': {similar_words}")

Words similar to 'Jesus': [('godly', 0.3745896816253662), ('kareah', 0.36409634351730347), ('eightieth', 0.3470560610294342), ('exchange', 0.3424372375011444), ('jarmuth', 0.3419804573059082), ('readeth', 0.3345049023628235), ('whiter', 0.33197978138923645), ('baana', 0.3273734748363495), ('obedience', 0.3242563307285309), ('zechariah', 0.3153020441532135)]
