# Tokenization

In [22]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Fetch the 'punkt' resource before any tokenizing is attempted.
nltk.download('punkt')

text = "Natural Language Processing (NLP) is amazing! Let's explore it."

# Split the sample at both granularities first, then report the results.
word_tokens = word_tokenize(text)       # words and punctuation marks
sentence_tokens = sent_tokenize(text)   # whole sentences

print("Word Tokens:", word_tokens)
print("Sentence Tokens:", sentence_tokens)


Word Tokens: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'amazing', '!', 'Let', "'s", 'explore', 'it', '.']
Sentence Tokens: ['Natural Language Processing (NLP) is amazing!', "Let's explore it."]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIET\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Lowercasing in NLP (Text Preprocessing Example)

In [34]:
import nltk
from nltk.tokenize import word_tokenize

# Make sure the 'punkt' resource is available before tokenizing.
nltk.download('punkt')

text = "Hello World! NLP is Fun."
tokens = word_tokenize(text)

# Normalize case token by token; punctuation tokens pass through unchanged.
lower_tokens = list(map(str.lower, tokens))
print(lower_tokens)


['hello', 'world', '!', 'nlp', 'is', 'fun', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIET\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Example 4: Lowercasing Using pandas (Useful for Large Text Datasets)

In [35]:
import pandas as pd

# Build the sample frame and derive its lowercase column in one chained
# expression, so the cell never mutates an existing frame.
data = pd.DataFrame(
    {'Text': ["HELLO World!", "THIS is NLP.", "Machine Learning"]}
).assign(Lowercase_Text=lambda frame: frame['Text'].str.lower())

print(data)


               Text    Lowercase_Text
0      HELLO World!      hello world!
1      THIS is NLP.      this is nlp.
2  Machine Learning  machine learning


# STOP WORD REMOVAL

In [24]:
from nltk.corpus import stopwords

import string

# Fetch the stop-word lists (relies on the `nltk` module imported in an
# earlier cell).
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
text = "This is a simple example to demonstrate stopword removal."

# Whitespace splitting leaves punctuation glued to words ("removal."), so a
# stop word that is followed by punctuation (e.g. "it.") would never match
# the stop-word set if compared as-is.
word = text.split()

# Compare a punctuation-stripped, lowercased copy of each token against the
# stop-word set, but keep the original token text in the result. The loop
# variable is named `token` so it no longer shadows the `word` list.
filtered_words = [
    token for token in word
    if token.strip(string.punctuation).lower() not in stop_words
]

print("Filtered Words:", filtered_words)

Filtered Words: ['simple', 'example', 'demonstrate', 'stopword', 'removal.']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIET\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 3. Stemming (Reducing Words to Their Root Form)

In [25]:
from nltk.stem import PorterStemmer

# Reduce every token produced by the tokenization cell to its Porter stem.
ps = PorterStemmer()
stemmed_words = list(map(ps.stem, word_tokens))

print("Stemmed Words:", stemmed_words)


Stemmed Words: ['natur', 'languag', 'process', '(', 'nlp', ')', 'is', 'amaz', '!', 'let', "'s", 'explor', 'it', '.']


# 4. Lemmatization (More Advanced Root Word Extraction)

In [28]:
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet data used by the lemmatizer. 'omw-1.4' is fetched here as
# well so that a top-to-bottom run does not depend on the separate download
# cell that appears after this one in the notebook.
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

# NOTE(review): lemmatize() is called without a part-of-speech argument and the
# recorded output equals the input tokens - confirm whether a POS tag should
# be passed for a more meaningful demonstration.
lemmatized_words = [lemmatizer.lemmatize(token) for token in word_tokens]

print("Lemmatized Words:", lemmatized_words)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KIET\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Lemmatized Words: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'amazing', '!', 'Let', "'s", 'explore', 'it', '.']


In [27]:
import nltk
# Fetch the 'omw-1.4' package; the call's return value (True in the recorded
# run) is displayed as the cell result.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\KIET\AppData\Roaming\nltk_data...


True

# 5. Part-of-Speech (POS) Tagging

In [29]:
from nltk import pos_tag

# Fetch the tagger resource before tagging.
nltk.download('averaged_perceptron_tagger')

# Tag the tokens from the tokenization cell; the result is a list of
# (token, tag) pairs that the NER cell reuses.
pos_tags = pos_tag(word_tokens)
print("POS Tags:", pos_tags)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\KIET\AppData\Roaming\nltk_data...


POS Tags: [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('amazing', 'JJ'), ('!', '.'), ('Let', 'NNP'), ("'s", 'POS'), ('explore', 'VB'), ('it', 'PRP'), ('.', '.')]


[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


# 6. Named Entity Recognition (NER)

In [30]:
from nltk.chunk import ne_chunk

# Fetch both resources before chunking, in the same order as listed.
for resource in ('maxent_ne_chunker', 'words'):
    nltk.download(resource)

# Chunk the POS-tagged tokens into a tree and print it.
ner_tree = ne_chunk(pos_tags)
print("Named Entities:", ner_tree)


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\KIET\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\KIET\AppData\Roaming\nltk_data...


Named Entities: (S
  Natural/JJ
  Language/NNP
  Processing/NNP
  (/(
  (ORGANIZATION NLP/NNP)
  )/)
  is/VBZ
  amazing/JJ
  !/.
  Let/NNP
  's/POS
  explore/VB
  it/PRP
  ./.)


[nltk_data]   Unzipping corpora\words.zip.


# 7. Frequency Distribution of Words

In [31]:
from nltk.probability import FreqDist

# Count how often each token from the tokenization cell occurs.
fdist = FreqDist(word_tokens)

# Report the five most frequent (token, count) pairs.
top_five = fdist.most_common(5)
print("Most Common Words:", top_five)


Most Common Words: [('Natural', 1), ('Language', 1), ('Processing', 1), ('(', 1), ('NLP', 1)]


# 8. Text Classification using Naïve Bayes Classifier

In [33]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

# Fetch the movie_reviews corpus that is read below to build the dataset.
nltk.download('movie_reviews')

# Prepare the dataset
def extract_features(words):
    """Build a bag-of-words feature dict that maps each word to ``True``.

    Args:
        words: Iterable of hashable tokens; repeated tokens produce one key.

    Returns:
        dict: ``{word: True}`` for every distinct word, in first-seen order.
    """
    return dict.fromkeys(words, True)

def _labeled_reviews(label):
    """Pair the feature dict of every review filed under `label` with that label."""
    return [
        (extract_features(movie_reviews.words(fileid)), label)
        for fileid in movie_reviews.fileids(label)
    ]


positive_reviews = _labeled_reviews('pos')
negative_reviews = _labeled_reviews('neg')

# The first 800 reviews of each class train the model; the rest is held out.
split = 800
train_data = positive_reviews[:split] + negative_reviews[:split]
test_data = positive_reviews[split:] + negative_reviews[split:]

# Fit the classifier on the training portion
classifier = NaiveBayesClassifier.train(train_data)

# Score it on the held-out portion
print("Accuracy:", nltk.classify.accuracy(classifier, test_data))

# List the five most informative features
classifier.show_most_informative_features(5)


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\KIET\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Accuracy: 0.735
Most Informative Features
             outstanding = True              pos : neg    =     13.9 : 1.0
               insulting = True              neg : pos    =     13.7 : 1.0
              vulnerable = True              pos : neg    =     13.0 : 1.0
               ludicrous = True              neg : pos    =     12.6 : 1.0
             uninvolving = True              neg : pos    =     12.3 : 1.0


# Working with BERT Tokenizer

In [1]:
from transformers import BertTokenizer

# Load BERT tokenizer
# Name of the pretrained checkpoint the tokenizer is loaded from
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)

# Sentence used by this and the following cells
text = "Hugging Face makes NLP easy and efficient!"

# Calling the tokenizer directly yields input_ids, token_type_ids and
# attention_mask (see the recorded output).
tokens = tokenizer(text)
print(tokens)


  from .autonotebook import tqdm as notebook_tqdm


{'input_ids': [101, 17662, 2227, 3084, 17953, 2361, 3733, 1998, 8114, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


# Detailed Tokenization Steps: Tokenization - convert text into tokens (subword units).


In [13]:
# Split the sample text into token strings; in the recorded output "NLP"
# comes back as the two pieces 'nl' and '##p'. Rebinds `tokens` to this list.
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

Tokens: ['hugging', 'face', 'makes', 'nl', '##p', 'easy', 'and', 'efficient', '!']


# Converting Tokens into IDs

In [3]:
# Map each token string in `tokens` to its integer ID.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Token IDs:", token_ids)


Token IDs: [17662, 2227, 3084, 17953, 2361, 3733, 1998, 8114, 999]


# Add special tokens

In [5]:
# Encode the text with special tokens enabled; compared with the plain token
# IDs, the recorded result gains 101 at the start and 102 at the end.
tokens_with_special = tokenizer.encode(text, add_special_tokens=True)
print("Tokens with Special Tokens:", tokens_with_special)


Tokens with Special Tokens: [101, 17662, 2227, 3084, 17953, 2361, 3733, 1998, 8114, 999, 102]


# Generate Attention Masks - Attention masks tell the model which tokens to focus on (1) and which to ignore (0).

In [7]:
# Encode the sample text and request tensor output (return_tensors="pt").
encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

# Print each field of interest under its caption, in a fixed order.
for caption, field in (("Input IDs:", "input_ids"), ("Attention Mask:", "attention_mask")):
    print(caption, encoded[field])


Input IDs: tensor([[  101, 17662,  2227,  3084, 17953,  2361,  3733,  1998,  8114,   999,
           102]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


# Using BERT Tokenizer for Multiple Sentences

In [11]:
texts = ["Transformers are amazing.", "They are used in NLP extensively."]

# Encode both sentences in one call; per the recorded output the shorter row
# is padded with 0s and those positions carry a 0 in the attention mask.
encoded_batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

batch_report = (
    ("Input IDs:", "input_ids"),
    ("Attention Mask:", "attention_mask"),
)
for caption, field in batch_report:
    print(caption, encoded_batch[field])


Input IDs: tensor([[  101, 19081,  2024,  6429,  1012,   102,     0,     0,     0,     0],
        [  101,  2027,  2024,  2109,  1999, 17953,  2361,  8077,  1012,   102]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


# Working with BERT Tokenizer for a Pair of Sentences

In [12]:
text1 = "BERT is great for NLP."
text2 = "It uses transformers to understand context."

# Encode the two sentences together as one pair
encoded_pair = tokenizer(text1, text2, add_special_tokens=True, return_tensors="pt")

# token_type_ids differentiates Sentence 1 and 2
for caption, field in (
    ("Input IDs:", "input_ids"),
    ("Token Type IDs:", "token_type_ids"),
    ("Attention Mask:", "attention_mask"),
):
    print(caption, encoded_pair[field])


Input IDs: tensor([[  101, 14324,  2003,  2307,  2005, 17953,  2361,  1012,   102,  2009,
          3594, 19081,  2000,  3305,  6123,  1012,   102]])
Token Type IDs: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
