### 1) Simple tokenization using Python

In [1]:
# Whitespace tokenization with str.split(sep=None, maxsplit=-1).
# With no separator given, the string is split on runs of whitespace, so
# punctuation stays attached to the neighbouring word ("NLP." keeps its period).
text = "Tokenization is an important step in NLP."
tokens = text.split(sep=None, maxsplit=-1)
print(tokens)

['Tokenization', 'is', 'an', 'important', 'step', 'in', 'NLP.']


### 2) Tokenization using regular expression

In [2]:
import re

# \b\w+\b matches each run of word characters delimited by word boundaries,
# so the trailing period is dropped instead of sticking to "NLP".
text = "Tokenization is an important step in NLP."
tokens = [match.group(0) for match in re.finditer(r'\b\w+\b', text)]
print(tokens)

['Tokenization', 'is', 'an', 'important', 'step', 'in', 'NLP']


### 3) Sentence and word tokenization using NLTK - Punkt - Punctuation-based Tokenizer

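Punkt is an unsupervised sentence tokenizer: it learns abbreviations, collocations, and words that typically start sentences from unlabeled text, and combines that knowledge with punctuation and capitalization patterns to decide where sentences end. NLTK ships pre-trained Punkt models, including one for English.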

In [3]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the Punkt tokenizer data. 'punkt_tab' was added because it was the
# resource reported as missing.
nltk.download('punkt')
nltk.download('punkt_tab')

text = "Tokenization is important. It is the first step in NLP."

# Tokenize the same text twice: once into sentences, once into words.
sentences = sent_tokenize(text)
words = word_tokenize(text)

print("Sentences:", sentences)
print("Words:", words)

[nltk_data] Downloading package punkt to /home/aryaniyaps/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/aryaniyaps/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Sentences: ['Tokenization is important.', 'It is the first step in NLP.']
Words: ['Tokenization', 'is', 'important', '.', 'It', 'is', 'the', 'first', 'step', 'in', 'NLP', '.']


### 4) Sentence and word tokenization using spaCy

spaCy is an open-source Python library for advanced Natural Language Processing. It includes tokenization, sentence segmentation, part-of-speech tagging, lemmatization, named entity recognition (NER), dependency parsing, and word vectors. spaCy's tokenizer itself is rule-based, but downstream components such as the tagger, parser, and named entity recognizer use statistical, neural network–based models, making the pipeline accurate and fast for large-scale text processing.

In [4]:
import spacy

# "en_core_web_sm" is the small pre-trained English pipeline ("English core
# web - small"). Loading it returns the callable used below to process text.
nlp = spacy.load("en_core_web_sm")

text = "Tokenization is an important step in NLP."

# Processing the text yields a document that can be iterated token by token;
# .text is the token's string as it appears in the text.
doc = nlp(text)
tokens = [tok.text for tok in doc]

print(tokens)

['Tokenization', 'is', 'an', 'important', 'step', 'in', 'NLP', '.']


### 5) Subword tokenization using Transformers (BERT tokenizer)

In [5]:
from transformers import BertTokenizer

# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
text = "Tokenization is an important step in NLP."

# tokenize() returns the subword pieces as strings.
tokens = tokenizer.tokenize(text)
# encode() returns vocabulary IDs; add_special_tokens=True is its default and
# is spelled out here because it makes the ID list longer than the token list.
token_ids = tokenizer.encode(text, add_special_tokens=True)

print("Tokens:", tokens)
print("Token IDs:", token_ids)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Tokens: ['token', '##ization', 'is', 'an', 'important', 'step', 'in', 'nl', '##p', '.']
Token IDs: [101, 19204, 3989, 2003, 2019, 2590, 3357, 1999, 17953, 2361, 1012, 102]


### 6) Basic Stopword Removal using pure Python

In [6]:
import string

stop_words = {"is", "an", "of", "the", "and"}


def remove_stop_words(tokens, stop_words):
    """Return the tokens that are not stop words, preserving order.

    Each token is compared case-insensitively and with leading/trailing
    punctuation ignored, so whitespace-split tokens such as "the." or "(An)"
    are still recognised as stop words. Tokens that are kept are returned
    unmodified (their punctuation is not stripped).

    Args:
        tokens: Iterable of string tokens.
        stop_words: Collection of lowercase stop words to drop.

    Returns:
        A list of the tokens that are not stop words.
    """
    return [
        token
        for token in tokens
        if token.strip(string.punctuation).lower() not in stop_words
    ]


text = "Natural Language Processing is an important part of Artificial Intelligence."
tokens = text.split()

# Whitespace splitting leaves punctuation attached to words, which is why the
# comparison inside remove_stop_words() ignores surrounding punctuation.
filtered_tokens = remove_stop_words(tokens, stop_words)

print(filtered_tokens)

['Natural', 'Language', 'Processing', 'important', 'part', 'Artificial', 'Intelligence.']


### 7) Stop word Removal using NLTK

In [7]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the word_tokenize() data from 'punkt_tab' rather
# than 'punkt'. Fetch it here too so this cell does not depend on another cell
# having downloaded it first.
nltk.download('punkt_tab')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

text = "Natural Language Processing is an important part of Artificial Intelligence."

# Load the English stopword list as a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Tokenize text
tokens = word_tokenize(text)

# Remove stopwords; tokens are lowercased only for the comparison
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

print(filtered_tokens)

['Natural', 'Language', 'Processing', 'important', 'part', 'Artificial', 'Intelligence', '.']


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aryaniyaps/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/aryaniyaps/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 8) Stop word Removal using spaCy

In [8]:
import spacy

nlp = spacy.load("en_core_web_sm")
text = "Natural Language Processing is an important part of Artificial Intelligence."

doc = nlp(text)

# Keep only tokens flagged neither as a stop word nor as punctuation.
filtered_tokens = [tok.text for tok in doc if not (tok.is_stop or tok.is_punct)]

print(filtered_tokens)

['Natural', 'Language', 'Processing', 'important', 'Artificial', 'Intelligence']


### 9) Simple rule-based stemming (basic Python)

In [9]:
def simple_stem(word, min_stem_len=3):
    """Strip one common English suffix from ``word`` (naive rule-based stemmer).

    The suffixes "ing", "ed", "ly", "es" and "s" are tried in that order and
    the first one that matches decides the result. The suffix is removed only
    if at least ``min_stem_len`` characters remain; otherwise the word is
    returned unchanged. This prevents short words from collapsing into empty
    or meaningless stems (e.g. "ing" -> "", "is" -> "i", "sing" -> "s").

    Args:
        word: Token to stem. Matching is case-sensitive.
        min_stem_len: Minimum length of the stem left after stripping.
            Pass 0 to strip unconditionally.

    Returns:
        The word without its suffix, or the word itself when no suffix
        matches or the remaining stem would be too short.
    """
    for suffix in ("ing", "ed", "ly", "es", "s"):
        if word.endswith(suffix):
            stem = word[:-len(suffix)]
            # First matching suffix wins; fall back to the whole word rather
            # than trying shorter suffixes when the stem is too short.
            return stem if len(stem) >= min_stem_len else word
    return word

# Apply the rule-based stemmer to three inflected forms of "play".
text = "playing played plays"
tokens = text.split()

stemmed_tokens = list(map(simple_stem, tokens))
print(stemmed_tokens)

['play', 'play', 'play']


### 10) Stemming using NLTK – Porter Stemmer

In [10]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
# Recent NLTK releases load the word_tokenize() data from 'punkt_tab' rather
# than 'punkt'. Fetch it here too so this cell does not depend on another cell
# having downloaded it first.
nltk.download('punkt_tab')

text = "Computing computers computed computation"
tokens = word_tokenize(text)

# Reduce every token to its Porter stem.
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in tokens]
print(stemmed_tokens)

['comput', 'comput', 'comput', 'comput']


[nltk_data] Downloading package punkt to /home/aryaniyaps/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 11) Stemming using NLTK – Snowball Stemmer (Improved Porter)

In [11]:
from nltk.stem import SnowballStemmer

# The constructor argument selects the language of the stemmer.
stemmer = SnowballStemmer("english")

text = "Running runs ran easily fairly"
tokens = text.split()

# Stem each whitespace-separated token.
stemmed_tokens = list(map(stemmer.stem, tokens))

print(stemmed_tokens)

['run', 'run', 'ran', 'easili', 'fair']


### 12) Stemming using RSLP Stemmer

RSLP stands for *Removedor de Sufixos da Língua Portuguesa* (Portuguese Language Suffix Remover). It is a rule-based stemming algorithm designed for the Portuguese language and is implemented in NLTK.

In [12]:
import nltk
from nltk.stem import RSLPStemmer

nltk.download('rslp')  # Added to download the missing resource

stemmer = RSLPStemmer()

# Sample words to run through the stemmer.
words = ["amigos", "amigável", "amizades", "correndo"]
stemmed_words = list(map(stemmer.stem, words))

print(stemmed_words)

[nltk_data] Downloading package rslp to /home/aryaniyaps/nltk_data...


['amig', 'amig', 'amizad', 'corr']


[nltk_data]   Unzipping stemmers/rslp.zip.


### 13) Lemmatization using NLTK (WordNet Lemmatizer)

In [13]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
# Recent NLTK releases load the word_tokenize() data from 'punkt_tab' rather
# than 'punkt'. Fetch it here too so this cell does not depend on another cell
# having downloaded it first.
nltk.download('punkt_tab')

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

text = "The children are running faster than the mice"
tokens = word_tokenize(text)

# lemmatize() is called without a part-of-speech argument here, so every
# token is looked up with the lemmatizer's default part of speech.
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]

print(lemmatized_words)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/aryaniyaps/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/aryaniyaps/nltk_data...
[nltk_data] Downloading package punkt to /home/aryaniyaps/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['The', 'child', 'are', 'running', 'faster', 'than', 'the', 'mouse']


### 14) Lemmatization with Part-of-Speech (More Accurate)

In [14]:
from nltk.corpus import wordnet

def get_wordnet_pos(tag):
    """Map a POS tag to the corresponding WordNet part-of-speech constant.

    The tag's prefix selects the category: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

nltk.download('averaged_perceptron_tagger_eng')  # Corrected resource name

# `tokens` and `lemmatizer` are the objects created in the previous
# (WordNet lemmatizer) cell; tag each token with its part of speech first.
pos_tags = nltk.pos_tag(tokens)

# Lemmatize each token using the WordNet category derived from its tag.
lemmatized_words = [
    lemmatizer.lemmatize(token, get_wordnet_pos(tag))
    for token, tag in pos_tags
]

print(lemmatized_words)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/aryaniyaps/nltk_data...


['The', 'child', 'be', 'run', 'faster', 'than', 'the', 'mouse']


[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


### 15) Lemmatization using spaCy (Recommended)

In [15]:
import spacy

nlp = spacy.load("en_core_web_sm")
text = "The children are running faster than the mice"

# Process the text, then collect each token's lemma string.
doc = nlp(text)
lemmatized_tokens = [tok.lemma_ for tok in doc]

print(lemmatized_tokens)

['the', 'child', 'be', 'run', 'fast', 'than', 'the', 'mouse']


## PART B: Try-Yourself Lab Problems (will be given later)
## PART C: Mini Project (will be given later)