# Tokenizer

In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize

text = "Natural Language Processing is fun! Let's learn it together."
# Word Tokenization
word_tokens = word_tokenize(text)
print("Word Tokens:", word_tokens)

# Sentence Tokenization
sentence_tokens = sent_tokenize(text)
print("Sentence Tokens:", sentence_tokens)


Word Tokens: ['Natural', 'Language', 'Processing', 'is', 'fun', '!', 'Let', "'s", 'learn', 'it', 'together', '.']
Sentence Tokens: ['Natural Language Processing is fun!', "Let's learn it together."]


# Stemming

In [3]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
words = ["running", "runner", "ran", "easily", "fairly"]
stemmed_words = [stemmer.stem(word) for word in words]

print("Stemmed Words:", stemmed_words)


Stemmed Words: ['run', 'runner', 'ran', 'easili', 'fairli']


# Lemmatization

In [4]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
words = ["running", "runner", "ran", "easily", "fairly"]
lemmatized_words = [lemmatizer.lemmatize(word, pos="v") for word in words]

print("Lemmatized Words:", lemmatized_words)


Lemmatized Words: ['run', 'runner', 'run', 'easily', 'fairly']


# Name Entities Recognization

In [5]:
import spacy

nlp = spacy.load("en_core_web_sm")
text = "Apple is looking at buying a UK-based startup for $1 billion."

doc = nlp(text)
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")


Apple: ORG
UK: GPE
$1 billion: MONEY


# Ngram

In [6]:
import nltk
from nltk.util import ngrams

# Example text
text = "I love natural language processing"

# Tokenize the text
tokens = text.split()

# Generate bigrams (n=2)
bigrams = list(ngrams(tokens, 2))

print(bigrams)


[('I', 'love'), ('love', 'natural'), ('natural', 'language'), ('language', 'processing')]


# Parts-of-Speech tagging 

In [7]:
# Importing the NLTK library
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Sample text
#text = "NLTK is a powerful library for natural language processing."

# Performing PoS tagging
pos_tags = pos_tag(words)

# Displaying the PoS tagged result in separate lines
print("Original Text:")
print(text)

print("\nPoS Tagging Result:")
for word, pos_tag in pos_tags:
	print(f"{word}: {pos_tag}")


Original Text:
I love natural language processing

PoS Tagging Result:
running: VBG
runner: NN
ran: VBD
easily: RB
fairly: RB


In [8]:
#importing libraries 
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "SpaCy is a popular natural language processing library."

# Process the text with SpaCy
doc = nlp(text)

# Display the PoS tagged result
print("Original Text: ", text)
print("PoS Tagging Result:")
for token in doc:
	print(f"{token.text}: {token.pos_}")


Original Text:  SpaCy is a popular natural language processing library.
PoS Tagging Result:
SpaCy: PROPN
is: AUX
a: DET
popular: ADJ
natural: ADJ
language: NOUN
processing: NOUN
library: NOUN
.: PUNCT


# Stopwords Removal

In [9]:
import spacy

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "There is a pen on the table"

# Process the text using spaCy
doc = nlp(text)

# Remove stopwords
filtered_words = [token.text for token in doc if not token.is_stop]

# Join the filtered words to form a clean text
clean_text = ' '.join(filtered_words)

print("Original Text:", text)
print("Text after Stopword Removal:", clean_text)


Original Text: There is a pen on the table
Text after Stopword Removal: pen table


In [13]:
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import spacy

nlp=spacy.load("en_core_web_sm")



text = "Peter is better student and runner. He loves to play games. Everyone likes his charcter.He is a good boy. Anna is his friend who lives in London"

text=text.lower()

word_token=word_tokenize(text)
sent_token=word_tokenize(text)
stemmer=PorterStemmer()
stemmed_word=[stemmer.stem(word)for word in word_token]
lemmatizer=WordNetLemmatizer()
lemmatized_word=[lemmatizer.lemmatize(word,pos='v')for word in word_token]


ner=nlp(text)
for ent in ner.ents:
    print(f"{ent.text}:{ent.label_}")

filter_word=[token.text for token in doc if not token.is_stop]
clean_text=" ".join(filter_word)
print("Clean text:",clean_text)

bigrams=list(ngrams(word_token,2))

pos_tags=pos_tag(word_token)

for word,pos_tag in pos_tags:
    print(f"{word}: {pos_tag}")

print("Stemmed Words:", stemmed_word)
print("Lemmatized Words:", lemmatized_word)

print("-------------------------------------------------")
print("bigrams",bigrams)

vectorizer1=CountVectorizer()
vect_matrix=vectorizer1.fit_transform(lemmatized_word)
print("Vectorization using Countvectorizer")
print(vect_matrix.toarray())

vectorizer2=TfidfVectorizer()
tf_matrix=vectorizer2.fit_transform(lemmatized_word)
print("Vectorization using TfidfVectorizer")
print(tf_matrix.toarray())

peter:PERSON
anna:PERSON
london:GPE
Clean text: pen table
peter: NN
is: VBZ
better: RBR
student: NN
and: CC
runner: NN
.: .
he: PRP
loves: VBZ
to: TO
play: VB
games: NNS
.: .
everyone: NN
likes: VBZ
his: PRP$
charcter.he: NN
is: VBZ
a: DT
good: JJ
boy: NN
.: .
anna: NN
is: VBZ
his: PRP$
friend: NN
who: WP
lives: VBZ
in: IN
london: NN
Stemmed Words: ['peter', 'is', 'better', 'student', 'and', 'runner', '.', 'he', 'love', 'to', 'play', 'game', '.', 'everyon', 'like', 'hi', 'charcter.h', 'is', 'a', 'good', 'boy', '.', 'anna', 'is', 'hi', 'friend', 'who', 'live', 'in', 'london']
Lemmatized Words: ['peter', 'be', 'better', 'student', 'and', 'runner', '.', 'he', 'love', 'to', 'play', 'game', '.', 'everyone', 'like', 'his', 'charcter.he', 'be', 'a', 'good', 'boy', '.', 'anna', 'be', 'his', 'friend', 'who', 'live', 'in', 'london']
-------------------------------------------------
bigrams [('peter', 'is'), ('is', 'better'), ('better', 'student'), ('student', 'and'), ('and', 'runner'), ('runner'

# Vectorization Methods

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample data
documents = [
    "I love programming in Python",
    "Python programming is fun",
    "I love solving problems using Python"
]

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the documents
bow_matrix = vectorizer.fit_transform(documents)

# Display the vocabulary and the vectorized representation
print("Vocabulary:", vectorizer.vocabulary_)

print("BoW Matrix:\n", bow_matrix.toarray())


Vocabulary: {'love': 3, 'programming': 5, 'in': 1, 'python': 6, 'is': 2, 'fun': 0, 'solving': 7, 'problems': 4, 'using': 8}
BoW Matrix:
 [[0 1 0 1 0 1 1 0 0]
 [1 0 1 0 0 1 1 0 0]
 [0 0 0 1 1 0 1 1 1]]


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer2=TfidfVectorizer()

tf_matrix=vectorizer2.fit_transform(documents)

print(tf_matrix.toarray())
print(vectorizer2.vocabulary_)

[[0.         0.63174505 0.         0.4804584  0.         0.4804584
  0.37311881 0.         0.        ]
 [0.5844829  0.         0.5844829  0.         0.         0.44451431
  0.34520502 0.         0.        ]
 [0.         0.         0.         0.38376993 0.50461134 0.
  0.29803159 0.50461134 0.50461134]]
{'love': 3, 'programming': 5, 'in': 1, 'python': 6, 'is': 2, 'fun': 0, 'solving': 7, 'problems': 4, 'using': 8}


In [16]:
print(doc)

There is a pen on the table
