In [1]:
import nltk
import spacy
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag


In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to C:\Users\RAVI
[nltk_data]     PATHAK\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\RAVI
[nltk_data]     PATHAK\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\RAVI PATHAK\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [4]:
nlp = spacy.load("en_core_web_sm")

In [5]:
paragraph = """
SpaCy is an NLP library that excels at large-scale text processing. It can be used for tasks such as tokenization, lemmatization, 
part-of-speech tagging, and named entity recognition. NLTK is another Python library that provides easy-to-use interfaces 
for over 50 corpora and lexical resources. Both libraries have their strengths, and understanding when to use each is important.
"""

In [6]:
sentences = sent_tokenize(paragraph)
print("Sentence Tokenization:")
print(sentences)


Sentence Tokenization:
['\nSpaCy is an NLP library that excels at large-scale text processing.', 'It can be used for tasks such as tokenization, lemmatization, \npart-of-speech tagging, and named entity recognition.', 'NLTK is another Python library that provides easy-to-use interfaces \nfor over 50 corpora and lexical resources.', 'Both libraries have their strengths, and understanding when to use each is important.']


In [7]:
words = word_tokenize(paragraph)
print("\nWord Tokenization (NLTK):")
print(words)


Word Tokenization (NLTK):
['SpaCy', 'is', 'an', 'NLP', 'library', 'that', 'excels', 'at', 'large-scale', 'text', 'processing', '.', 'It', 'can', 'be', 'used', 'for', 'tasks', 'such', 'as', 'tokenization', ',', 'lemmatization', ',', 'part-of-speech', 'tagging', ',', 'and', 'named', 'entity', 'recognition', '.', 'NLTK', 'is', 'another', 'Python', 'library', 'that', 'provides', 'easy-to-use', 'interfaces', 'for', 'over', '50', 'corpora', 'and', 'lexical', 'resources', '.', 'Both', 'libraries', 'have', 'their', 'strengths', ',', 'and', 'understanding', 'when', 'to', 'use', 'each', 'is', 'important', '.']


In [8]:
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words]
print("\nFiltered Words (Stopwords Removed):")
print(filtered_words)



Filtered Words (Stopwords Removed):
['SpaCy', 'NLP', 'library', 'excels', 'large-scale', 'text', 'processing', '.', 'used', 'tasks', 'tokenization', ',', 'lemmatization', ',', 'part-of-speech', 'tagging', ',', 'named', 'entity', 'recognition', '.', 'NLTK', 'another', 'Python', 'library', 'provides', 'easy-to-use', 'interfaces', '50', 'corpora', 'lexical', 'resources', '.', 'libraries', 'strengths', ',', 'understanding', 'use', 'important', '.']


In [9]:
pos_tags = pos_tag(filtered_words)
print("\nPOS Tagging (NLTK):")
print(pos_tags)


POS Tagging (NLTK):
[('SpaCy', 'NNP'), ('NLP', 'NNP'), ('library', 'JJ'), ('excels', 'NNS'), ('large-scale', 'JJ'), ('text', 'NN'), ('processing', 'NN'), ('.', '.'), ('used', 'VBN'), ('tasks', 'NNS'), ('tokenization', 'NN'), (',', ','), ('lemmatization', 'NN'), (',', ','), ('part-of-speech', 'JJ'), ('tagging', 'NN'), (',', ','), ('named', 'VBN'), ('entity', 'NN'), ('recognition', 'NN'), ('.', '.'), ('NLTK', 'NNP'), ('another', 'DT'), ('Python', 'NNP'), ('library', 'NN'), ('provides', 'VBZ'), ('easy-to-use', 'JJ'), ('interfaces', 'NNS'), ('50', 'CD'), ('corpora', 'NNS'), ('lexical', 'JJ'), ('resources', 'NNS'), ('.', '.'), ('libraries', 'NNS'), ('strengths', 'NNS'), (',', ','), ('understanding', 'VBG'), ('use', 'NN'), ('important', 'JJ'), ('.', '.')]


In [10]:
doc = nlp(paragraph)

In [11]:

print("\nLemmatization (SpaCy):")
for token in doc:
    print(f"Word: {token.text}, Lemma: {token.lemma_}, POS: {token.pos_}")


Lemmatization (SpaCy):
Word: 
, Lemma: 
, POS: SPACE
Word: SpaCy, Lemma: SpaCy, POS: PROPN
Word: is, Lemma: be, POS: AUX
Word: an, Lemma: an, POS: DET
Word: NLP, Lemma: NLP, POS: PROPN
Word: library, Lemma: library, POS: NOUN
Word: that, Lemma: that, POS: PRON
Word: excels, Lemma: excel, POS: VERB
Word: at, Lemma: at, POS: ADP
Word: large, Lemma: large, POS: ADJ
Word: -, Lemma: -, POS: PUNCT
Word: scale, Lemma: scale, POS: NOUN
Word: text, Lemma: text, POS: NOUN
Word: processing, Lemma: processing, POS: NOUN
Word: ., Lemma: ., POS: PUNCT
Word: It, Lemma: it, POS: PRON
Word: can, Lemma: can, POS: AUX
Word: be, Lemma: be, POS: AUX
Word: used, Lemma: use, POS: VERB
Word: for, Lemma: for, POS: ADP
Word: tasks, Lemma: task, POS: NOUN
Word: such, Lemma: such, POS: ADJ
Word: as, Lemma: as, POS: ADP
Word: tokenization, Lemma: tokenization, POS: NOUN
Word: ,, Lemma: ,, POS: PUNCT
Word: lemmatization, Lemma: lemmatization, POS: NOUN
Word: ,, Lemma: ,, POS: PUNCT
Word: 
, Lemma: 
, POS: SPACE
Wo

In [12]:
print("\nNamed Entity Recognition (SpaCy):")
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")


Named Entity Recognition (SpaCy):
Entity: SpaCy, Label: PERSON
Entity: NLP, Label: ORG
Entity: NLTK, Label: ORG
Entity: over 50, Label: CARDINAL
