In [3]:
!pip install nltk



In [4]:
import nltk
nltk.download('punkt')  # Tokenizer models
nltk.download('wordnet')  # For lemmatization
nltk.download('stopwords')  # Common stop words


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
#Tokenization
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Sample text
text = "I love NLP! It's fascinating to learn about text processing."

# Tokenizing into words
word_tokens = word_tokenize(text)
print("Word Tokens:", word_tokens)

# Tokenizing into sentences
sentence_tokens = sent_tokenize(text)
print("Sentence Tokens:", sentence_tokens)

Word Tokens: ['I', 'love', 'NLP', '!', 'It', "'s", 'fascinating', 'to', 'learn', 'about', 'text', 'processing', '.']
Sentence Tokens: ['I love NLP!', "It's fascinating to learn about text processing."]


In [6]:
#Stemming
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
words = ["running", "jumps", "easily", "fairly"]
stemmed_words = [stemmer.stem(word) for word in words]
print("Stemmed Words:", stemmed_words)


Stemmed Words: ['run', 'jump', 'easili', 'fairli']


In [7]:
#Lemmatization
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
words = ["running", "better", "easily", "fairly"]
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
print("Lemmatized Words:", lemmatized_words)


Lemmatized Words: ['running', 'better', 'easily', 'fairly']


In [8]:
#Stop Word Removal
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))
words = ["I", "love", "learning", "about", "NLP", "and", "text", "processing"]
filtered_words = [word for word in words if word.lower() not in stop_words]
print("Filtered Words:", filtered_words)


Filtered Words: ['love', 'learning', 'NLP', 'text', 'processing']


In [9]:
#Text Normalization
import re

text = "Text Processing is AMAZING! Isn't it?"
# Lowercase conversion
text = text.lower()

# Removing punctuation
text = re.sub(r'[^\w\s]', '', text)
print("Normalized Text:", text)


Normalized Text: text processing is amazing isnt it


In [10]:
#Corpora and Lexical Resources
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg

sample_text = gutenberg.raw('austen-emma.txt')
print("Sample Text:", sample_text[:500])  # Print first 500 characters of Emma by Jane Austen

Sample Text: [Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very early period.  Her mother
had died t


[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [11]:
#Text Processing Libraries
nltk.download('averaged_perceptron_tagger')
tokens = nltk.word_tokenize("NLP with NLTK is interesting.")
pos_tags = nltk.pos_tag(tokens)
print("Part-of-Speech Tags:", pos_tags)


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Part-of-Speech Tags: [('NLP', 'NN'), ('with', 'IN'), ('NLTK', 'NNP'), ('is', 'VBZ'), ('interesting', 'JJ'), ('.', '.')]


In [13]:
!pip install spacy
!python -m spacy download en_core_web_md
!python -m spacy download en_core_web_lg
!python -m spacy download en_core_web_sm


Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation succe

In [14]:
#Pre-trained Models
import spacy

# Load the pre-trained model
nlp = spacy.load("en_core_web_sm")
text = "Apple is looking at buying a U.K. startup for $1 billion."

# Process text
doc = nlp(text)
for token in doc:
	print(token.text, token.lemma_, token.pos_, token.ent_type_)

 # OUTPUT Explained
# "Apple nsubj looking": "Apple" is the nominal subject (nsubj) of "looking," meaning Apple is the entity performing the action of "looking."
# "is aux looking": "is" is an auxiliary verb (aux) supporting the main verb "looking."
# "looking ROOT looking": "looking" is the root of the sentence, meaning it is the main action or verb.
# "at prep looking": "at" is a preposition (prep) linking "looking" with what Apple is looking at.
# "buying pcomp at": "buying" is the prepositional complement (pcomp) to "at," which makes "buying" the focus of what Apple is looking to do.
# "a det startup": "a" is a determiner (det) that modifies "startup."
# "U.K. compound startup": "U.K." is a compound modifier (compound) describing "startup."
# "startup dobj buying": "startup" is the direct object (dobj) of "buying," indicating what is being bought.
# "for prep startup": "for" is a preposition (prep) linking "startup" with the price or reason.
# "$ quantmod billion": "$" is a quantitative modifier (quantmod) modifying "billion."
# "1 compound billion": "1" is a compound modifier (compound) modifying "billion," together forming "1 billion."
# "billion pobj for": "billion" is the object of the preposition (pobj) "for," indicating the price.
# ". punct looking": "." is a punctuation (punct) marking the end of the sentence.

Apple Apple PROPN ORG
is be AUX 
looking look VERB 
at at ADP 
buying buy VERB 
a a DET 
U.K. U.K. PROPN GPE
startup startup NOUN 
for for ADP 
$ $ SYM MONEY
1 1 NUM MONEY
billion billion NUM MONEY
. . PUNCT 


In [15]:
#Named Entity Recognition (NER)
for ent in doc.ents:
	print(ent.text, ent.label_)
#OUTPUT EXPLAINED
#Apple ORG: "Apple" is identified as an organization (ORG), representing a recognized company or corporate entity.
#U.K. GPE: "U.K." is identified as a geopolitical entity (GPE), which includes countries, cities, and regions.
#$1 billion MONEY: "$1 billion" is recognized as an amount of money (MONEY), indicating a financial value.

Apple ORG
U.K. GPE
$1 billion MONEY


In [16]:
#Dependency Parsing
for token in doc:
	print(token.text, token.dep_, token.head.text)
# OUTPUT Explained
# "Apple nsubj looking": "Apple" is labeled as the nominal subject (nsubj) of "looking," meaning Apple is the entity performing the action of "looking."
# "is aux looking": "is" is labeled as an auxiliary verb (aux) supporting the main verb "looking," forming part of the verb phrase "is looking."
# "looking ROOT looking": "looking" is the root of the sentence, meaning it’s the main action or verb in the structure.
# "at prep looking": "at" is a preposition (prep) linking "looking" with the activity or purpose of looking (i.e., what Apple is looking "at").
# "buying pcomp at": "buying" is labeled as the prepositional complement (pcomp) to the preposition "at," indicating the action Apple is focusing on through "looking."
# "a det startup": "a" is a determiner (det) that modifies "startup," specifying that "a" single startup is being referred to.
# "U.K. compound startup": "U.K." is a compound modifier (compound) that further describes "startup," indicating a more specific type or origin of the startup.
# "startup dobj buying": "startup" is the direct object (dobj) of the verb "buying," meaning it’s the entity being bought.
# "for prep startup": "for" is a preposition (prep) that links "startup" with information related to the reason or price (in this case, the amount).
# "$ quantmod billion": "$" is a quantitative modifier (quantmod) that modifies "billion," giving a sense of currency.
# "1 compound billion": "1" is a compound modifier (compound) that modifies "billion," forming "1 billion" together.
# "billion pobj for": "billion" is the object of the preposition "for" (pobj), indicating the price.
# ". punct looking": "." is a punctuation (punct) that marks the end of the sentence.

Apple nsubj looking
is aux looking
looking ROOT looking
at prep looking
buying pcomp at
a det startup
U.K. compound startup
startup dobj buying
for prep startup
$ quantmod billion
1 compound billion
billion pobj for
. punct looking


In [17]:
#Text Similarity
doc1 = nlp("I love NLP.")
doc2 = nlp("Natural language processing is fascinating.")
print("Similarity Score:", doc1.similarity(doc2))


Similarity Score: 0.36213603317675636


  print("Similarity Score:", doc1.similarity(doc2))
