# NLTK

In [1]:
import nltk
from nltk.tokenize import word_tokenize

In [3]:
# Fetch the 'punkt_tab' tokenizer resource before tokenizing.
nltk.download('punkt_tab')

# Split a sample sentence into word-level tokens and show them.
text = "python is a programming language"
tokenized_text = word_tokenize(text)
print(tokenized_text)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...


['python', 'is', 'a', 'programming', 'language']


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [4]:
from nltk.corpus import stopwords

# Download the stop-word corpus, then hold the English list as a set
# so the membership checks in later cells are cheap.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Keep only the tokens whose lowercase form is not in the stop-word set.
filtered_text = []
for token in tokenized_text:
    if token.lower() in stop_words:
        continue
    filtered_text.append(token)
print(filtered_text)

['python', 'programming', 'language']


In [6]:
from nltk.stem import PorterStemmer

# Apply the Porter stemmer to every token that survived stop-word filtering.
ps = PorterStemmer()
stemmed_text = list(map(ps.stem, filtered_text))
print(stemmed_text)

['python', 'program', 'languag']


In [7]:
from nltk.stem import WordNetLemmatizer

# The WordNet data is downloaded before the lemmatizer is used.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# Each token is lemmatized on its own; no part-of-speech argument is passed.
lemmatized_text = list(map(lemmatizer.lemmatize, filtered_text))
print(lemmatized_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...


['python', 'programming', 'language']


In [8]:
from nltk import pos_tag

In [10]:
# Download the English averaged-perceptron tagger resource.
nltk.download('averaged_perceptron_tagger_eng')

# Tag the full token list (stop words included) and print the (token, tag) pairs.
tagged_text = pos_tag(tokenized_text)
print(tagged_text)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


[('python', 'NN'), ('is', 'VBZ'), ('a', 'DT'), ('programming', 'JJ'), ('language', 'NN')]


# spaCy

In [18]:
import spacy

# Load the English pipeline by its model name.
nlp = spacy.load('en_core_web_sm')

# Run the pipeline on a sample sentence and collect the text of every token.
text = nlp("Python is a programming language")
token_text = []
for token in text:
    token_text.append(token.text)
print(token_text)



['Python', 'is', 'a', 'programming', 'language']


In [19]:
# Named-entity recognition on a sample sentence.
# The `nlp` pipeline created by the preceding spacy.load('en_core_web_sm') cell
# is reused here; importing spaCy and loading the same model a second time only
# repeats the loading work and rebinds `nlp` to an equivalent pipeline.
# Later cells iterate over this `text` document.
text = nlp("Apple is a major tech company headquartered in Cupertino, California.")

# `text.ents` holds the entity spans; print each span's text with its label.
for ent in text.ents:
    print(ent.text, ent.label_)


Apple ORG
Cupertino GPE
California GPE


In [20]:
# For every token, show its text, its dependency label, and the text of its head token.
for token in text:
    print(f"{token.text} {token.dep_} {token.head.text}")

Apple nsubj is
is ROOT is
a det company
major amod company
tech compound company
company attr is
headquartered acl company
in prep headquartered
Cupertino pobj in
, punct Cupertino
California appos Cupertino
. punct is


In [22]:
from spacy.lang.en.stop_words import STOP_WORDS

# Keep the text of each token whose lowercased form is not in STOP_WORDS.
# Only stop-word membership is tested, so punctuation tokens are kept.
filtered = []
for token in text:
    if token.text.lower() not in STOP_WORDS:
        filtered.append(token.text)
print(filtered)

['Apple', 'major', 'tech', 'company', 'headquartered', 'Cupertino', ',', 'California', '.']


In [30]:
# Pair each token's text with its part-of-speech tag (`pos_`).
pos_tags = list(map(lambda token: (token.text, token.pos_), text))

In [31]:
# Show the (token text, part-of-speech tag) pairs built in the previous cell.
print(pos_tags)

[('Apple', 'PROPN'), ('is', 'AUX'), ('a', 'DET'), ('major', 'ADJ'), ('tech', 'NOUN'), ('company', 'NOUN'), ('headquartered', 'VERB'), ('in', 'ADP'), ('Cupertino', 'PROPN'), (',', 'PUNCT'), ('California', 'PROPN'), ('.', 'PUNCT')]


In [33]:
# Collect the lemma (`lemma_`) of every token in the document, punctuation included.
lemmatiz = []
for token in text:
    lemmatiz.append(token.lemma_)
print(lemmatiz)

['Apple', 'be', 'a', 'major', 'tech', 'company', 'headquarter', 'in', 'Cupertino', ',', 'California', '.']
