Note: Remove the leading # from the commented-out install/download lines to install the dependencies.

1. Tokenization

In [1]:
# Tokenization: split the sample sentence into word and punctuation tokens.
import nltk
# One-time setup: remove the leading '#' from the two lines below to download the
# Punkt resources if they are not installed yet.
#nltk.download('punkt')
#nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
text = "Natural Language Processing is fun!"
# `tokens` is reused by the stopword-filtering and POS-tagging cells.
tokens = word_tokenize(text)
print("Tokens:", tokens)

Tokens: ['Natural', 'Language', 'Processing', 'is', 'fun', '!']


2. Stopwords

In [2]:
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords

# Stopword removal: drop tokens whose lowercase form is an English stopword.
# The stopword list is held in a set so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

# The comparison is case-insensitive, but surviving tokens keep their original casing.
# `filtered` is reused by the stemming and lemmatization cells.
filtered = list(filter(lambda tok: tok.lower() not in stop_words, tokens))
print("Filtered Tokens:", filtered)

Filtered Tokens: ['Natural', 'Language', 'Processing', 'fun', '!']


3. Stemming

In [3]:
from nltk.stem import PorterStemmer

# Stemming: map every stopword-filtered token to its Porter stem.
stemmer = PorterStemmer()
stems = list(map(stemmer.stem, filtered))
print("Stems:", stems)


Stems: ['natur', 'languag', 'process', 'fun', '!']


4. Lemmatization

In [4]:
import nltk
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# Lemmatization: look up each stopword-filtered token with the WordNet lemmatizer.
# No part-of-speech argument is passed, so the lemmatizer's default is used.
lemmatizer = WordNetLemmatizer()
lemmas = list(map(lemmatizer.lemmatize, filtered))
print("Lemmas:", lemmas)

Lemmas: ['Natural', 'Language', 'Processing', 'fun', '!']


5. POS Tagging

In [5]:
# POS tagging: attach a part-of-speech tag to every token.
import nltk
# One-time setup: remove the leading '#' below to download the English tagger model.
#nltk.download('averaged_perceptron_tagger_eng')
# Tags the original `tokens` list (not the stopword-filtered one).
# The result is a list of (token, tag) pairs and is reused by the chunking cell.
pos_tags = nltk.pos_tag(tokens)
print("POS Tags:", pos_tags)

POS Tags: [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('is', 'VBZ'), ('fun', 'NN'), ('!', '.')]


6. Named Entity Recognition (NER)

In [6]:
#%pip install spacy
#!python -m spacy download en_core_web_sm

import spacy

# Named Entity Recognition: run the small English spaCy pipeline on one sentence.
# `doc` is reused by the syntax-parsing and dependency-visualization cells.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Barack Obama was the 44th President of the United States.")

# Collect each detected entity span as a (text, label) pair before printing.
entity_pairs = [(span.text, span.label_) for span in doc.ents]
print("Entities:", entity_pairs)

Entities: [('Barack Obama', 'PERSON'), ('44th', 'ORDINAL'), ('the United States', 'GPE')]


7. Chunking

In [7]:
# Chunking: group the POS-tagged tokens into noun-phrase (NP) chunks.
# Grammar: an optional determiner, any number of adjectives, then a token tagged NN.
# `nltk` and `pos_tags` come from the POS tagging cell.
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)
tree = cp.parse(pos_tags)
# tree.draw() opens a separate Tk window: it blocks the kernel until the window is
# closed and raises on machines without a display. Render the tree inline as
# text instead so the notebook survives Restart & Run All.
tree.pretty_print()

8. Syntax Parsing

In [8]:
# Print one line per token of `doc` (created in the NER cell) in the form
#   token text --> dependency label --> text of the token's head.
# The ROOT token is printed with itself as head.
for token in doc:
    print(f"{token.text} --> {token.dep_} --> {token.head.text}")

Barack --> compound --> Obama
Obama --> nsubj --> was
was --> ROOT --> was
the --> det --> President
44th --> amod --> President
President --> attr --> was
of --> prep --> President
the --> det --> States
United --> compound --> States
States --> pobj --> of
. --> punct --> was


9. Dependency Parsing

In [9]:
# Import the displaCy visualizer explicitly (the documented entry point) instead of
# relying on `spacy.displacy` already being loaded as a side effect of `import spacy`.
from spacy import displacy

# Draw the dependency parse of `doc` (created in the NER cell) inline in the notebook.
displacy.render(doc, style="dep", jupyter=True)

10. Bag of Words

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of Words: turn each document into a vector of raw term counts.
# `corpus` is reused by the TF-IDF cell.
corpus = ["NLP is fun", "NLP is powerful"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Show the learned vocabulary, then the document-term count matrix
# (one row per document, one column per vocabulary term).
bow_terms = vectorizer.get_feature_names_out()
bow_counts = X.toarray()
print(bow_terms)
print(bow_counts)

['fun' 'is' 'nlp' 'powerful']
[[1 1 1 0]
 [0 1 1 1]]


11. TF-IDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF: weight the terms of `corpus` (defined in the Bag of Words cell) so that
# terms shared by both documents score lower than terms unique to one.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)

# Show the vocabulary, then the weight matrix (one row per document).
tfidf_terms = tfidf.get_feature_names_out()
tfidf_weights = X.toarray()
print(tfidf_terms)
print(tfidf_weights)

['fun' 'is' 'nlp' 'powerful']
[[0.70490949 0.50154891 0.50154891 0.        ]
 [0.         0.50154891 0.50154891 0.70490949]]


13. Text Classification

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

# Text classification: a toy spam/ham classifier trained on one example per class.
# TfidfVectorizer is imported in the TF-IDF cell.
texts = ["Win a free iPhone", "Let’s meet tomorrow"]
labels = ["spam", "ham"]

# Chain vectorization and Naive Bayes so raw strings can be passed to fit/predict.
pipeline_steps = [("tfidf", TfidfVectorizer()), ("nb", MultinomialNB())]
clf = Pipeline(pipeline_steps)
clf.fit(texts, labels)

prediction = clf.predict(["Free money now"])
print("Prediction:", prediction)

Prediction: ['spam']


14. Topic Modeling

In [27]:
from sklearn.decomposition import LatentDirichletAllocation

# Topic modeling: fit a 2-topic LDA model on bag-of-words counts.
# CountVectorizer is imported in the Bag of Words cell.
cv = CountVectorizer()
X = cv.fit_transform(["NLP is cool", "I love machine learning"])
# LDA starts from a random initialization, so pin the seed; otherwise the printed
# weights (and which row corresponds to which topic) can change between runs.
lda = LatentDirichletAllocation(n_components=2, random_state=42)
lda.fit(X)
# components_ holds one row per topic and one column per vocabulary term.
print("Topics:", lda.components_)

Topics: [[1.49269968 1.49269968 0.50730016 0.50730016 0.50730016 1.49269968]
 [0.50730032 0.50730032 1.49269984 1.49269984 1.49269984 0.50730032]]
