In [12]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [7]:
 # Import in this cell: it is executed before the notebook's import cell, so on a
 # fresh kernel (Restart & Run All) `nltk` would otherwise be undefined here.
 import nltk
 nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/asmita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [51]:
# Import in this cell: it is executed before the notebook's import cell, so on a
# fresh kernel (Restart & Run All) `nltk` would otherwise be undefined here.
import nltk

# Resources used by the later cells: POS tagger model, stop-word lists, and WordNet data.
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/asmita/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/asmita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/asmita/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/asmita/nltk_data...


True

In [52]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [53]:
# Sample text analysed by every later cell. The adjacent literals are concatenated
# by Python into one string; spacing and punctuation are kept exactly as authored.
document = (
    "CSI-DYPIEMR is the Student Chapter of Computer Society of India in "
    "Dr.D. Y. Patil Pratishthan's Dr. D. Y. Patil Institute of "
    "Engineering,Management, and Research. "
    "Computer Society of India is a body of computer professionals in India. "
    "It was started on 6 March 1965 by a few computer professionals and has "
    "now grown to be the national body representing computer professionals. "
    "It has 72 chapters across India, 511 student branches, and 100,000 members."
)

In [54]:
# Tokenization: split the raw text into word-level tokens and into sentences.
words = word_tokenize(document)
sentence = sent_tokenize(document)  # holds all sentences, despite the singular name

In [55]:
# Part-of-speech tagging: pair every token in `words` with its tag,
# producing a list of (token, tag) tuples.
pos_tags = nltk.pos_tag(tokens=words)

In [56]:
# Stop-word removal: keep only the tokens whose case-folded form is not in the
# English stop-word list. A set is used so each membership test is a hash lookup.
stop_words = set(stopwords.words("english"))
filtered_words = list(
    filter(lambda token: token.casefold() not in stop_words, words)
)

In [57]:
# Stemming: run the Porter stemmer over every token that survived stop-word removal.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_words))

In [58]:
# Lemmatization
lemmatizer = WordNetLemmatizer()

# WordNetLemmatizer.lemmatize() treats every word as a noun unless told otherwise,
# which leaves verbs and adjectives (e.g. "started", "grown") unchanged. Map the
# first letter of each Penn Treebank tag to the matching WordNet POS code; any
# other tag falls back to noun ("n"), the lemmatizer's own default.
_TREEBANK_TO_WORDNET = {"J": "a", "V": "v", "R": "r"}

# Reuse the in-context tags from `pos_tags` (aligned one-to-one with `words`) and
# apply the same stop-word filter that produced `filtered_words`, so the result
# stays aligned element-for-element with `filtered_words`.
lemmatized_words = [
    lemmatizer.lemmatize(word, pos=_TREEBANK_TO_WORDNET.get(tag[:1], "n"))
    for word, tag in pos_tags
    if word.casefold() not in stop_words
]

In [59]:
# Term frequency (TF): IDF weighting is switched off and rows are L1-normalised.
# The vectorizer is fitted on the raw `document` string (a one-document corpus),
# not on the token lists built in the preprocessing cells.
tf_vectorizer = TfidfVectorizer(norm="l1", use_idf=False)
tf_matrix = tf_vectorizer.fit_transform(raw_documents=[document])

In [60]:
# IDF-weighted representation: same vectorizer class with IDF weighting enabled,
# fitted on the raw `document` string.
# NOTE(review): the corpus contains a single document, so every term occurs in
# all documents and IDF cannot tell terms apart — confirm whether a multi-document
# (e.g. sentence-level) corpus was intended.
idf_vectorizer = TfidfVectorizer(use_idf=True)
idf_matrix = idf_vectorizer.fit_transform(raw_documents=[document])

In [62]:
# Print every intermediate result under its own heading, with one blank line
# between consecutive sections.
# NOTE(review): the last section prints `idf_matrix` (the fit_transform output of
# the use_idf=True vectorizer) under an "IDF" heading — confirm the label is intended.
report_sections = [
    ("Tokenization:", words),
    ("POS Tagging:", pos_tags),
    ("Stop Words Removal:", filtered_words),
    ("Stemming:", stemmed_words),
    ("Lemmatization:", lemmatized_words),
    ("Term Frequency (TF):", tf_matrix.toarray()),
    ("Inverse Document Frequency (IDF):", idf_matrix.toarray()),
]
print(
    "\n\n".join(
        f"{heading}\n{content!s}" for heading, content in report_sections
    )
)

Tokenization:
['CSI-DYPIEMR', 'is', 'the', 'Student', 'Chapter', 'of', 'Computer', 'Society', 'of', 'India', 'in', 'Dr.D', '.', 'Y.', 'Patil', 'Pratishthan', "'s", 'Dr.', 'D.', 'Y.', 'Patil', 'Institute', 'of', 'Engineering', ',', 'Management', ',', 'and', 'Research', '.', 'Computer', 'Society', 'of', 'India', 'is', 'a', 'body', 'of', 'computer', 'professionals', 'in', 'India', '.', 'It', 'was', 'started', 'on', '6', 'March', '1965', 'by', 'a', 'few', 'computer', 'professionals', 'and', 'has', 'now', 'grown', 'to', 'be', 'the', 'national', 'body', 'representing', 'computer', 'professionals', '.', 'It', 'has', '72', 'chapters', 'across', 'India', ',', '511', 'student', 'branches', ',', 'and', '100,000', 'members', '.']

POS Tagging:
[('CSI-DYPIEMR', 'NNP'), ('is', 'VBZ'), ('the', 'DT'), ('Student', 'NNP'), ('Chapter', 'NN'), ('of', 'IN'), ('Computer', 'NNP'), ('Society', 'NNP'), ('of', 'IN'), ('India', 'NNP'), ('in', 'IN'), ('Dr.D', 'NNP'), ('.', '.'), ('Y.', 'NNP'), ('Patil', 'NNP'), (