In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Single-sentence sample text that the rest of the notebook tokenizes,
# normalizes, and vectorizes. The leading and trailing newlines of the
# original triple-quoted literal are kept as explicit "\n" pieces.
document = (
    "\n"
    "Natural language processing (NLP) is a subfield of linguistics, "
    "computer science, and artificial intelligence concerned with the "
    "interactions between computers and human language, in particular "
    "how to program computers to process and analyze large amounts of "
    "natural language data."
    "\n"
)

In [None]:
# Split the raw text into word and punctuation tokens with NLTK's tokenizer.
# The language is spelled out here; 'english' is also the library default.
tokens = word_tokenize(document, language='english')

In [None]:
# Tag every token with its part of speech; the result is a list of
# (token, tag) pairs in the same order as `tokens`.
# 'eng' is NLTK's default tagger language, written out for clarity.
pos_tags = pos_tag(tokens, lang='eng')

In [None]:
# Build the English stop-word lookup once; a set gives cheap membership tests.
stop_words = {*stopwords.words('english')}
# Keep tokens whose lowercase form is not a stop word. The surviving tokens
# keep their original casing, and punctuation tokens pass through untouched.
filtered_tokens = [
    candidate
    for candidate in tokens
    if candidate.lower() not in stop_words
]

In [None]:
# Reduce each surviving token to its Porter stem, then print one stem per line.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
for token in stemmed_tokens:
    print(token)

natur
languag
process
(
nlp
)
subfield
linguist
,
comput
scienc
,
artifici
intellig
concern
interact
comput
human
languag
,
particular
program
comput
process
analyz
larg
amount
natur
languag
data
.


In [None]:
# WordNetLemmatizer treats every word as a noun unless told otherwise, which
# leaves verb and adjective forms (e.g. "concerned") unreduced. Feed it the
# part of speech already computed in `pos_tags` instead.
#
# Maps the first letter of a Penn Treebank tag to the single-letter POS code
# WordNet expects; any other tag falls back to noun ('n'), which is the
# lemmatizer's own default.
_TREEBANK_TO_WORDNET = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

lemmatizer = WordNetLemmatizer()
# `pos_tags` is aligned one-to-one with `tokens`, so applying the same
# stop-word predicate that produced `filtered_tokens` selects exactly the same
# words, in the same order, while keeping the tag assigned in full-sentence
# context.
lemmatized_tokens = [
    lemmatizer.lemmatize(word, _TREEBANK_TO_WORDNET.get(tag[:1], 'n'))
    for word, tag in pos_tags
    if word.lower() not in stop_words
]
for token in lemmatized_tokens:
    print(token)

Natural
language
processing
(
NLP
)
subfield
linguistics
,
computer
science
,
artificial
intelligence
concerned
interaction
computer
human
language
,
particular
program
computer
process
analyze
large
amount
natural
language
data
.


In [None]:
# The vectorizer consumes raw strings, so glue the lemmas back together into a
# single space-delimited document.
processed_document = str.join(' ', lemmatized_tokens)

In [None]:
# Fit a default TF-IDF model on a one-document corpus and vectorize it.
# NOTE: with a single document every term has the same IDF, so the weights
# below are effectively L2-normalized term counts; a larger corpus is needed
# for the IDF component to discriminate between terms.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents=[processed_document])
# Column index -> vocabulary term, used to label the matrix columns.
feature_names = tfidf_vectorizer.get_feature_names_out()

In [None]:
# Report the raw text, the normalized text, and the weight of every term that
# occurs in the document.
print("Original Document:", document, sep="\n")
print("\nProcessed Document:", processed_document, sep="\n")
print("\nTF-IDF Representation:")
# nonzero() returns (row_indices, column_indices); only the column indices are
# iterated, and values are read from row 0, the single fitted document.
for col in tfidf_matrix.nonzero()[1]:
    term, weight = feature_names[col], tfidf_matrix[0, col]
    print(f"{term}: {weight}")

Original Document:

Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.


Processed Document:
Natural language processing ( NLP ) subfield linguistics , computer science , artificial intelligence concerned interaction computer human language , particular program computer process analyze large amount natural language data .

TF-IDF Representation:
data: 0.16012815380508713
amount: 0.16012815380508713
large: 0.16012815380508713
analyze: 0.16012815380508713
process: 0.16012815380508713
program: 0.16012815380508713
particular: 0.16012815380508713
human: 0.16012815380508713
interaction: 0.16012815380508713
concerned: 0.16012815380508713
intelligence: 0.16012815380508713
artificial: 0.16012815380508713
science: 0.16012815380508713
computer: 0.48038446141526137
lingu