In [10]:
import nltk
from nltk import pos_tag, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# Download required NLTK data.
# word_tokenize() and pos_tag() look their models up under different resource
# ids depending on the installed NLTK release: 'punkt' and
# 'averaged_perceptron_tagger' on older releases, 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' on newer ones (where having only the legacy
# ids ends in a LookupError at tokenize/tag time). Requesting all four keeps
# this script runnable on either side of that change.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)
for resource in NLTK_RESOURCES:
    # With default arguments nltk.download() reports a failed or unknown id
    # and returns False instead of raising, so one miss does not abort the run.
    nltk.download(resource)

# Sample text data: four short sentences sharing most of their vocabulary.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Create a CountVectorizer object with its default settings.
vectorizer = CountVectorizer()

# Learn the vocabulary from the corpus and build the document-term count
# matrix in one step (one row per document, one column per vocabulary term).
count_vector_table = vectorizer.fit_transform(corpus)

# Vocabulary terms, in the same order as the columns of count_vector_table.
feature_names = vectorizer.get_feature_names_out()

# Print the count vector table; toarray() converts the result of
# fit_transform() to a dense array so the full table is shown.
print("Count Vector Table:")
print(count_vector_table.toarray())
print("\nFeature Names:")
print(feature_names)

# Perform POS tagging on the first document only: split it into tokens, then
# tag each token with its part of speech.
tokens = word_tokenize(corpus[0])
pos_tags = pos_tag(tokens)

# Print the POS tags as a list of (token, tag) pairs.
print("\nPOS Tags for the first document:")
print(pos_tags)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Count Vector Table:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]

Feature Names:
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']

POS Tags for the first document:
[('This', 'DT'), ('is', 'VBZ'), ('the', 'DT'), ('first', 'JJ'), ('document', 'NN'), ('.', '.')]
