In [1]:
from functools import lru_cache

import nltk
from nltk import pos_tag as nltk_pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
# Fetch the NLTK data packages this notebook asks for. Each call logs its
# progress and returns a status flag; because the last call is the final
# expression of the cell, its return value is what the cell displays.
# NOTE(review): presumably 'punkt' backs word_tokenize, 'stopwords' backs
# stopwords.words and 'averaged_perceptron_tagger' backs pos_tag. Newer NLTK
# releases may look for differently named resources -- confirm against the
# installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop-word list as a frozenset, loaded only once.

    preprocess_text is called once per document, so caching here avoids
    rebuilding the same stop-word set for every single text.
    """
    return frozenset(stopwords.words('english'))


def preprocess_text(text: str) -> str:
    """Normalize one text into a space-separated string of stemmed tokens.

    Steps, in order: lowercase the text, split it with ``word_tokenize``,
    drop tokens found in the English stop-word list, and reduce each
    remaining token with the Porter stemmer.

    Tokens that are not stop words are kept as they are produced by the
    tokenizer; nothing here filters out punctuation tokens.

    Args:
        text: The raw input string.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty
        string when no token survives).
    """
    # Lowercase first so the stop-word lookup below sees lowercase tokens.
    tokens = word_tokenize(text.lower())

    # Stop-word set comes from the cached helper instead of being rebuilt per call.
    stop_words = _english_stop_words()
    stemmer = PorterStemmer()

    # Filter and stem in a single pass; token order is preserved.
    stemmed_tokens = [
        stemmer.stem(token) for token in tokens if token not in stop_words
    ]

    return " ".join(stemmed_tokens)

In [3]:
# Tiny labelled corpus: every entry pairs one sentence with its integer class
# label (0 or 1). Keeping sentence and label side by side makes it harder for
# the two lists below to drift out of alignment.
labeled_samples = [
    ("Natural language processing (NLP) is a subfield of artificial intelligence (AI).", 1),
    ("I love reading books about NLP.", 1),
    ("The quick brown fox jumps over the lazy dog", 0),
    ("i don't like NLP at all", 0),
    ("NLP techniques are used in various applications.", 1),
]

# Split the pairs into the two parallel lists used by the later cells.
texts = [sentence for sentence, _ in labeled_samples]
labels = [label for _, label in labeled_samples]

# Tokenize the first sample sentence, then tag every token with its part of speech
first_text_tokens = word_tokenize(texts[0])
pos_tags_first_text = nltk_pos_tag(first_text_tokens)


In [4]:
# Train a text classifier

# Preprocess every training sentence into a normalized string. This list gets
# its own name (mirroring test_texts_preprocessed in the test cell) so that
# train_X only ever refers to the feature matrix, never to raw strings.
train_texts_preprocessed = [preprocess_text(text) for text in texts]

# Learn the vocabulary from the training texts and build their count matrix.
vectorizer = CountVectorizer()
train_X = vectorizer.fit_transform(train_texts_preprocessed)

# Fit a multinomial Naive Bayes model on the counts and their labels.
classifier = MultinomialNB()
classifier.fit(train_X, labels)

In [5]:
# Run the trained classifier on two sentences it has not seen
test_texts = [
    "I'm interested in natural language processing.",
    "This book is about gardening.",
]

# Apply the same preprocessing as for the training texts, then encode with the
# already-fitted vectorizer (transform only, so the vocabulary is not refit).
test_texts_preprocessed = list(map(preprocess_text, test_texts))
test_X = vectorizer.transform(test_texts_preprocessed)

predictions = classifier.predict(test_X)

In [6]:
# Report the POS tags and the predictions, one "label value" line each
results_to_report = (
    ("Part-of-speech tags for the first text:", pos_tags_first_text),
    ("Predictions for test texts:", predictions),
)
for result_label, result_value in results_to_report:
    print(result_label, result_value)


Part-of-speech tags for the first text: [('Natural', 'JJ'), ('language', 'NN'), ('processing', 'NN'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('subfield', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('(', '('), ('AI', 'NNP'), (')', ')'), ('.', '.')]
Predictions for test texts: [1 1]


In [7]:
# Accuracy measured on the very texts the classifier was trained on.
# (Note: this is not a held-out estimate; in a real-world scenario, you would use a validation set.)
train_predictions = classifier.predict(train_X)
train_accuracy = accuracy_score(y_true=labels, y_pred=train_predictions)
print("Training accuracy:", train_accuracy)

Training accuracy: 1.0
