In [2]:
# Install NLTK (if not already installed).
# NOTE: the leading "!" is an IPython/Jupyter shell escape, so this line only
# works when the cell is run inside a notebook; it is a syntax error in a
# plain Python script.
!pip install nltk

import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter

# Fetch the NLTK data packages used below (only needed once per environment):
# the 'punkt' / 'punkt_tab' tokenizer data and the 'stopwords' word lists.
# The packages are requested in this fixed order.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

# ------------------ Corpus ------------------
# Sample paragraph that every task below operates on.
# The triple-quoted literal opens on its own line, so the string begins with a
# newline and keeps a line break at the end of each row; the leading newline
# and the interior line breaks are still present in the sentences printed by
# Task 1.
text = """
Natural Language Processing (NLP) is a branch of Artificial Intelligence that focuses on the interaction
between computers and humans using natural language. NLP techniques are widely used in applications
such as chatbots, machine translation, sentiment analysis, and information retrieval. By using libraries like
NLTK, developers can tokenize text, remove stopwords, and analyze linguistic patterns efficiently. As data
grows rapidly, NLP plays a crucial role in extracting meaningful insights from unstructured text data.
"""

# ------------------ Task 1 ------------------
# Split the paragraph into a list of sentence strings
sentences = sent_tokenize(text)
print("1) Sentences:", sentences, sep="\n")

# ------------------ Task 2 ------------------
# Word-tokenize every sentence and flatten the results into a single list,
# preserving sentence order and token order within each sentence
words = [token for sentence in sentences for token in word_tokenize(sentence)]

print("\n2) Words:", words, sep="\n")

# ------------------ Task 3 ------------------
# Convert all words to lowercase
words = [word.lower() for word in words]

# ------------------ Task 4 ------------------
# Remove punctuation
# A token is dropped when every one of its characters is punctuation.
# Testing `word not in string.punctuation` is only a substring check against
# one long string, so multi-character punctuation tokens that the tokenizer
# can emit (e.g. "``", "''", "--", "...") would slip through it.
punctuation_chars = set(string.punctuation)
words = [
    word
    for word in words
    if any(char not in punctuation_chars for char in word)
]

# ------------------ Task 5 ------------------
# Remove English stopwords
# The stopword list is turned into a set so each membership test is O(1).
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word not in stop_words]

print("\n3–5) Cleaned Words:")
print(filtered_words)

# ------------------ Task 6 ------------------
# Report how many tokens survived the cleaning steps
remaining_total = len(filtered_words)
print("\n6) Total remaining words:", remaining_total)

# ------------------ Task 7 ------------------
# Tally token frequencies and list the ten most common ones,
# one "token : count" pair per line
freq = Counter(filtered_words)
top10 = freq.most_common(10)

print("\n7) Top 10 most frequent words:")
for token, occurrences in top10:
    print(f"{token} : {occurrences}")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


1) Sentences:
['\nNatural Language Processing (NLP) is a branch of Artificial Intelligence that focuses on the interaction\nbetween computers and humans using natural language.', 'NLP techniques are widely used in applications\nsuch as chatbots, machine translation, sentiment analysis, and information retrieval.', 'By using libraries like\nNLTK, developers can tokenize text, remove stopwords, and analyze linguistic patterns efficiently.', 'As data\ngrows rapidly, NLP plays a crucial role in extracting meaningful insights from unstructured text data.']

2) Words:
['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'branch', 'of', 'Artificial', 'Intelligence', 'that', 'focuses', 'on', 'the', 'interaction', 'between', 'computers', 'and', 'humans', 'using', 'natural', 'language', '.', 'NLP', 'techniques', 'are', 'widely', 'used', 'in', 'applications', 'such', 'as', 'chatbots', ',', 'machine', 'translation', ',', 'sentiment', 'analysis', ',', 'and', 'information', 'retrieval',