<a href="https://colab.research.google.com/github/akashgoyal-20/Cognitive-Computing-Assignments/blob/main/Untitled9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import nltk
import re
import string
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

# Make sure necessary NLTK packages are downloaded.
# 'punkt_tab' is the tokenizer data that recent NLTK releases (3.9 and later)
# load for word_tokenize/sent_tokenize; older releases read 'punkt', so both
# are requested to keep the tokenization steps working on either version.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Q1. Favorite Topic Paragraph
# Sample text that the remaining steps of the exercise operate on.
paragraph = """Artificial Intelligence (AI) is revolutionizing the world. From self-driving cars
to intelligent virtual assistants, AI's applications are limitless! It helps businesses automate tasks,
improve decision-making, and enhance customer experiences. AI algorithms process massive amounts of data in real-time,
making them invaluable in healthcare, finance, and education. The future of technology is driven by advancements in AI."""

# 1. Normalise the text: lowercase it, then delete every character listed in
#    string.punctuation (characters are removed, not replaced by spaces).
text_lower = paragraph.lower()
text_no_punct = "".join(ch for ch in text_lower if ch not in string.punctuation)
print("Text after lowercasing and punctuation removal:\n", text_no_punct)

# 2. Tokenize: sentences come from the original paragraph (which still has
#    its punctuation), words come from the cleaned text.
sentences = sent_tokenize(paragraph)
words = word_tokenize(text_no_punct)
print("\nTokenized Words:\n", words)
print("\nTokenized Sentences:\n", sentences)

# 3. Drop English stopwords; a set keeps each membership test cheap.
stop_words = set(stopwords.words('english'))
filtered_words = [token for token in words if token not in stop_words]
print("\nWords after Stopword Removal:\n", filtered_words)

# 4. Count how often each remaining word occurs.
word_freq = Counter()
word_freq.update(filtered_words)
print("\nWord Frequency Distribution (excluding stopwords):\n", word_freq)

# --------------------------------------

# Q2: Stemming and Lemmatization
# Create one instance of each normaliser; all three are run over the
# stopword-filtered tokens in `filtered_words`.
ps = PorterStemmer()
ls = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

# Stemming: run both stemmers over the same tokens so their outputs can be
# compared side by side.
porter_stemmed = list(map(ps.stem, filtered_words))
lancaster_stemmed = list(map(ls.stem, filtered_words))
print("\nPorter Stemmer:\n", porter_stemmed)
print("\nLancaster Stemmer:\n", lancaster_stemmed)

# Lemmatization: each token is passed to the WordNet lemmatizer with its
# default arguments.
lemmatized = list(map(lemmatizer.lemmatize, filtered_words))
print("\nLemmatized Words:\n", lemmatized)

# --------------------------------------

# Q3: Regular Expressions and Text Splitting
# Every pattern below is applied to the original (un-normalised) paragraph.

# a. Extract words with more than 5 letters (runs of 6+ word characters)
words_5plus = re.findall(r'\b\w{6,}\b', paragraph)
print("\nWords with more than 5 letters:\n", words_5plus)

# b. Extract all numbers (standalone runs of digits)
numbers = re.findall(r'\b\d+\b', paragraph)
print("\nNumbers extracted:\n", numbers)

# c. Extract all capitalized words
# The first letter must be uppercase; the rest may be upper- or lowercase so
# that acronyms such as "AI" are reported too. Restricting the tail to
# [a-z] made the trailing \b fail on any word with a second capital letter,
# silently dropping those words.
capitalized_words = re.findall(r'\b[A-Z][A-Za-z]*\b', paragraph)
print("\nCapitalized Words:\n", capitalized_words)

# 3a. Split text into words with alphabets only
alpha_words = re.findall(r'\b[a-zA-Z]+\b', paragraph)
print("\nWords with only alphabets:\n", alpha_words)

# 3b. Extract words starting with a vowel (case-insensitive; startswith
# accepts a tuple, so a single call covers all five vowels)
vowel_words = [word for word in alpha_words if word.lower().startswith(('a', 'e', 'i', 'o', 'u'))]
print("\nWords starting with a vowel:\n", vowel_words)

# --------------------------------------

# Q4: Custom Tokenization & Regex-based Cleaning

def custom_tokenizer(text):
    """Split *text* into word and number tokens using regular expressions.

    Punctuation is discarded, except that:

    * an apostrophe between word characters is kept ("AI's", "don't"),
    * a hyphen between word characters is kept ("self-driving"),
    * a dot between digits is kept so decimals stay whole ("3.14").

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of token strings in the order they appear in *text*.
    """
    # Drop every character that is not a word character, whitespace,
    # apostrophe, hyphen or dot.
    text = re.sub(r'[^\w\s\'\-\.]', '', text)
    # The decimal alternative comes first so "3.14" is not split at the dot.
    # A word is one run of word characters optionally followed by more runs,
    # each joined by a single hyphen or apostrophe; this keeps contractions
    # and hyphenated words whole and never leaves a dangling "-" or "'" at
    # the end of a token.
    tokens = re.findall(r"\d+\.\d+|\w+(?:[-']\w+)*", text)
    return tokens

custom_tokens = custom_tokenizer(paragraph)
print("\nCustom Tokenized Words:\n", custom_tokens)

# Regex Substitutions
# The three replacements are chained: each step works on the output of the
# previous one.
# a. Replace emails
text_email = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', '<EMAIL>', paragraph)
# b. Replace URLs (http://, https:// or www. up to the next whitespace)
text_url = re.sub(r'http[s]?://\S+|www\.\S+', '<URL>', text_email)
# c. Replace phone numbers
# Optional country code, then 3-3-4 digits with optional '-' or ' ' between
# groups. The (?<!\d) / (?!\d) guards stop the pattern from matching a slice
# of a longer digit run (e.g. a 16-digit ID), which would otherwise be
# partially replaced and leave stray digits behind.
text_phone = re.sub(r'(?<!\d)(?:\+?\d{1,3}[- ]?)?\d{3}[- ]?\d{3}[- ]?\d{4}(?!\d)', '<PHONE>', text_url)

print("\nText after regex replacements:\n", text_phone)

Text after lowercasing and punctuation removal:
 artificial intelligence ai is revolutionizing the world from selfdriving cars 
to intelligent virtual assistants ais applications are limitless it helps businesses automate tasks 
improve decisionmaking and enhance customer experiences ai algorithms process massive amounts of data in realtime 
making them invaluable in healthcare finance and education the future of technology is driven by advancements in ai

Tokenized Words:
 ['artificial', 'intelligence', 'ai', 'is', 'revolutionizing', 'the', 'world', 'from', 'selfdriving', 'cars', 'to', 'intelligent', 'virtual', 'assistants', 'ais', 'applications', 'are', 'limitless', 'it', 'helps', 'businesses', 'automate', 'tasks', 'improve', 'decisionmaking', 'and', 'enhance', 'customer', 'experiences', 'ai', 'algorithms', 'process', 'massive', 'amounts', 'of', 'data', 'in', 'realtime', 'making', 'them', 'invaluable', 'in', 'healthcare', 'finance', 'and', 'education', 'the', 'future', 'of', 'technol

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
