In [2]:
import nltk;
import string;
from nltk.corpus import stopwords;
from nltk.tokenize import word_tokenize, sent_tokenize;
from nltk.probability import FreqDist;
nltk.download('punkt');
nltk.download('stopwords');

paragraph = """One of my favorite topics is technology, especially how it shapes our everyday lives.
It’s fascinating to see how innovations like artificial intelligence, smart devices, and cloud computing
have transformed the way we communicate, work, and even relax. From voice-controlled assistants that manage
our schedules to wearable tech that tracks our health in real time, technology is making life more efficient
and connected. I’m especially intrigued by how AI is being used in creative fields like music composition,
storytelling, and design""";

text_lower = paragraph.lower();
text_clean = text_lower.translate(str.maketrans('', '', string.punctuation));
print("Text in lowercase with removed punctuation is\n",text_clean);

words = word_tokenize(text_clean);
print("\nWord tokenization\n",words);
sentences = sent_tokenize(paragraph);
print("\nSentence tokenization\n",sentences);

stop_words = set(stopwords.words('english'));
filtered_words = [word for word in words if word not in stop_words];
print("\nText after removing stop words is\n",filtered_words);

freq_dist = FreqDist(filtered_words)
print("\nWord Frequency Counts\n");
for word, freq in freq_dist.items():
    print(f"{word}: {freq}");

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...


Text in lowercase with removed punctuation is
  one of my favorite topics is technology especially how it shapes our everyday lives
it’s fascinating to see how innovations like artificial intelligence smart devices and cloud computing
have transformed the way we communicate work and even relax from voicecontrolled assistants that manage
our schedules to wearable tech that tracks our health in real time technology is making life more efficient
and connected i’m especially intrigued by how ai is being used in creative fields like music composition
storytelling and design

Word tokenization
 ['one', 'of', 'my', 'favorite', 'topics', 'is', 'technology', 'especially', 'how', 'it', 'shapes', 'our', 'everyday', 'lives', 'it', '’', 's', 'fascinating', 'to', 'see', 'how', 'innovations', 'like', 'artificial', 'intelligence', 'smart', 'devices', 'and', 'cloud', 'computing', 'have', 'transformed', 'the', 'way', 'we', 'communicate', 'work', 'and', 'even', 'relax', 'from', 'voicecontrolled', 'assist

[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import nltk;
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer;
from nltk.corpus import wordnet;
nltk.download('wordnet');
nltk.download('omw-1.4');

filtered_words = ['one', 'favorite', 'topics', 'technology', 'especially', 'shapes', 'everyday', 'lives', '’', 'fascinating', 'see', 'innovations', 'like', 'artificial', 'intelligence', 'smart', 'devices', 'cloud', 'computing', 'transformed', 'way', 'communicate', 'work', 'even', 'relax', 'voicecontrolled', 'assistants', 'manage', 'schedules', 'wearable', 'tech', 'tracks', 'health', 'real', 'time', 'technology', 'making', 'life', 'efficient', 'connected', '’', 'especially', 'intrigued', 'ai', 'used', 'creative', 'fields', 'like', 'music', 'composition', 'storytelling', 'design'];
porter = PorterStemmer();
lancaster = LancasterStemmer();
lemmatizer = WordNetLemmatizer();
print(f"{'Original':<15}{'Porter':<15}{'Lancaster':<15}{'Lemmatized':<15}");
print("-" * 60);
for word in filtered_words:
    porter_stem = porter.stem(word);
    lancaster_stem = lancaster.stem(word);
    lemmatized = lemmatizer.lemmatize(word);
    print(f"{word:<15}{porter_stem:<15}{lancaster_stem:<15}{lemmatized:<15}");

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...


Original       Porter         Lancaster      Lemmatized     
------------------------------------------------------------
one            one            on             one            
favorite       favorit        favorit        favorite       
topics         topic          top            topic          
technology     technolog      technolog      technology     
especially     especi         espec          especially     
shapes         shape          shap           shape          
everyday       everyday       everyday       everyday       
lives          live           liv            life           
’              ’              ’              ’              
fascinating    fascin         fascin         fascinating    
see            see            see            see            
innovations    innov          innov          innovation     
like           like           lik            like           
artificial     artifici       art            artificial     
intelligence   intellig 

In [6]:
import re;
paragraph = """One of my favorite topics is technology, especially how it shapes our everyday lives.
It’s fascinating to see how innovations like artificial intelligence, smart devices, and cloud computing
have transformed the way we communicate, work, and even relax. From voice-controlled assistants that manage
our schedules to wearable tech that tracks our health in real time, technology is making life more efficient
and connected. I’m especially intrigued by how AI is being used in creative fields like music composition,
storytelling, and design.""";

words_more_than_5 = re.findall(r'\b[a-zA-Z]{6,}\b', paragraph);
print("All words with more than 5 letters are\n",words_more_than_5);
numbers = re.findall(r'\b\d+\b', paragraph);
print("\nAll numbers in text are\n",numbers);
capitalized_words = re.findall(r'\b[A-Z][a-z]+\b', paragraph);
print("\nAll capitalized words are\n",capitalized_words);

alpha_words = re.findall(r'\b[a-zA-Z]+\b', paragraph);
print("\nWords containing only alphabets (removing digits and special character)\n",alpha_words);
vowel_words = [word for word in alpha_words if word.lower().startswith(('a', 'e', 'i', 'o', 'u'))];
print("\nAll words starting with a vowel are\n",vowel_words);

All words with more than 5 letters are
 ['favorite', 'topics', 'technology', 'especially', 'shapes', 'everyday', 'fascinating', 'innovations', 'artificial', 'intelligence', 'devices', 'computing', 'transformed', 'communicate', 'controlled', 'assistants', 'manage', 'schedules', 'wearable', 'tracks', 'health', 'technology', 'making', 'efficient', 'connected', 'especially', 'intrigued', 'creative', 'fields', 'composition', 'storytelling', 'design']

All numbers in text are
 []

All capitalized words are
 ['One', 'It', 'From']

Words containing only alphabets (removing digits and special character)
 ['One', 'of', 'my', 'favorite', 'topics', 'is', 'technology', 'especially', 'how', 'it', 'shapes', 'our', 'everyday', 'lives', 'It', 's', 'fascinating', 'to', 'see', 'how', 'innovations', 'like', 'artificial', 'intelligence', 'smart', 'devices', 'and', 'cloud', 'computing', 'have', 'transformed', 'the', 'way', 'we', 'communicate', 'work', 'and', 'even', 'relax', 'From', 'voice', 'controlled', '

In [8]:
import re;

text = """One of my favorite topics is technology, especially how it shapes our everyday lives.
It’s fascinating to see how innovations like artificial intelligence, smart devices, and cloud computing
have transformed the way we communicate, work, and even relax. From voice-controlled assistants that manage
our schedules to wearable tech that tracks our health in real time, technology is making life more efficient
and connected. I’m especially intrigued by how AI is being used in creative fields like music composition,
storytelling, and design.""";

def custom_tokenize(text):
    pattern = r"""
        (?:[a-zA-Z]+(?:['’][a-zA-Z]+)?)     
        |(?:\d+\.\d+)                        
        |(?:\d+)                            
        |(?:[a-zA-Z]+(?:-[a-zA-Z]+)+)       
""";
    tokens = re.findall(pattern, text, re.VERBOSE);
    return tokens;

def clean_text(text):
    text = re.sub(r'\b[\w.-]+?@\w+?\.\w+?\b', '<EMAIL>', text);
    text = re.sub(r'https?://\S+|www\.\S+', '<URL>', text);
    text = re.sub(r'\b(?:\+91\s?)?\d{10}\b', '<PHONE>', text);
    text = re.sub(r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b', '<PHONE>', text);
    return text;

cleaned_text = clean_text(text);
print("Cleaned Text:\n", cleaned_text);

tokens = custom_tokenize(cleaned_text);
print("\nCustom Tokens:",tokens);

Cleaned Text:
 One of my favorite topics is technology, especially how it shapes our everyday lives.
It’s fascinating to see how innovations like artificial intelligence, smart devices, and cloud computing
have transformed the way we communicate, work, and even relax. From voice-controlled assistants that manage
our schedules to wearable tech that tracks our health in real time, technology is making life more efficient
and connected. I’m especially intrigued by how AI is being used in creative fields like music composition,
storytelling, and design.

Custom Tokens: ['One', 'of', 'my', 'favorite', 'topics', 'is', 'technology', 'especially', 'how', 'it', 'shapes', 'our', 'everyday', 'lives', 'It’s', 'fascinating', 'to', 'see', 'how', 'innovations', 'like', 'artificial', 'intelligence', 'smart', 'devices', 'and', 'cloud', 'computing', 'have', 'transformed', 'the', 'way', 'we', 'communicate', 'work', 'and', 'even', 'relax', 'From', 'voice', 'controlled', 'assistants', 'that', 'manage', 'ou