In [None]:
import re
import nltk
import matplotlib.pyplot as plt

nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
text = """
One fine morning, Tom! Tom! was walking along the Riverside, thinking about everything and nothing.
Suddenly, he shouted: "What are you doing here?"
The boy, smiling, kept running, running, and laughing!
Sarah asked politely, "Could you please explain?" but no one responded.
Meanwhile, a Flying bird was singing near the School.
The teacher said, "Morning is always the best time for Reading and Writing."
Children were playing, dancing, and enjoying the sunshine.
"Really?" asked John.
The girl looked back, wondering, "Why, why would anyone do that?"
After all, caring, sharing, and loving bring meaning to life.
Books, music, and games—these things matter!
"""


In [None]:
uppercase_words = re.findall(r'\b[A-Z][a-zA-Z]*\b', text)
print(uppercase_words)


['One', 'Tom', 'Tom', 'Riverside', 'Suddenly', 'What', 'The', 'Sarah', 'Could', 'Meanwhile', 'Flying', 'School', 'The', 'Morning', 'Reading', 'Writing', 'Children', 'Really', 'John', 'The', 'Why', 'After', 'Books']


In [None]:
punctuation = re.findall(r'[.,":?!—]', text)
print(punctuation)


[',', '!', '!', ',', '.', ',', ':', '"', '?', '"', ',', ',', ',', ',', '!', ',', '"', '?', '"', '.', ',', '.', ',', '"', '.', '"', ',', ',', '.', '"', '?', '"', '.', ',', ',', '"', ',', '?', '"', ',', ',', ',', '.', ',', ',', '—', '!']


In [None]:
questions = re.findall(r'[^.?!]*\?', text)
print(questions)


[' \nSuddenly, he shouted: "What are you doing here?', ' \nSarah asked politely, "Could you please explain?', ' \n"Really?', ' \nThe girl looked back, wondering, "Why, why would anyone do that?']


In [None]:
ing_words = re.findall(r'\b\w+ing\b', text, re.IGNORECASE)
print(ing_words)


['morning', 'walking', 'thinking', 'everything', 'nothing', 'doing', 'smiling', 'running', 'running', 'laughing', 'Flying', 'singing', 'Morning', 'Reading', 'Writing', 'playing', 'dancing', 'enjoying', 'wondering', 'caring', 'sharing', 'loving', 'bring', 'meaning']


In [None]:
five_letter_words = re.findall(r'\b\w{5}\b', text)
print(five_letter_words)


['along', 'about', 'doing', 'Sarah', 'asked', 'Could', 'asked', 'would', 'After', 'bring', 'Books', 'music', 'games', 'these']


In [None]:
quoted_text = re.findall(r'"[^"]*"', text)
print(quoted_text)


['"What are you doing here?"', '"Could you please explain?"', '"Morning is always the best time for Reading and Writing."', '"Really?"', '"Why, why would anyone do that?"']


In [None]:
words_before_punctuation = re.findall(r'\b\w+(?=[.,!?])', text)
print(words_before_punctuation)


['morning', 'Tom', 'Tom', 'Riverside', 'nothing', 'Suddenly', 'here', 'boy', 'smiling', 'running', 'running', 'laughing', 'politely', 'explain', 'responded', 'Meanwhile', 'School', 'said', 'Writing', 'playing', 'dancing', 'sunshine', 'Really', 'John', 'back', 'wondering', 'Why', 'that', 'all', 'caring', 'sharing', 'life', 'Books', 'music', 'matter']


In [None]:
morning_sentences = re.findall(r'[^.]*\bMorning\b[^.]*\.', text)
print(morning_sentences)


[' \nThe teacher said, "Morning is always the best time for Reading and Writing.']


In [None]:
repeated_words = re.findall(r'\b(\w+)\b(?:\W+\1\b)', text, re.IGNORECASE)
print(repeated_words)


['Tom', 'running', 'Why']


In [None]:
tokens = re.findall(r'\b[a-zA-Z]+\b', text.lower())
unique_words = set(tokens)

print("Total tokens:", len(tokens))
print("Unique word types:", len(unique_words))


Total tokens: 103
Unique word types: 84


In [None]:
stopwords = {'the','is','was','are','and','but','no','one','all','for','to','a'}

content_words = [w for w in tokens if w not in stopwords]
print(content_words[:30])


['fine', 'morning', 'tom', 'tom', 'walking', 'along', 'riverside', 'thinking', 'about', 'everything', 'nothing', 'suddenly', 'he', 'shouted', 'what', 'you', 'doing', 'here', 'boy', 'smiling', 'kept', 'running', 'running', 'laughing', 'sarah', 'asked', 'politely', 'could', 'you', 'please']


In [None]:
unigrams = re.findall(r'\b\w+\b', text.lower())
print(unigrams[:20])


['one', 'fine', 'morning', 'tom', 'tom', 'was', 'walking', 'along', 'the', 'riverside', 'thinking', 'about', 'everything', 'and', 'nothing', 'suddenly', 'he', 'shouted', 'what', 'are']


In [None]:
bigrams = re.findall(r'\b\w+\s+\w+\b', text.lower())
print(bigrams[:20])


['one fine', 'was walking', 'along the', 'thinking about', 'everything and', 'he shouted', 'what are', 'you doing', 'the boy', 'kept running', 'and laughing', 'sarah asked', 'could you', 'please explain', 'but no', 'one responded', 'a flying', 'bird was', 'singing near', 'the school']


In [None]:
trigrams = re.findall(r'\b\w+\s+\w+\s+\w+\b', text.lower())
print(trigrams[:20])


['one fine morning', 'was walking along', 'thinking about everything', 'what are you', 'sarah asked politely', 'could you please', 'but no one', 'a flying bird', 'was singing near', 'the teacher said', 'morning is always', 'the best time', 'for reading and', 'children were playing', 'and enjoying the', 'the girl looked', 'why would anyone', 'and loving bring', 'meaning to life', 'these things matter']


In [None]:
sentences = re.findall(r'[^.!?]+[.!?]', text)
print(sentences)


['\nOne fine morning, Tom!', ' Tom!', ' was walking along the Riverside, thinking about everything and nothing.', ' \nSuddenly, he shouted: "What are you doing here?', '" \nThe boy, smiling, kept running, running, and laughing!', ' \nSarah asked politely, "Could you please explain?', '" but no one responded.', ' \nMeanwhile, a Flying bird was singing near the School.', ' \nThe teacher said, "Morning is always the best time for Reading and Writing.', '" \nChildren were playing, dancing, and enjoying the sunshine.', ' \n"Really?', '" asked John.', ' \nThe girl looked back, wondering, "Why, why would anyone do that?', '" \nAfter all, caring, sharing, and loving bring meaning to life.', ' \nBooks, music, and games—these things matter!']
