In [2]:
import re

In [3]:
# Tokenization: split text into word tokens.
# A token is a run of word characters that may contain internal apostrophes
# followed by more word characters, so contractions such as "Let's" stay in
# one piece instead of being split into 'Let' and 's'. The group is
# non-capturing so findall() returns the whole match.
text = "Hello world! Let's learn NLP."
token_regex = re.compile(r"\b\w+(?:'\w+)*\b")
tokens = token_regex.findall(text)
print(tokens)


['Hello', 'world', "Let's", 'learn', 'NLP']


In [4]:
# Strip punctuation: delete every character that is neither a word
# character (letter, digit, underscore) nor whitespace.
text = "Hello, world! NLP is awesome."
punctuation_regex = re.compile(r'[^\w\s]')
clean_text = punctuation_regex.sub('', text)
print(clean_text)



Hello world NLP is awesome


In [5]:
# Extracting email addresses from text.
# Pattern: local part, "@", domain, ".", then a TLD of two or more letters.
# Note: inside a character class "|" is a literal pipe, not alternation, so
# the TLD class must be [A-Za-z]; writing [A-Z|a-z] would also accept "|".
email_regex = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

text = "Contact us at support@example.com or sales@company.org."
emails = email_regex.findall(text)
print(emails)

['support@example.com', 'sales@company.org']


In [6]:
# Extract dates written in three layouts:
#   - 4-2-2 digit groups separated by "-" or "/"   (e.g. 2024-08-24)
#   - 2-2-4 digit groups separated by "-" or "/"   (e.g. 24/08/2024)
#   - a capitalized word, a 1-2 digit day, a comma and a 4-digit year
# The pattern checks the shape only; it does not validate calendar values.
text = "Important dates: 2024-08-24, 24/08/2024, August 24, 2024."
date_regex = re.compile(r'\b\d{4}[-/]\d{2}[-/]\d{2}\b|\b\d{2}[-/]\d{2}[-/]\d{4}\b|\b[A-Z][a-z]+\s\d{1,2},\s\d{4}\b')
dates = date_regex.findall(text)
print(dates)

['2024-08-24', '24/08/2024', 'August 24, 2024']


In [7]:
# Collect hashtags from social media text: a "#" immediately followed by
# one or more word characters. (This pattern covers hashtags only.)
text = "Follow the latest trends! #AI #MachineLearning"
hashtag_regex = re.compile(r'#\w+')
hashtags = hashtag_regex.findall(text)
print(hashtags)

['#AI', '#MachineLearning']


In [8]:
# Normalize whitespace: collapse every run of whitespace characters into a
# single space, then trim leading and trailing whitespace.
text = "This   is    a    text   with    extra  spaces."
whitespace_run = re.compile(r'\s+')
collapsed = whitespace_run.sub(' ', text)
clean_text = collapsed.strip()
print(clean_text)

This is a text with extra spaces.


In [9]:
# Extracting URLs from text.
# "\S+" is greedy, so sentence punctuation that directly follows a URL
# (for example the final "." of the sentence) is swallowed into the match.
# Strip such characters from the right end of every match.
text = "Check out https://www.example.com and http://www.test.com."
url_regex = re.compile(r'https?://\S+')
trailing_punctuation = '.,;:!?'
urls = [match.rstrip(trailing_punctuation) for match in url_regex.findall(text)]
print(urls)

['https://www.example.com', 'http://www.test.com']


In [10]:
# Find phone numbers in 3-3-4 digit grouping: the first group may be wrapped
# in parentheses, and groups may be separated by "-", "." or whitespace.
text = "Call us at (123) 456-7890 or 123-456-7890."
phone_regex = re.compile(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}')
phone_numbers = phone_regex.findall(text)
print(phone_numbers)

['(123) 456-7890', '123-456-7890']


In [11]:
 # Match identifiers with a fixed shape: "ORD-", four digits, "-", four digits.
text = "Your order ID is ORD-1234-5678."
order_id_regex = re.compile(r'ORD-\d{4}-\d{4}')
order_id = order_id_regex.findall(text)
print(order_id)

['ORD-1234-5678']


In [12]:
# Detect a word that is immediately repeated (e.g. "is is").
# The backreference \1 must match exactly the text captured by group 1, so
# the comparison is case-sensitive; findall() returns the captured word.
text = "This is is a test."
repeat_regex = re.compile(r'\b(\w+)\s+\1\b')
repeated_words = repeat_regex.findall(text)
print(repeated_words)

['is']


In [13]:
# Stop words to look for.
stop_words = ["the", "is", "in", "and", "a", "an", "to", "on"]

# Build one alternation pattern from the list. Each word is escaped so it is
# matched literally, and the \b anchors restrict matches to whole words.
escaped_words = [re.escape(word) for word in stop_words]
stop_words_pattern = r'\b(?:' + '|'.join(escaped_words) + r')\b'
stop_words_regex = re.compile(stop_words_pattern, flags=re.IGNORECASE)

# Sample sentence to scan.
text = "This is a simple text containing stop words like the and a."

# Every stop-word occurrence, in order of appearance (case-insensitive).
found_stop_words = stop_words_regex.findall(text)

print("Found stop words:", found_stop_words)

Found stop words: ['is', 'a', 'the', 'and', 'a']
