### Demonstration of noise removal from textual data, including using regular expressions to strip patterns such as hashtags, URLs, and email addresses

In [1]:
import re

# Sample noisy text
text = "This is a #sample tweet! Visit https://example.com for details. Contact me at example@email.com"

# Remove hashtags
text = re.sub(r"#\w+", "", text)

# Remove URLs. Require the "://" or "www." part so ordinary words that merely
# start with "http"/"www" (e.g. "httpd") or contain "www" are left alone.
text = re.sub(r"\bhttps?://\S+|\bwww\.\S+", "", text)

# Remove email addresses
text = re.sub(r"\S+@\S+", "", text)

# Remove special characters. Whitespace of any kind is kept at this stage so
# that words separated only by a newline or tab are not fused together.
text = re.sub(r"[^A-Za-z0-9\s]+", "", text)

# Collapse the gaps left behind by the removals into single spaces and trim the ends
text = re.sub(r"\s+", " ", text).strip()

print(text)


This is a  tweet Visit  for details Contact me at


### 1. Remove Emoticons & Emojis

In [3]:
import re

# Built once at definition time so repeated calls reuse the same compiled pattern.
# Each match is one or more emoji code points, each optionally followed by the
# invisible joiners/selectors that only make sense next to an emoji.
_EMOJI_PATTERN = re.compile(
    "(?:"
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs (includes skin-tone modifiers)
    "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001F1E6-\U0001F1FF"  # Regional indicator symbols (pairs render as flags)
    "\U00002600-\U000026FF"  # Miscellaneous Symbols (sun, lightning, soccer ball, ...)
    "\U00002702-\U000027B0"  # Dingbats
    "]"
    "[\uFE0F\u200D]*"        # Trailing variation selector-16 / zero-width joiner
    ")+"
)


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Characters in the Unicode ranges listed in ``_EMOJI_PATTERN`` are deleted.
    A variation selector (U+FE0F) or zero-width joiner (U+200D) that directly
    follows a removed emoji is deleted with it, so sequences such as a heart
    with emoji presentation or a joined family glyph leave no invisible
    residue. Joiners that are not attached to an emoji are left untouched.

    Surrounding whitespace and punctuation are not altered, so removing an
    emoji between two spaces leaves both spaces in place.

    Args:
        text: The string to clean.

    Returns:
        A new string with the matched characters removed.
    """
    return _EMOJI_PATTERN.sub('', text)

# The emoji are written as \U escapes (smiling face, rocket, fire) so the sample
# stays intact even if the file is saved or exported with the wrong encoding.
text = "Hello \U0001F60A, this is a test! \U0001F680\U0001F525"
clean_text = remove_emojis(text)
print(clean_text)  # Output: Hello , this is a test!


Hello , this is a test! 


### 2. Normalize Text (Lowercase & Remove Extra Spaces)

In [4]:
def normalize_text(text):
    """Lowercase ``text`` and squeeze every run of whitespace into one space.

    Leading and trailing whitespace is discarded; an empty or all-whitespace
    input yields an empty string.
    """
    lowered = text.lower()
    # split() with no argument breaks on any whitespace run and drops empty pieces
    tokens = lowered.split()
    return ' '.join(tokens)

# Sample input with leading/trailing blanks, repeated inner spaces, and mixed case
text = "  This is   an   EXAMPLE   TEXT!  "
normalized_text = normalize_text(text)
print(normalized_text)  # Output: this is an example text!


this is an example text!


### 3. Extract Dates from Mixed Text

In [5]:
import re

def extract_dates(text):
    """Find dates written in a few common formats inside free text.

    Recognised formats:
      * ``D/M/YYYY`` style with slashes, e.g. ``03/04/2025``
      * ``D-M-YYYY`` style with dashes, e.g. ``03-04-2025``
      * ``Month D, YYYY`` with an English month name or its usual
        abbreviation, e.g. ``March 4, 2025`` or ``Sept 9, 2024``

    Only the textual shape is checked; numeric values are not validated
    (``99/99/2025`` is still returned).

    Args:
        text: The string to search.

    Returns:
        A list of the matched date substrings, in order of appearance.
    """
    # Spell out each month instead of "<abbr>[a-z]*", which would also accept
    # unrelated words such as "Marcus" or "Decade" in front of "D, YYYY".
    month = (
        r'(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|'
        r'Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)'
    )
    date_pattern = (
        r'\b(?:'
        r'\d{1,2}/\d{1,2}/\d{4}'
        r'|\d{1,2}-\d{1,2}-\d{4}'
        r'|' + month + r' \d{1,2}, \d{4}'
        r')\b'
    )
    return re.findall(date_pattern, text)

# Sample sentence containing one date in each format the pattern handles:
# slash-separated, month-name, and dash-separated
text = "Today's date is 03/04/2025. Another format is March 4, 2025 or 03-04-2025."
dates = extract_dates(text)
print(dates)  # Output: ['03/04/2025', 'March 4, 2025', '03-04-2025']


['03/04/2025', 'March 4, 2025', '03-04-2025']
