# Text Preprocessing
- Lowercase
- Regular Expressions
- Tokenization
- Stop words removal
- Stemming and Lemmatization
- N-grams


## Lowercase

In [None]:
# Sample sentence with mixed casing, used to demonstrate lowercasing.
text = (
    "Natural Language Processing (NLP) is amazing! "
    "It's used in chatbots, search engines, and more."
)

# Lowercasing: produces a new string; `text` itself is left unchanged.
lowercase_text = str.lower(text)
print("Lowercase:", lowercase_text)

Lowercase: natural language processing (nlp) is amazing! it's used in chatbots, search engines, and more.


## Regex Operations

### Matching Patterns

In [None]:
import re

# Example: locate the first run of four consecutive digits in a sentence.
text = 'The year is 2025'
pattern = r'\d{4}'

# re.search scans the whole string and yields None when nothing matches.
match = re.search(pattern, text)
print("No match found" if match is None else f"Matched: {match.group()}")

Matched: 2025


### Extracting Substrings

In [None]:
text = 'Contact us at info@example.com'
# Local part, "@", domain labels, then a top-level domain of two or more
# ASCII letters.  Inside a character class "|" is a literal pipe rather than
# alternation, so the TLD class is written [A-Za-z]; otherwise strings such
# as "user@host.c|m" would be accepted as addresses.
pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'

# findall returns every non-overlapping match as a list of strings.
emails = re.findall(pattern, text)
print("Extracted emails:", emails)

Extracted emails: ['info@example.com']


### Replacing Substrings

In [None]:
text = 'The upcoming movie in 12/30/2024 until 01-02-2025'

# Three capture groups (two digits, two digits, four digits) separated by
# either "-" or "/".
pattern = r'(\d{2})[-/](\d{2})[-/](\d{4})'
# Emit group 3 first, then groups 1 and 2, joined by hyphens
# (reformat to YYYY-MM-DD).
replacement = r'\3-\1-\2'

date_regex = re.compile(pattern)
formatted_text = date_regex.sub(replacement, text)
print("Formatted text:", formatted_text)

Formatted text: The upcoming movie in 2024-12-30 until 2025-01-02


### Splitting Strings

In [None]:
# Split wherever one or more commas and/or whitespace characters occur.
pattern = r'[,\s]+'
text = 'apple, banana, cherry orange'

separator = re.compile(pattern)
fruits = separator.split(text)
print("Split text:", fruits)

Split text: ['apple', 'banana', 'cherry', 'orange']


### Removing Punctuation
Punctuation often doesn't contribute meaningfully to text analysis.

In [None]:
import re

text = "Hello, World! NLP is amazing; isn't it?"
# \w counts the underscore as a word character, so [^\w\s] on its own would
# leave "_" in the text; the extra "|_" alternative removes it as well.
punctuation_pattern = r'[^\w\s]|_'
clean_text = re.sub(punctuation_pattern, '', text)  # Removes punctuation
print("Without punctuation:", clean_text)

Without punctuation: Hello World NLP is amazing isnt it


### Removing Numbers
For tasks where numbers are irrelevant, remove them.

In [None]:
text = "The year is 2024, and I have 2 apples."
# Delete every run of digits; the surrounding spaces and punctuation stay.
digit_run = re.compile(r'\d+')
clean_text = digit_run.sub('', text)
print("Without numbers:", clean_text)

Without numbers: The year is , and I have  apples.


### Lowercasing and Whitespace Normalization

In [None]:
text = "   NLP   is   FUN!  "
# Trim both ends and lowercase first, then collapse each remaining run of
# whitespace into a single space.
trimmed_lower = text.strip().lower()
clean_text = re.sub(r'\s+', ' ', trimmed_lower)
print("Normalized text:", clean_text)

Normalized text: nlp is fun!


### Extracting URLs

In [None]:
text = "Visit https://example.com or http://another-site.org for more info."
# "s?" accepts both http:// and https://; each match then runs up to the
# next whitespace character.
url_regex = re.compile(r'https?://[^\s]+')
urls = url_regex.findall(text)
print("Extracted URLs:", urls)

Extracted URLs: ['https://example.com', 'http://another-site.org']


### Finding Hashtags and Mentions

In [None]:
text = "Analyze #data and follow @data_science!"

# A tag is its marker character ("#" or "@") followed by one or more word
# characters, so the trailing "!" is not part of the mention.
hashtag_regex = re.compile(r'#\w+')
mention_regex = re.compile(r'@\w+')

mentions = mention_regex.findall(text)
hashtags = hashtag_regex.findall(text)
print("Hashtags:", hashtags)
print("Mentions:", mentions)

Hashtags: ['#data']
Mentions: ['@data_science']


### Removing HTML Tags

In [None]:
text = "<html><body><p>Hello, World!</p></body></html>"
# "." does not match a newline by default, so '<.*?>' misses a tag whose
# attributes wrap onto another line.  A negated character class has no such
# limit and still stops at the first closing ">".
tag_pattern = r'<[^>]*>'
clean_text = re.sub(tag_pattern, '', text)
print("Without HTML tags:", clean_text)

Without HTML tags: Hello, World!


## Tokenization

In [None]:
!pip install nltk --upgrade
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

text = "Hi, my name is Muhamad Ali from Data Science Methods"
token = word_tokenize(text)
print(token)



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['Hi', ',', 'my', 'name', 'is', 'Muhamad', 'Ali', 'from', 'Data', 'Science', 'Methods']


## Stop words removal

In [None]:
import nltk  # imported here so the cell does not depend on an earlier one
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
# A set gives O(1) membership tests inside the filter below.
stop_words = set(stopwords.words('english'))

text = "Hi, my name is Muhamad Ali. im a man who working as data scienctist"
tokens = word_tokenize(text)
# NLTK's English stop word list is lowercase, so compare the lowercased
# token; otherwise capitalised stop words ("The", "Is", ...) slip through.
# The original casing is kept in the output.
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print(filtered_tokens)

['Hi', ',', 'name', 'Muhamad', 'Ali', '.', 'im', 'man', 'working', 'data', 'scienctist']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Stemming and Lemmatization

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the tokenizer and WordNet data packages.
nltk.download('punkt_tab')
nltk.download('wordnet')
# Kept as the cell's final bare expression: the notebook echoes its return
# value as the cell result.
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# One instance of each normaliser, created once at notebook level.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [None]:
text = 'running runs ran'
tokens = word_tokenize(text)

# Apply each normaliser to every token, preserving token order.
stemmed_words = list(map(stemmer.stem, tokens))
# NOTE(review): lemmatize() is called with the word only (no part-of-speech
# argument) -- confirm that is intended for these verb forms.
lemmatized_words = list(map(lemmatizer.lemmatize, tokens))
print('Stemmed:', stemmed_words)
print('Lemmatized:', lemmatized_words)

Stemmed: ['run', 'run', 'ran']
Lemmatized: ['running', 'run', 'ran']


## N-grams

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

text = ["I love programming in Python"]

# Initialize CountVectorizer with n-gram range
vectorizer = CountVectorizer(ngram_range=(2, 2))  # bigrams only

# Fit the vocabulary and build the document-term count matrix.  The result is
# not bound to the name `ngrams`, because the next cell imports a function of
# that name from nltk.util; sharing the name would make the notebook break
# when cells are re-run out of order.
bigram_counts = vectorizer.fit_transform(text)

# Get feature names (bigrams).  As the printed result shows, tokens come out
# lowercased and the single-letter "I" is not part of any bigram.
print("Bigrams:", vectorizer.get_feature_names_out())

Bigrams: ['in python' 'love programming' 'programming in']


In [None]:
from nltk.util import ngrams

text = "I love programming in Python"
# Plain whitespace split: casing is preserved and every word is kept.
words = text.split()

# Generate 2-grams (bigrams).  ngrams() yields tuples lazily, so the result
# is materialised with list() before printing.
bigrams = list(ngrams(words, 2))
print("Bigrams:", bigrams)

# Generate 3-grams (trigrams)
trigrams = list(ngrams(words, 3))
print("Trigrams:", trigrams)

Bigrams: [('I', 'love'), ('love', 'programming'), ('programming', 'in'), ('in', 'Python')]
Trigrams: [('I', 'love', 'programming'), ('love', 'programming', 'in'), ('programming', 'in', 'Python')]
