<a href="https://colab.research.google.com/github/aynaval/NLP-basics/blob/main/Regex_guide.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re # library for regular expression

In [3]:
text = 'I have called the service desk 100 times and nobody replies to me. I need a conversation ASAP!! My number is 111-1234567!'
result = re.findall('111-1234567', text)
result

['111-1234567']

In [5]:
result = re.findall(f'\d',text)
result

# r is here means raw text, ie telling python to not treat '\' as escape charecter but treat it as normal charecter.

['1', '0', '0', '1', '1', '1', '1', '2', '3', '4', '5', '6', '7']

In [8]:
print(r'hello \nworld!')
print('hello \nworld!')

hello \nworld!
hello 
world!


In [10]:
result = re.findall(r'\d+', text) 
result

# + is kleene star plus which means one or more

['100', '111', '1234567']

In [12]:
result = re.findall(r'\d{3}-\d{7}',text)
result

# {number} means that minimum that many numbe of times

['111-1234567']

In [13]:
text = 'I have called the service desk 100 times and nobody replies to me. I need a conversation ASAP!! My number is 111-1234567! My other number is 7654321!'

In [14]:
result = re.findall(r'\d{3}-d{7}|\d{7}',text)
result 


# | = or gate

['1234567', '7654321']

In [17]:
text = '111-1234567! That is my number! The other one is 7654321!'

In [19]:
result = re.findall(r'^\d{3}-\d{7}|^\d{7}',text)
result

#^ = beginning of the sentence 

['111-1234567']

# Regex basic guide

## 1. Literal charecters



*   Pattren : [abc]
*   Matches : Any character a,b or c.
*   Examples : [aeiou] matches any vowel character.



In [24]:
pattren = r'[aeiou]'
text = 'Hello World'
result = re.findall(pattren,text)
result

['e', 'o', 'o']

## 2. Character classes
* pattern : \d, \w , \s 
* Matches : Any digit(\d), word character (\w) or whitespace character (\s)
* Example : \d{3} matches any three numbers

In [25]:
pattren = r'\d{3}'
text = 'The code id 123 and the number is 4567.'
result = re.findall(pattren,text)
result

['123', '456']

## 3. Negated character classes
* Pattern : [^abc]
* Matches : Any character except 'a', 'b', or 'c'.
* Example : [^0-9] matches any non-digit character.

In [26]:
pattren = r'[^0-9]'
text = 'abc123'
result = re.findall(pattren,text)
result

['a', 'b', 'c']

## 4. Quantifiers
* Pattern : *, +, ?, {m}, {m,n}
* Matches : Zero or more (*), one or more (+), zero or one (?), exactly m occurrences ({m}), between m and n occurrences ({m,n}).
* Example : \d{2,4} matches a 2 to 4-digit number.

In [32]:
pattren = r'\d{2,4}'
text = 'abc123'
result = re.findall(pattren,text)
result


['123']

## 5. Anchors

* Pattern : ^, \$
* Matches : The start (^) or end ($) of a line.

* Example : ^\d{3}$ matches a three-digit number that occupies the entire line.

In [35]:
pattern = r'^\d{3}$'
text = "123"
result = re.findall(pattern, text)
print(result)

['123']


## 6. Alternation:

* Pattern : |
* Matches : Either the pattern before or after the |.
* Example : cat|dog matches either "cat" or "dog".

In [36]:
pattern = r'cat|dog'
text = "I have a cat and a dog."
result = re.findall(pattern, text)
print(result)

['cat', 'dog']


## 7.Grouping and Capturing

* Pattern : (...)
* Matches : The pattern inside the parentheses as a group and captures it for later use.
* Example : (ab)+ matches one or more occurrences of "ab"

In [37]:
  pattern = r'cat|dog'
text = "I have a cat and a dog."
result = re.findall(pattern, text)
print(result)

['cat', 'dog']


## 8. Greedy and Lazy Matching

* Pattern : .*, .*?
* Matches : Greedy matching (.*) matches as much as possible, while lazy matching (.*?) matches as little as possible.
* Example : a.*b matches the longest string starting with 'a' and ending with 'b'.

In [38]:
pattern = r'a.*b'
text = "aabbbb"
result = re.findall(pattern, text)
print(result)  


['aabbbb']


## 9. Backreferences

* Pattern : \1, \2, etc.
* Matches : The contents of a previous capturing group.
* Example : (a)\1 matches "aa".

In [40]:
pattern = r'(a)\1'
text = "aa"
result = re.findall(pattern, text)
print(result)

['a']


# 10. Lookaheads and Lookbehinds

* Pattern : (?=...), (?<=...)
* Matches : A pattern only if it is followed by ((?=...)) or preceded by ((?<=...)) another pattern.
* Example: (?=John)Doe matches "Doe" only if it is followed by "John".


In [41]:
pattern = r'(?=John)Doe'
text = "JohnDoe"
result = re.findall(pattern, text)
print(result)

[]
