# RE 

### 01: PATTERN LITERALS & FLAGS (syntax forms)

 raw string literal for patterns: r"pattern"

 compile with flags: re.compile(pattern, flags)

 flags examples: re.I, re.M, re.S, re.X



### 02: CORE FUNCTIONS (signature / return type)

 re.search(pattern, string, flags=0) -> Match | None

 re.match(pattern, string, flags=0) -> Match | None

 re.fullmatch(pattern, string, flags=0) -> Match | None

 re.findall(pattern, string, flags=0) -> list[str] or list[tuple]

 re.finditer(pattern, string, flags=0) -> iterator[Match]

 re.sub(pattern, repl, string, count=0, flags=0) -> str

 re.subn(pattern, repl, string, count=0, flags=0) -> (str, int)

 re.split(pattern, string, maxsplit=0, flags=0) -> list[str]

 re.compile(pattern, flags=0) -> Pattern

----

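 Pattern object methods (same as the module-level functions minus the pattern and flags args, which are fixed at compile time; search/match/fullmatch/findall/finditer also accept optional pos and endpos):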

 Pattern.search(string) -> Match | None

 Pattern.match(string) -> Match | None

 Pattern.fullmatch(string) -> Match | None

 Pattern.findall(string) -> list

 Pattern.finditer(string) -> iterator[Match]

 Pattern.sub(repl, string, count=0) -> str

 Pattern.subn(repl, string, count=0) -> (str, int)

 Pattern.split(string, maxsplit=0) -> list[str]



#### 03: MATCH OBJECT API (methods / returns)

 m = re.search(...)

 m.group([group1, ...]) -> str | tuple

 m.groups(default=None) -> tuple

 m.groupdict() -> dict[name->str]

 m.start([group]) -> int

 m.end([group]) -> int

 m.span([group]) -> (int, int)



 
#### 04: CHARACTER CLASSES & ESCAPES (syntax examples)

 .         -> any char except newline (also matches newline with re.S)

 \d        -> digit ([0-9] with re.ASCII; any Unicode decimal digit by default for str patterns)

 \D        -> non-digit

 \w        -> word char ([A-Za-z0-9_] with re.ASCII; Unicode letters, digits and underscore by default for str patterns)

 \W        -> non-word

 \s        -> whitespace

 \S        -> non-whitespace

 [abc]     -> any of a, b, or c

 [a-z]     -> range

 [^a-z]    -> negated class

 \\        -> literal backslash

 \t \n \r  -> tab, newline, carriage return




#### 05: QUANTIFIERS (syntax)

 X*        -> 0 or more (greedy)

 X+        -> 1 or more (greedy)

 X?        -> 0 or 1

 X{m}      -> exactly m

 X{m,}     -> m or more

 X{m,n}    -> between m and n (inclusive)

 Make lazy (non-greedy): append ? -> *? +? ?? {m,n}?




#### 06: GROUPS (syntax)

 ( ... )           -> capturing group

 (?: ... )         -> non-capturing group

 (?P<name> ... )   -> named capturing group

 (?P=name)         -> backreference to named group

 \1 \2           -> backreference to numbered group



 
#### 07: ANCHORS & BOUNDARIES

 ^         -> start of string (or line with re.M)

 $         -> end of string, or just before a trailing newline at the end of the string (end of each line with re.M)

 \b        -> word boundary

 \B        -> non-word-boundary




#### 08: ALTERNATION & GROUPING

 A|B       -> match A or B

 (a|b)c    -> group alternation





#### 09: LOOKAROUNDS (zero-width assertions)

 (?=...)   -> positive lookahead (A(?=B))

 (?!...)   -> negative lookahead (A(?!B))

 (?<=...)  -> positive lookbehind (?<=B)A

 (?<!...)  -> negative lookbehind (?<!B)A

 (Note: lookbehind must be fixed-width in 're')





#### 11: COMMON USEFUL PATTERNS (syntax only)

 Email: r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"

 URL: r"https?://[^\s]+"  or r"https?://\S+"

 Integer: r"[+-]?\d+"

 Float: r"[+-]?\d+(?:\.\d+)?"

 Word token: r"\w+"

 Whitespace collapse: r"\s+"

 HTML tag (simple): r"<.*?>"

 HTML (nested/balanced tags): cannot be matched reliably with re -> use an HTML parser (e.g. html.parser)




#### 12: QUICK REFERENCE (one-line cheats)

 remove punctuation: re.sub(r"[^\w\s]", "", text)

 tokenize words: re.findall(r"\w+", text)

 find spans: for m in re.finditer(r"\w+", text): m.span()

 replace numbers: re.sub(r"\d+", "<NUM>", text)

 split by commas or spaces: re.split(r"[ ,]+", text)


In [1]:
import re

## re.search()

✔ Looks anywhere in the string

✔ Stops after finding the FIRST match

✔ Returns: Match object OR None

In [2]:
m=re.search("cat", "my cat is cute")

In [3]:
m

<re.Match object; span=(3, 6), match='cat'>

In [4]:
m=re.search("apple", "my cat is cute")

In [5]:
m

## re.match()

✔ Only checks the BEGINNING of the string

✔ Returns: Match object OR None

In [6]:
re.match("cat", "cat is here")

<re.Match object; span=(0, 3), match='cat'>

In [7]:
m=re.match("cat", "my cat is here")

In [8]:
m

## re.fullmatch()

✔ Pattern must match the ENTIRE string

In [9]:
re.fullmatch(r"\d+", "123")

<re.Match object; span=(0, 3), match='123'>

In [10]:
m=re.fullmatch(r"\d+", "123")

In [11]:
m

<re.Match object; span=(0, 3), match='123'>

## re.findall()

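✔ Returns a list of all non-overlapping matched substrings (if the pattern contains capturing groups, the list holds the groups instead of the whole match)

✔ Gives only the matched text, with no position information (use re.finditer() when you need spans)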

❌ Does NOT return Match objects

In [12]:
re.findall(r"\d+", "a1 b22 c333")

['1', '22', '333']

In [13]:
m=re.findall(r"\d+", "a1 b221p1 c333")

In [14]:
m

['1', '221', '1', '333']

## re.finditer()

✔ Returns an iterator of Match objects

In [15]:
for m in re.finditer(r"\d+", "a1 b22 c333"):
    print(m.group(), m.start(), m.end())


1 1 2
22 4 6
333 8 11


## re.sub()

✔ Replaces matched text with something else

In [16]:
re.sub(r"\d+", "<NUM>", "I have 2 cats and 3 dogs")

'I have <NUM> cats and <NUM> dogs'

## re.split()

✔ Splits string USING a regex as delimiter

In [17]:
re.split(r"\s+", "I   love   NLP")

['I', 'love', 'NLP']

## re.compile()

✔ re.compile() creates a REGEX OBJECT

This object stores:

* your pattern

* settings / flags

* optimized internal regex engine instructions

✔ Instead of passing the pattern string to a module-level function on every call, you compile it ONCE up front.

Then you reuse it:

In [18]:
pattern = re.compile(r"\d+")
pattern.findall("a1 b22")

['1', '22']

In [19]:
pattern.findall("c333 d4444")

['333', '4444']