# Regexp

All regular-expression functions are provided by the `re` module.

## 1. Match pattern

- The `re.match(pattern, string, flags=0) -> re.Match | None` function checks whether the beginning of `string` matches `pattern`, returning a match object on success and `None` otherwise. The `flags` argument can be:

    - `re.A` (or `re.ASCII`): Make `\w`, `\W`, `\b`, `\B`, `\d`, `\D`, `\s` and `\S` perform ASCII-only matching instead of full Unicode matching.
    - `re.DEBUG`: Display debug information about compiled expression. No corresponding inline flag.
    - `re.I` (or `re.IGNORECASE`): Ignore case.
    - `re.L` (or `re.LOCALE`): Make `\w`, `\W`, `\b`, `\B` and case-insensitive matching dependent on the current locale.
    - `re.M` (or `re.MULTILINE`): Multi-line matching: `^` and `$` also match at the beginning and end of each line, not only at the beginning and end of the whole string.
    - `re.S` (or `re.DOTALL`): Make the '`.`' special character match any character at all, including a newline; without this flag, '`.`' will match anything except a newline. Corresponds to the inline flag `(?s)`.
    - `re.X` (or `re.VERBOSE`): This flag allows you to write regular expressions that look nicer and are more readable by allowing you to visually separate logical sections of the pattern and add comments.

In [None]:
import re


# The pattern is three fully anchored alternatives:
#   1. optional "(", "0" plus 2 digits, optional ")" / "-" / whitespace, then 8 digits
#   2. optional "(", "0" plus 3 digits, optional ")" / "-" / whitespace, then 7 or 8 digits
#   3. "1" followed by exactly 10 digits
pattern = r"^\(?0\d{2}[\)\-\s]?\d{8}$|^\(?0\d{3}[\)\-\s]?\d{7,8}$|^1\d{10}$"
print(f'* When "pattern = "{pattern}""')

# First alternative: "0" + 2 digits, separated from the 8 digits by ")", "-" or a space.
r = re.match(pattern, "(029)85556666")
print(f'  call "re.match(pattern, "(029)85556666")" return: "{r}"')

r = re.match(pattern, "029-85556666")
print(f'  call "re.match(pattern, "029-85556666")" return: "{r}"')

r = re.match(pattern, "029 85556666")
print(f'  call "re.match(pattern, "029 85556666")" return: "{r}"')

# Second alternative: "0" + 3 digits followed by 7 digits.
r = re.match(pattern, "(0917)8556666")
print(f'  call "re.match(pattern, "(0917)8556666")" return: "{r}"')

r = re.match(pattern, "0917-8556666")
print(f'  call "re.match(pattern, "0917-8556666")" return: "{r}"')

r = re.match(pattern, "0917 8556666")
print(f'  call "re.match(pattern, "0917 8556666")" return: "{r}"')

# Third alternative: "1" followed by 10 digits.
r = re.match(pattern, "13991300001")
print(f'  call "re.match(pattern, "13991300001")" return: "{r}"')

- Find all matched substrings

In [None]:
import re


# One or more consecutive digits.
pattern = r"\d+"
print(f'* When "pattern = "{pattern}""')

# findall returns all non-overlapping matches as a list of strings.
r = re.findall(pattern, "123 456 789")
print(f'  call "re.findall(pattern, "123 456 789")" return: "{r}"')

- Find all and get result as iterator

In [None]:
import re


# One or more consecutive digits.
pattern = r"\d+"
print(f'* When "pattern = "{pattern}""')

# finditer returns an iterator that yields match objects.
r = re.finditer(pattern, "123 456 789")
# "{r}" shows the iterator object itself; list(r) consumes it to show the matches.
print(f'  call "re.finditer(pattern, "123 456 789")" return: "{r}", and "list(r)" is: {list(r)}')

## 2. Split and substitute

- Split string by pattern

In [None]:
import re


# One or more consecutive whitespace characters.
pattern = r"\s+"
print(f'* When "pattern = "{pattern}""')

# maxsplit=2 performs at most two splits; the rest of the string
# (including its tab) is returned unsplit as the last element.
r = re.split(pattern, "abc    def ghi\tjkl", maxsplit=2)
print(f'  call "re.split(pattern, "abc    def ghi\\tjkl", maxsplit=2)" return: "{r}"')

- Replace matched substrings

In [None]:
import re


def repl(mo: re.Match) -> str:
    """Report the span and text of the match ``mo`` and replace it with ``"X"``."""
    span, group = mo.span(), mo.group()
    print(f'  >> "mo.span()" return: "{span}", and "mo.group()" return: "{group}"')
    return "X"


# One or more consecutive digits.
pattern = r"\d+"
print(f'* When "pattern = "{pattern}""')

# Replacement given as a plain string.
r = re.sub(pattern, "X", "123a456b789c")
print(f'  call "re.sub(pattern, "X", "123a456b789c")" return: "{r}"')

# Replacement given as a callable: it is called with each match object
# and its return value is substituted for the matched text.
r = re.sub(pattern, repl, "123a456b789c")
print(f'  call "re.sub(pattern, repl, "123a456b789c")" return: "{r}"')

## 3. Search and grouping 

In [None]:
import re


# Four comma-separated numbers; the first and third groups are named
# "n1" and "n2", the second and fourth are positional only.
pattern = r"(?P<n1>\d+),(\d+),(?P<n2>\d+),(\d+)"
print(f'* When "pattern = "{pattern}""')

rs = re.search(pattern, "10,20,30,50")
print(f'  and "rs = re.search(pattern, "10,20,30,50")", "rs={rs}"')

r = rs.groups()  # Get all groups of search result
print(f'  then call "rs.groups()" return: "{r}"')

# The messages below name the method actually called, group(), not groups().
r = rs.group(1)  # Get a single group by its 1-based index
print(f'  then call "rs.group(1)" return: "{r}"')

r = rs.group(4)
print(f'  then call "rs.group(4)" return: "{r}"')

r = rs.group("n2")  # Get named group
print(f'  then call "rs.group("n2")" return: "{r}"')

r = rs.start(1)  # Get the start position of group 1
print(f'  then call "rs.start(1)" return: "{r}"')

r = rs.end(1)  # Get the end position of group 1
print(f'  then call "rs.end(1)" return: "{r}"')

r = rs.groupdict()  # Get all named group as dictionary format
print(f'  then call "rs.groupdict()" return: "{r}"')

## 4. Compile the regexp

- Compile string pattern

In [None]:
from typing import List
import re


# One or more consecutive whitespace characters.
pattern = r"\s+"
print(f'* When "pattern = "{pattern}""')

# A compiled pattern object exposes match/findall/split/sub as methods,
# so the pattern no longer has to be passed on every call.
rx = re.compile(pattern)
print(f'  and "rx = re.compile(pattern)", "rx={rx}"')

r = rx.match("\t")
print(f'  then call "rx.match("\\t")" return: "{r}"')

r = rx.findall("1 2\t3  4")
print(f'  then call "rx.findall("1 2\\t3  4")" return: "{r}"')

r = rx.split("1 2\t3  4")
print(f'  then call "rx.split("1 2\\t3  4")" return: "{r}"')

# The message shows both arguments of the call: replacement first, then input.
r = rx.sub("-", "1 2\t3  4")
print(f'  then call "rx.sub("-", "1 2\\t3  4")" return: "{r}"')


class Repl:
    """Callable replacement for ``sub`` that records every matched substring."""

    def __init__(self) -> None:
        # Matched substrings, in the order the instance was called.
        self._list: List[str] = []

    def __call__(self, mo: re.Match) -> str:
        """Remember the text matched by ``mo`` and replace it with ``"-"``."""
        matched = mo.group()
        self._list.append(matched)
        return "-"

    @property
    def list(self):
        """Return the internal list of recorded matches (not a copy)."""
        return self._list


# A fresh recorder with an empty list of matches.
repl = Repl()

# The instance is passed as the replacement callable; after the call,
# repl.list holds the substrings it was called with.
r = rx.sub(repl, "1 2\t3  4")
print(f'  then call "rx.sub(repl, "1 2\\t3  4")" return: "{r}", and "repl.list" return: "{repl.list}"')

## 5. Escape string

- Escape a pattern string

In [None]:
import re


# re.escape backslash-escapes the characters that are special in a pattern,
# so the text "[-\]" can be embedded literally.
esp = re.escape(r"[-\]")
# The backslash is doubled because this f-string is not raw: a bare "\]"
# is an invalid escape sequence and triggers a warning at compile time.
print(f'* When "esp = re.escape(r"[-\\]")", "esp={esp}"')

# Wrap the escaped text in [...] to get a character class of those characters.
pattern = f"[{esp}]"
# Doubled braces print the source expression literally instead of interpolating esp.
print(f'  and "pattern = f"[{{esp}}]"", pattern="{pattern}"')

rx = re.compile(pattern)
print(f'  and "rx = re.compile(pattern)", "rx={rx}"')

r = rx.findall(r"-\]a[")
print(f'  then call "rx.findall(r"-\\]a[")" return: "{r}"')

- Use `sub` and `escape` to do replacement

In [None]:
import re


# Mapping from each character to replace to its replacement.
characters = {"a": "A", "b": "B", "c": "C"}


def replace(mo: re.Match) -> str:
    """Return the ``characters`` entry for the text matched by ``mo``."""
    matched = mo.group(0)
    return characters[matched]


print(f'* When "characters={characters}"')

# Escape every key and join them into one alternation.
pattern = "|".join(map(re.escape, characters))
print(f'  then "pattern = "|".join(map(re.escape, characters))", pattern="{pattern}"')

rx = re.compile(pattern)
print(f'  then "rx = re.compile(pattern)", rx="{rx}"')

# Each match is looked up in the mapping; unmatched text is left untouched.
r = rx.sub(replace, "abcde")
print(f'  then call "rx.sub(replace, "abcde")" return: "{r}"')