# Regex without regex 

Reference: PRegEx documentation, usage example — https://pregex.readthedocs.io/en/latest/introduction.html#usage-example

In [1]:
%pip install pregex

from pregex.core.classes import AnyButWhitespace, AnyWhitespace
from pregex.core.operators import Either
from pregex.core.quantifiers import OneOrMore, Optional

In [2]:
# Sample sentence listing people as "<family name>, <title> <given name>".
# The value deliberately keeps the leading space and the line break that
# follows "Johnson, " (written out here as an explicit "\n").
text = (
    " Here are the full name of some people: Smith, Mr. Robert Johnson, \n"
    "Ms Mary Wilson, Mrs. Barbara Taylor, Dr Karen Lewis, Mr. John"
)

text

' Here are the full name of some people: Smith, Mr. Robert Johnson, \nMs Mary Wilson, Mrs. Barbara Taylor, Dr Karen Lewis, Mr. John'

In [3]:
# Each entry has the shape "<family name>, <title>[.] <given name>".
# Family and given names are runs of non-whitespace characters.
family_name = OneOrMore(AnyButWhitespace())
title = Either("Mrs", "Mr", "Ms", "Dr")
given_name = OneOrMore(AnyButWhitespace())

# The list in `text` wraps onto a second line right after "Johnson, ", so the
# gap following the comma can be " \n" rather than a single space. Matching it
# as one-or-more whitespace characters keeps that entry from being skipped.
separator = ',' + OneOrMore(AnyWhitespace())

pre = family_name + separator + title + Optional(".") + ' ' + given_name
pre.get_matches(text)

['Smith, Mr. Robert',
 'Johnson, \nMs Mary',
 'Wilson, Mrs. Barbara',
 'Taylor, Dr Karen',
 'Lewis, Mr. John']

In [4]:
from pregex.core.classes import AnyLetter, AnyDigit, AnyFrom
from pregex.core.quantifiers import Optional, AtLeastAtMost
from pregex.core.operators import Either
from pregex.core.groups import Capture
from pregex.core.pre import Pregex

# --- Optional prefixes -------------------------------------------------------
# "http://" or "https://"; the whole scheme may be missing.
http_protocol = Optional('http' + Optional('s') + '://')

# A leading "www." may be missing too.
www = Optional('www.')

# --- Host given as a domain name ---------------------------------------------
# A single letter or digit.
alphanum = AnyLetter() | AnyDigit()

# Starts and ends with an alphanumeric character, with between 1 and 61
# alphanumerics, hyphens or dots in the middle.
domain_name = (
    alphanum
    + AtLeastAtMost(alphanum | AnyFrom('-', '.'), n=1, m=61)
    + alphanum
)

# Only ".com" and ".org" are accepted as top-level domains.
tld = '.' + Either('com', 'org')

# --- Host given as an IPv4 address with a port -------------------------------
# One to three digits per octet.
ip_octet = AnyDigit().at_least_at_most(n=1, m=3)

# Exactly four digits, the first of which is not "0".
port_number = (AnyDigit() - '0') + 3 * AnyDigit()

# --- Full pattern ------------------------------------------------------------
# Optional scheme followed by either "[www.]<domain><tld>" (domain captured)
# or "<octet>.<octet>.<octet>.<octet>:<port>".
pre: Pregex = http_protocol + Either(
    www + Capture(domain_name) + tld,
    3 * (ip_octet + '.') + ip_octet + ':' + port_number
)

In [5]:
# Render the composed pattern as its underlying regular-expression string.
regex = pre.get_pattern()
regex

'(?:https?:\\/\\/)?(?:(?:www\\.)?([\\da-zA-Z][\\da-zA-Z\\-.]{1,61}[\\da-zA-Z])\\.(?:com|org)|(?:\\d{1,3}\\.){3}\\d{1,3}:[1-9]\\d{3})'

In [6]:
text = "text--192.168.1.1:8000--text--http://www.wikipedia.org--text--https://youtube.com--text"
text

'text--192.168.1.1:8000--text--http://www.wikipedia.org--text--https://youtube.com--text'

In [7]:
# Collect every substring of `text` that the combined pattern matches.
matches = pre.get_matches(text)
matches

['192.168.1.1:8000', 'http://www.wikipedia.org', 'https://youtube.com']

In [8]:
# One tuple of captured groups per match. The IP-address match yields None
# because the only capturing group sits in the domain-name alternative.
groups = pre.get_captures(text)
groups

[(None,), ('wikipedia',), ('youtube',)]

In [9]:
from pregex.core.pre import Pregex
from pregex.core.classes import AnyDigit
from pregex.core.operators import Either
from pregex.meta.essentials import HttpUrl, IPv4

In [10]:
# Exactly four digits, the first of which is not "0".
port_number = (AnyDigit() - '0') + 3 * AnyDigit()

# Same two alternatives as the hand-built pattern, now assembled from the
# ready-made HttpUrl and IPv4 classes.
url_branch = HttpUrl(capture_domain=True, is_extensible=True)
ip_port_branch = IPv4(is_extensible=True) + ':' + port_number

pre: Pregex = Either(url_branch, ip_port_branch)

In [11]:
# Run the meta-class based pattern against the same sample `text`.
pre.get_matches(text)

['192.168.1.1:8000', 'http://www.wikipedia.org', 'https://youtube.com']