# Język Python - Wykład 5.

## Wyrażenia regularne

In [None]:
import re

<img width=800 src="files/L5_img/automata.png">

<img width=300 src="files/L5_img/plus.png">

## Metaznaki

## Użycie

In [None]:
# Compile once and reuse: the alternation matches either 'ab' or 'bc'.
p = re.compile('ab|bc')
# Parenthesised single-argument print is valid both as a Python 2 statement
# and as a Python 3 function call, with identical output.
print(p)

In [None]:
# match() anchors at position 0; 'bc' is one of the alternatives, so a match
# object is printed.  print(...) runs unchanged on Python 2 and 3.
print(p.match('bc'))

In [None]:
# Neither 'ab' nor 'bc' starts at position 0 of 'ba', so match() returns None.
# print(...) runs unchanged on Python 2 and 3.
print(p.match('ba'))

In [None]:
# search() scans the whole string, so 'b' is found at index 1.
re.search('b','abc')

In [None]:
# match() only tries position 0; 'abc' starts with 'a', so the result is None.
re.match('b','abc')

In [None]:
# match() succeeds here because the string starts with 'a'.
re.match('a','abc')

In [None]:
# Split on runs of non-word characters; the trailing '.' leaves an empty
# string as the last element of the result.
re.split(r'\W+', 'Words, words, words.')

In [None]:
# With a capturing group in the pattern, the separators themselves are
# included in the returned list.
re.split(r'(\W+)', 'Words, words, words.')

In [None]:
# '*' repeats the group greedily, so the match covers all five 'ab' pairs
# and span() is (0, 10).
p = re.compile('(ab)*')
p.match('ababababab').span()

In [None]:
# 'a/{,3}b': an 'a', then at most three '/' (an omitted lower bound means 0),
# then a 'b'.
p = re.compile('a/{,3}b')
# Zero to three slashes match; the last call has four slashes and prints None.
# print(...) with one argument behaves the same on Python 2 and 3.
print(p.match('ab'))
print(p.match('a/b'))
print(p.match('a//b'))
print(p.match('a///b'))
print(p.match('a////b'))

In [None]:
# \b matches at a word boundary, so 'foo' is found only as a whole word.
# print(...) with one argument behaves the same on Python 2 and 3.
print(re.search(r'\bfoo\b','bar foo baz').group())

In [None]:
# findall() returns every non-overlapping match as a list of strings.
# print(...) with one argument behaves the same on Python 2 and 3.
print(re.findall(r'[0-9]','a1b2c3d4e5'))

In [None]:
# finditer() yields one match object per digit; print the digit followed by
# its (start, end) span.  The %-format produces the same space-separated
# line as the Python 2 `print a, b` statement and also runs on Python 3.
for m in re.finditer(r'[0-9]','a1b2c3d4e5'):
    print('%s %s' % (m.group(0), m.span()))

## Notacja „raw string”

In [None]:
# Raw string: backslashes reach the regex engine unchanged, so \W and the
# backreference \1 can be written directly.
re.match(r"\W(.)\1\W", " ff ").group()

In [None]:
# The same pattern without the r prefix: every backslash has to be doubled
# so that the string literal still contains a single backslash.
re.match("\\W(.)\\1\\W", " ff ").group()

## Niezachłanne dopasowania

In [None]:
# Sample input for comparing greedy and non-greedy quantifiers.
s = '<html><head><title>Title</title>'

In [None]:
# Greedy '.*' extends as far as possible, up to the last '>' in s.
# print(...) with one argument behaves the same on Python 2 and 3.
print(re.match('<.*>', s).group())

In [None]:
# Non-greedy '.*?' stops at the first '>' that lets the pattern succeed.
# print(...) with one argument behaves the same on Python 2 and 3.
print(re.match('<.*?>', s).group())

## Grupowanie

In [None]:
# Groups are numbered by the position of their opening parenthesis:
# group 1 is '(a(b)c)', group 2 is the nested '(b)'.
p = re.compile('(a(b)c)d')
m = p.match('abcd')

In [None]:
# Group 0 is the entire match: 'abcd'.
m.group(0)

In [None]:
# Group 1 is the outer parenthesised part: 'abc'.
m.group(1)

In [None]:
# Group 2 is the nested group: 'b'.
m.group(2)

In [None]:
# Match dates formatted like DD/MM/YYYY, DD-MM-YY, ... (day first, in
# agreement with the named groups in the pattern).
import re

date = '01/12/2013'

# Named groups: two-digit day, two-digit month, then a two- or four-digit
# year; either '-' or '/' is accepted as the separator.
regex = re.compile(r'^(?P<day>\d\d)[-/](?P<month>\d\d)[-/](?P<year>\d\d(?:\d\d)?)$')

match = regex.match(date)

In [None]:
# Whole match followed by groups 1-3, separated by single spaces.
# group() with several arguments returns a tuple of strings; joining them
# reproduces the Python 2 `print a, b, ...` output and also runs on Python 3.
print(' '.join(match.group(0, 1, 2, 3)))

In [None]:
# The same groups accessed by name, separated by single spaces.
# Joining the tuple reproduces the Python 2 `print a, b, c` output and also
# runs on Python 3.
print(' '.join(match.group('day', 'month', 'year')))

In [None]:
# groups() returns all captured groups (from 1 upward) as a tuple.
# print(...) with one argument behaves the same on Python 2 and 3.
print(match.groups())

In [None]:
# groupindex maps each group name to its group number.
# print(...) with one argument runs on both Python 2 and 3.
print(regex.groupindex)

In [None]:
# \1 must repeat exactly the text captured by group 1 ('ala').
re.match(r'(ala) \1','ala ala').group(0)

## Extension Notation (?...)

### Powielenie (?P=name)

In [None]:
# (?P=word) is a backreference by name: it must repeat the text captured by
# the group named 'word'.
p = re.compile(r'(?P<word>\b\w+)\s+(?P=word)')

In [None]:
# The repeated word satisfies the named backreference; group() returns the
# whole match.
p.match('ala ala').group()

In [None]:
# A named group still has a number, so the numeric backreference \1 can be
# used in place of (?P=word).
p = re.compile(r'(?P<word>\b\w+)\s+\1')

In [None]:
# The repeated word satisfies the numeric backreference; group() returns the
# whole match.
p.match('ala ala').group()

### Nieprzechwytująca grupa (?:foo)

In [None]:
# (?:...) groups without capturing, so groups() contains only the outer
# group with the full address.
re.match(r'(\w+@\w+(?:\.\w+)+)','korzycki@agh.edu.pl').groups()

### Positive Lookahead (?=...)

In [None]:
# The lookahead requires '@domain.tld' to follow but does not consume it, so
# only the user name is matched.  The dot is escaped so that it matches a
# literal '.'; unescaped it matches any character and the lookahead would
# also accept an address without a dot, such as 'user@host'.
re.match(r'(\w+(?=@\w+(?:\.\w+)+))','korzycki@agh.edu.pl').group()

### Negative Lookahead (?!...)

In [1]:
s='plik.py'
# The negative lookahead rejects a '.' that is followed by 'bat' or 'exe' at
# the end of the string; group(1) is the text before the dot.
re.match(r'(.*)[.](?!bat$|exe$).*$',s).group(1)

NameError: name 're' is not defined

### Positive Lookbehind (?<=...)

In [None]:
s='From: korzycki@agh.edu.pl'
# (?<=From: ) asserts that 'From: ' precedes the match without including it
# in the result.
re.search(r'(?<=From: )(.*)',s).group()

### Negative Lookbehind (?<!...)

In [None]:
s='From: korzycki@agh.edu.pl'
# Take the text after ': ' unless the line starts with 'Subject: '.
# The negative lookbehind has to cover the ': ' as well: both lookbehinds
# are tested at the same position, and text ending in ': ' can never end in
# '^Subject', so a shorter lookbehind would not exclude 'Subject: ...' lines.
re.search(r'(?<!^Subject: )(?<=: )(.*)',s).group()

### Warunkowe wyrażenie (?(id/name)yes-pattern|no-pattern)

In [None]:
import re
# Optional '<' (group 1), an e-mail address (group 2) and, only if group 1
# took part in the match, a closing '>'.
# Raw string: '\w' and '\.' are not valid Python string escapes, so without
# the r prefix newer Python versions warn about the literal.
p = re.compile(r'(<)?(\w+@\w+(?:\.\w+)+)(?(1)>)')

In [None]:
# The '<' is captured as group 1, so the conditional requires the closing '>'.
p.match('<user@host.com>').group()

In [None]:
# No '<': group 1 did not take part in the match, so no '>' is required.
p.match('user@host.com').group()

In [None]:
# An opening '<' without the closing '>' fails the conditional, so this
# prints None.  print(...) with one argument behaves the same on Python 2 and 3.
print(p.match('<user@host.com'))

## Flagi

### re.VERBOSE

In [None]:
# One-line form of a header pattern: optional leading whitespace, the header
# name up to the colon, then the value; the lazy '.*?' followed by '\s*$'
# keeps trailing whitespace out of the value.
pat = re.compile(r"\s*(?P<header>[^:]+)\s*:(?P<value>.*?)\s*$")

In [None]:
# groups() returns the captures in order: (header, value).
pat.match('From: korzycki@agh.edu.pl').groups()

In [None]:
# The same header pattern laid out with re.VERBOSE: whitespace and '#'
# comments inside the pattern text are ignored by the regex compiler.
pat = re.compile(r"""
    \s*               # Skip leading whitespace
    (?P<header>[^:]+) # Header name
    \s* :             # Whitespace, and a colon
    (?P<value>.*?)    # The header's value -- *? used to
                      # lose the following trailing whitespace
    \s*$
""", re.VERBOSE)


In [None]:
# groups() returns the captures in order: (header, value).
pat.match('From: korzycki@agh.edu.pl').groups()

### re.IGNORECASE

In [None]:
# The inline flag (?i) turns on case-insensitive matching, like passing
# re.IGNORECASE to the call.
re.match(r'(?i)ala','Ala').group()

### re.MULTILINE re.DOTALL

In [None]:
s="""Ala ma
kota"""

# Without flags '.' does not match the newline, so this prints None.
# print(...) with one argument behaves the same on Python 2 and 3.
print(re.match(r'^A.*ta$',s))
# (?s) (DOTALL) lets '.' match the newline; (?m) (MULTILINE) makes ^ and $
# also match at line boundaries.  The whole two-line string matches.
print(re.match(r'(?ms)^A.*ta$',s))

## Podstawienie

In [None]:
#urlify - turn URLs into HTML links
import re

text = 'Check the web site, http://www.oreilly.com/catalog/regexppr.'

# Verbose pattern: group 1 is the whole URL, group 2 the scheme.  The lazy
# '+?' together with the lookahead keeps trailing punctuation (such as the
# final '.') out of the URL.
pattern = r'''
    \b                                    # start at word boundary
    (                                     # capture to \1
    (https?|telnet|gopher|file|wais|ftp) :# resource and colon
    [\w/#~:.?+=&%@!\-] +?                 # one or more valid chars
                                          # take little as possible
    )
    (?=                                   # lookahead
    [.:?\-] *                             # for possible punc
    (?: [^\w/#~:.?+=&%@!\-]               # invalid character
    | $ )                                 # or end of string
    )'''

# Flags are bit masks, so they are combined with '|'.
regex = re.compile(pattern, re.IGNORECASE | re.VERBOSE)
# \1 in the replacement inserts the captured URL as both href and link text.
result = regex.sub(r'<a href="\1">\1</a>', text)

# print(...) with one argument behaves the same on Python 2 and 3.
print(result)

### Podstawienie funkcją

In [None]:
import random
def repl(m):
    """Return the matched word with its interior letters shuffled.

    ``m`` is a match object with three groups: the first character, the
    interior characters and the last character.  Groups 1 and 3 keep their
    places; the characters of group 2 are randomly permuted.
    """
    head, middle, tail = m.group(1, 2, 3)
    scrambled = list(middle)
    random.shuffle(scrambled)
    return "".join([head] + scrambled + [tail])


In [None]:
text = "Professor Abdolmalek, please report your absences promptly."
# For every word of three or more characters, sub() calls repl with the
# match object and inserts the string it returns.
# Raw string: '\w' is not a valid Python string escape.
re.sub(r"(\w)(\w+)(\w)", repl, text)

In [None]:
def indent(s):
    """Return the smallest number of leading spaces over the non-blank lines of *s*.

    Lines that are empty or contain only whitespace are ignored.  Returns 0
    when *s* has no non-blank line.
    """
    # One entry per non-blank line: the run of spaces in front of its first
    # non-whitespace character.
    widths = [len(lead) for lead in re.findall(r'(?m)^ *(?=\S)', s)]
    return min(widths) if widths else 0


def flush_left(s):
    """Return *s* with the common leading indentation removed from each line."""
    # Lines with fewer than indent(s) leading spaces (blank lines) are left
    # untouched because '^ {n}' does not match them.
    return re.sub(r'(?m)^ {%d}' % indent(s), '', s)

In [None]:
s="""    
    Bardzo
       roznie
     wciety
    tekst 
"""
# Show the text as written, an empty separator line, then the same text with
# the common indentation removed.  print(...) with one argument behaves the
# same on Python 2 and 3; print('') replaces the bare Python 2 `print`.
print(s)
print('')
print(flush_left(s))