# RE en Python
https://docs.python.org/3/library/re.html

In [1]:
#Python
import re

In [2]:
# Patterns are written as raw strings (r"..."):
# this one matches "woodchuck" or "groundhog", with either case for the
# first letter and an optional plural "s".
pattern = r"[wW]oodchucks?|[gG]roundhogs?"
pattern

'[wW]oodchucks?|[gG]roundhogs?'

In [3]:
# Find every match of the pattern in a sentence:
re.findall(pattern, "Woodchuck: the woodchucks are also known as groundhogs")

['Woodchuck', 'woodchucks', 'groundhogs']

In [4]:
# Matches can also be replaced directly:
re.sub(pattern, "MATCH","Woodchuck: the woodchucks are also known as groundhogs")

'MATCH: the MATCH are also known as MATCH'

In [None]:
# Other functions of the re module (DO NOT RUN this cell):
re.compile
re.match
re.search

<b>Precedencia de operadores en Regex:</b>
<br><img src="Precedence.png" width="300" align="left">

### Ejemplo: "the"

Queremos extraer el término "the":

In [5]:
# Sample sentence: "the" occurs as a whole word and also inside "theology" and "other".
text = "The theology and the practice of the other"

In [6]:
# First attempt: the bare lowercase literal.
re.sub(r"the","*", text)

'The *ology and * practice of * o*r'

In [7]:
# Second attempt: accept either case for the first letter.
re.sub(r"[tT]he","*", text)

'* *ology and * practice of * o*r'

In [8]:
# Third attempt: \b word boundaries on both sides restrict the match to the whole word.
re.sub(r"\b[tT]he\b","*", text)

'* theology and * practice of * other'

In [9]:
# Without the '\b' anchor: require a non-letter (or the start/end of the string) on each side.
# The neighbouring characters are part of the match, so they get replaced too.
re.sub(r"(^|[^a-zA-Z])[tT]he([^a-zA-Z]|$)","*", text)

'*theology and*practice of*other'

In [10]:
# Without the '\b' anchor, using lookbehind and lookahead assertions:
# they test the neighbouring character without including it in the match.
re.sub(r"(^|(?<=[^a-zA-Z]))[tT]he(?=[^a-zA-Z]|$)","*", text)

'* theology and * practice of * other'

<b>Límites sin usar \b (word boundaries):</b>
<br>(Es útil porque '\b' no funciona cuando el término empieza o termina con caracteres no alfanuméricos, p. ej. un precio con el signo $)

In [None]:
# Start boundary: (?:^|(?<=\s)) or (?:^|(?<!\w))
# End boundary: (?=\s|$)

### Ejercicio: Precision, Recall
<img src="https://upload.wikimedia.org/wikipedia/commons/2/26/Precisionrecall.svg" align="left">

In [11]:
def precision(TP,FP):
    """Return the precision TP / (TP + FP) as a float.

    TP is the number of true positives and FP the number of false
    positives. The numerator is multiplied by a float literal so the
    quotient is never an integer division. A zero denominator is not
    handled and raises ZeroDivisionError for numeric inputs.
    """
    predicted_positives = TP + FP
    return (1. * TP) / predicted_positives

def recall(TP,FN):
    """Return the recall TP / (TP + FN) as a float.

    TP is the number of true positives and FN the number of false
    negatives. The numerator is multiplied by a float literal so the
    quotient is never an integer division. A zero denominator is not
    handled and raises ZeroDivisionError for numeric inputs.
    """
    actual_positives = TP + FN
    return (1. * TP) / actual_positives

def accuracy(TP,FP,TN,FN):
    """Return the accuracy (TP + TN) / (TP + FP + TN + FN) as a float.

    The arguments are the four confusion-matrix counts. The sum of
    correct decisions is multiplied by a float literal so the quotient
    is never an integer division. A zero total is not handled and raises
    ZeroDivisionError for numeric inputs.
    """
    correct = TP + TN
    total = TP + FP + TN + FN
    return (1. * correct) / total

<b>¿Cuál es la precisión y el recall en...?</b>

In [12]:
# Exercise input and its ground truth: words that should be matched (n_true)
# and words that contain "the" but should not be matched (n_false).
text = "The theology and the practice of the other"
n_true = 3 #The the the
n_false = 2 #theology other

In [13]:
# (1) Bare lowercase literal.
re.sub(r"the","*", text)

'The *ology and * practice of * o*r'

In [14]:
# Confusion-matrix counts for pattern (1); the inline comments list the words counted.
true_positives = 2 #the the
false_positives = 2 #theology other
true_negatives = 0
false_negatives = 1 #The

In [16]:
# Score pattern (1) from the counts currently stored in the notebook globals.
print("Precision:",precision(true_positives, false_positives))
print("Recall:",recall(true_positives, false_negatives))
print("Accuracy:",accuracy(true_positives, false_positives, true_negatives, false_negatives))

Precision: 0.5
Recall: 0.6666666666666666
Accuracy: 0.4


In [17]:
# (2) Either case for the first letter, still without word boundaries.
re.sub(r"[tT]he","*", text)

'* *ology and * practice of * o*r'

In [18]:
# Confusion-matrix counts for pattern (2); the inline comments list the words counted.
true_positives = 3 #The the the
false_positives = 2 #theology other
true_negatives = 0
false_negatives = 0

In [19]:
# Score pattern (2) from the counts currently stored in the notebook globals.
print("Precision:",precision(true_positives, false_positives))
print("Recall:",recall(true_positives, false_negatives))
print("Accuracy:",accuracy(true_positives, false_positives, true_negatives, false_negatives))

Precision: 0.6
Recall: 1.0
Accuracy: 0.6


In [None]:
# (3) Either case for the first letter, delimited by \b word boundaries.
re.sub(r"\b[tT]he\b","*", text)

In [None]:
# Confusion-matrix counts for the word-boundary pattern; the inline comments list the words counted.
true_positives = 3 #The the the
false_positives = 0 
true_negatives = 2 #theology other
false_negatives = 0

In [None]:
# Score the word-boundary pattern from the counts currently stored in the notebook globals.
# print is called as a function, like the other scoring cells: the Python 2
# print statement is a SyntaxError on a Python 3 kernel.
print("Precision:",precision(true_positives, false_positives))
print("Recall:",recall(true_positives, false_negatives))
print("Accuracy:",accuracy(true_positives, false_positives, true_negatives, false_negatives))

### Ejercicio: Búsqueda con RE
Quiero comprar una computadora con las siguientes especificaciones...

In [20]:
# Free-text shopping query; it mentions a clock speed, a storage size and a price.
query = "cualquier computador con procesador de 6 GHz y 500 GB de almacenamiento por menos de $1000.01"

In [21]:
# Price: a dollar sign followed by one or more digits (the decimals are not captured).
re.findall(r"\$[0-9]+",query)

['$1000']

In [22]:
# Price with optional decimals. The dot is escaped so that only a literal "."
# can separate the cents; an unescaped "." would accept any character there.
# Both parts are capturing groups, so findall returns (dollars, cents) tuples.
re.findall(r"(\$[0-9]+)(\.[0-9][0-9])?",query)

[('$1000', '.01')]

In [None]:
# Limiting the term:
# NOTE(review): the pattern is empty — presumably a placeholder to be filled in; confirm.
re.findall(r"",query)
# Source: https://stackoverflow.com/questions/18425386/re-findall-not-returning-full-match

In [None]:
# Processor clock speed?
# NOTE: the pattern has one capturing group, so findall returns only the text of
# that group (the unit), not the three characters matched by "..." before it.
re.findall(r"...(GHz|[gG]igahertz)", query)

In [None]:
# Storage?

### Aplicación: ELIZA

<img src="ELIZA.png" width="600" align="left">
<img src="ELIZA_answers.png" width="600" align="left">

In [23]:
# Sample user utterances for the ELIZA exercise.
user1 = "Men are all alike"
user2 = "They're always bugging as about something or other"
user3 = "Well, my boyfriend made me come here"
user4 = "He says I'm depressed much of the time"

<b>Responder a los usuarios usando RE ("search" y "group"):</b>

In [25]:
# Reply to user1: if "all" occurs anywhere in the text, answer with a canned question.
print(user1)
m1 = re.search(r"all",user1)
if m1:
    print("IN WHAT WAY?")
# Show the match object returned by re.search (None when nothing matched).
print(m1)

Men are all alike
IN WHAT WAY?
<_sre.SRE_Match object; span=(8, 11), match='all'>


In [26]:
# Reply to user4: group(2) holds the feeling word, which is echoed back in upper case.
m4 = re.search(r"(I'm|I am) (depressed|sad)",user4)
if m4:
    print("WHY DO YOU THINK YOU ARE " + m4.group(2).upper() + '?')

WHY DO YOU THINK YOU ARE DEPRESSED?


In [27]:
def ELIZA(user_text):
    """Print an ELIZA-style reply for every rule that matches user_text.

    Rules are tried in a fixed order and are not exclusive: each rule whose
    pattern is found prints its reply on its own line. Nothing is printed
    when no rule matches. The function returns None.
    """
    # Ordered (pattern, reply builder) pairs; each builder receives the match object.
    rules = (
        (r"\ball\b", lambda hit: "IN WHAT WAY?"),
        (r"\balways\b", lambda hit: "CAN YOU THINK OF A SPECIFIC EXAMPLE"),
        (
            r"(I'm|I am) (depressed|sad)",
            # Echo the captured feeling word back in upper case.
            lambda hit: "WHY DO YOU THINK YOU ARE " + hit.group(2).upper() + '?',
        ),
    )
    for trigger, build_reply in rules:
        hit = re.search(trigger, user_text)
        if hit is not None:
            print(build_reply(hit))

In [31]:
# Exercises the "I'm ... sad" rule.
ELIZA("I'm sad")

WHY DO YOU THINK YOU ARE SAD?


In [29]:
# Same text as user4: exercises the "I'm ... depressed" rule.
ELIZA("He says I'm depressed much of the time")

WHY DO YOU THINK YOU ARE DEPRESSED?


In [30]:
# Same text as user2: exercises the "always" rule.
ELIZA("They're always bugging as about something or other")

CAN YOU THINK OF A SPECIFIC EXAMPLE


In [46]:
# Short sample with repeated words for the token-counting cells.
text = "This is a test text text this"

In [52]:
# Lower-case the text and split it on runs of whitespace. split() with no
# argument never yields empty tokens for repeated spaces and also splits on
# tabs and newlines, unlike split(' ').
tokens = text.lower().split()

In [53]:
# Total number of tokens, counting repetitions.
num_tokens = len(tokens)

In [54]:
# Vocabulary size: the number of distinct tokens.
V = len(set(tokens))

In [55]:
from collections import Counter

In [56]:
# Frequency of each distinct token.
Counter(tokens)

Counter({'a': 1, 'is': 1, 'test': 1, 'text': 2, 'this': 2})