# Natural Language Processing - Regular expressions

In the following we explore to a small extent the notion of regular expressions in Natural language processing (NLP). 

NLP deals with the interaction between natural human languages and computers. It is an important aspect of machine learning.

Regular expressions are a method of precisely describing patterns of text. A regular expression corresponds to a specific text string with the purpose of constructing search patterns on selections of text.

In [207]:
#import necessary packages
import re

## 1) Warm-up: Extracting simple expressions from strings

In [13]:
#check if integers occur at all
#define string
string = "For 5 dumplings I have paid 6.50$."

string_to_be_checked_for = "[1,5,6,9]" #alternatively for intervals "[0-9]", "[0-3,6-9]" etc.
p = re.compile(string_to_be_checked_for)

#to check if the string to be checked for is part of the above string
print(p.findall(string))

['5', '5']


In [9]:
#to see that a list of strings is reported
type(p.findall(string))

list

In [31]:
#looking for groups of numbers with spaces in strings add +

strings = ["For 1 dumpling I have paid 6.50$.", "For 5 dumplings I have paid 5.00€.", "For 15 dumplings I have paid 12.00€."]

string_to_be_checked_for = " [0-9]+ "

p = re.compile(string_to_be_checked_for)

for string in strings:
    #to check if the string to be checked for is part of the above list of strings
    print(p.findall(string))

[' 1 ']
[' 5 ']
[' 15 ']


In [41]:
#looking for restricted groups of numbers 

strings = ["For 1 dumpling I have paid 6.50999999$.",
           "For 5 dumplings I have paid 5.00€.",
           "For 15 dumplings I have paid 12.00€."]

string_to_be_checked_for = "[0-9]+[\.,][0-9]{2}[€$]"

p = re.compile(string_to_be_checked_for)

for string in strings:
    #to check if the string to be checked for is part of the above list of strings
    print(p.findall(string))

[]
['5.00€']
['12.00€']


In [33]:
#check for (more complex) expressions with numbers (like prices)
#define string
strings = ["For 5 dumplings I have paid 6.50$.", "For 5 dumplings I have paid 5,00€.", "For 15 dumplings I have paid 12.00€."]

string_to_be_checked_for = "[0-9]+[\.,][0-9]+[€$]"

p = re.compile(string_to_be_checked_for)

pricelist = []

for string in strings:
    #to check if the string to be checked for is part of the above list of strings
    print(p.findall(string))
    #append to price list
    pricelist.append(p.findall(string)[0])
    
print("The price list is: " + str(pricelist))

['6.50$']
['5,00€']
['12.00€']
The price list is: ['6.50$', '5,00€', '12.00€']


In [36]:
#if a symbol is optional (like currency symbol) add ?

strings = ["For 5 dumplings I have paid 6.50.", 
           "For 5 dumplings I have paid 5,00€.", 
           "For 15 dumplings I have paid 12.00€.",
           "For 13 dumplings I have paid 12.00$.",
          ]

string_to_be_checked_for = "[0-9]+[\.,][0-9]+[€$]?"

p = re.compile(string_to_be_checked_for)

pricelist = []

for string in strings:
    #to check if the string to be checked for is part of the above list of strings
    print(p.findall(string))
    pricelist.append(p.findall(string)[0])
    
print("The price list is: " + str(pricelist))

['6.50']
['5,00€']
['12.00€']
['12.00$']
The price list is: ['6.50', '5,00€', '12.00€', '12.00$']


## 2) Extracting e-mail addresses

In [60]:
#first/crude attempt
string_to_be_checked_for = "[a-zA-Z0-9]+[@][a-zA-Z0-9]+[\.]{1}[a-zA-Z]+"

p = re.compile(string_to_be_checked_for)

sentences = ["My e-email address is hello@world.com",
             "Please, don't write e-mails to info@world.com",
             "On Saturday there will be an event at the Hotel Lala. If you would like to attend, send an email to event@lala.com"
             "Hello hello."
            ]

emaillist = []
for string in sentences:
    #to check if the string to be checked for is part of the above list of strings
    print(p.findall(string))
    emaillist.append(p.findall(string)[0])
    
print("The e-mail list is: " + str(emaillist))

['hello@world.com']
['info@world.com']
['event@lala.comHello']
The e-mail list is: ['hello@world.com', 'info@world.com', 'event@lala.comHello']


## 3) Extracting dates


In [171]:
date_to_be_checked_for1 = "[0-9]{4}[\/][0-9]{2}[\/][0-9]{2}"
date_to_be_checked_for2 = "[0-9]{2}[\.][0-9]{2}[\.][0-9]{4}"
date_to_be_checked_for3 = "[0-9]{2}[\/][0-9]{4}"
date_to_be_checked_for4 = "[0-9]{2}[\/][0-9]{2}[\/][0-9]{4}"

p1 = re.compile(date_to_be_checked_for1)
p2 = re.compile(date_to_be_checked_for2)
p3 = re.compile(date_to_be_checked_for3)
p4 = re.compile(date_to_be_checked_for4)

dateformats = [p1, p2, p3, p4]

sentences = ["The appointment is on 2018/14/05.",
             "The other apppointment should be scheduled for 07/2018",
             "On 05.06.2011 there was a major fire."
             "You've just won the jackpot of 10.000.000$."
             "There is a cool event on 04.06.2019."
             #"Tomorrow, 25/02/1999, there will be a celebration."
            ]

datelist1 = []
for string in sentences:
     for p in dateformats:
        finding = p.findall(string)
        if finding:
            #print(p.findall(string))
            datelist1.append(p.findall(string))
            
datelist = []
for sublist in datelist1:
    for val in sublist:
        datelist.append(val)

print("The list of dates is: " + str(datelist))

The list of dates is: ['2018/14/05', '07/2018', '05.06.2011', '04.06.2019']


In [206]:
#if the way to write dates is truely finite and we would like to uniformize it

from datetime import datetime

date_to_be_checked_for1 = "[0-9]{4}[\/][0-9]{2}[\/][0-9]{2}"
date_to_be_checked_for2 = "[0-9]{2}[\.][0-9]{2}[\.][0-9]{4}"
date_to_be_checked_for3 = "[0-9]{2}[\/][0-9]{4}"
date_to_be_checked_for4 = "[0-9]{2}[\/][0-9]{2}[\/][0-9]{4}"

p1 = re.compile(date_to_be_checked_for1)
p2 = re.compile(date_to_be_checked_for2)
p3 = re.compile(date_to_be_checked_for3)
p4 = re.compile(date_to_be_checked_for4)

dates = [] 

for string in sentences:
    
    finding1 = p1.findall(string)
    finding2 = p2.findall(string)
    finding3 = p3.findall(string)
    finding4 = p4.findall(string)
   
    if finding1:
        for i in range(0,len(finding1)):
            dates.append(datetime.strptime((finding1[i]), "%Y/%d/%m"))
    
    elif finding2:
        for i in range(0,len(finding2)):
            dates.append(datetime.strptime((finding2[i]), "%d.%m.%Y"))
        
    elif finding3:
        for i in range(0,len(finding3)):
            dates.append(datetime.strptime((finding3[i]), "%m/%Y"))
        
    elif finding4:
        for i in range(0,len(finding4)):
            dates.append(datetime.strptime((finding4[i]), "%d/%m/%Y"))

#to convert into European date formate 
for date in dates:
    print(date.strftime("%d.%m.%Y"))

14.05.2018
01.07.2018
05.06.2011
04.06.2019
