# Examples with Regular Expressions

In [2]:
# Import package
import re

In [3]:
# Sample text to experiment with: the EPA's description of the National
# Secondary Drinking Water Regulations, taken from
# https://www.epa.gov/sdwa/drinking-water-regulations-and-contaminants
# Note: the triple-quoted literal begins and ends with a newline character.
data = """
National Secondary Drinking Water Regulations (NSDWRs)
NSDWRs (or secondary standards) are non-enforceable guidelines regulating contaminants that may cause cosmetic effects (such as skin or tooth discoloration) or aesthetic effects (such as taste, odor, or color) in drinking water. EPA recommends secondary standards to water systems but does not require systems to comply with the standard. However, states may choose to adopt them as enforceable standards.
"""
print (data)


National Secondary Drinking Water Regulations (NSDWRs)
NSDWRs (or secondary standards) are non-enforceable guidelines regulating contaminants that may cause cosmetic effects (such as skin or tooth discoloration) or aesthetic effects (such as taste, odor, or color) in drinking water. EPA recommends secondary standards to water systems but does not require systems to comply with the standard. However, states may choose to adopt them as enforceable standards.



In [4]:
# Find all words.
# `\w+` matches runs of letters, digits and underscores, so a hyphenated term
# such as "non-enforceable" is returned as two separate words.
# The pattern is a raw string so the backslash reaches the regex engine
# unchanged and Python does not warn about an invalid "\w" escape sequence.
pattern = r"\w+"
m = re.findall(pattern, data)
print(m)

['National', 'Secondary', 'Drinking', 'Water', 'Regulations', 'NSDWRs', 'NSDWRs', 'or', 'secondary', 'standards', 'are', 'non', 'enforceable', 'guidelines', 'regulating', 'contaminants', 'that', 'may', 'cause', 'cosmetic', 'effects', 'such', 'as', 'skin', 'or', 'tooth', 'discoloration', 'or', 'aesthetic', 'effects', 'such', 'as', 'taste', 'odor', 'or', 'color', 'in', 'drinking', 'water', 'EPA', 'recommends', 'secondary', 'standards', 'to', 'water', 'systems', 'but', 'does', 'not', 'require', 'systems', 'to', 'comply', 'with', 'the', 'standard', 'However', 'states', 'may', 'choose', 'to', 'adopt', 'them', 'as', 'enforceable', 'standards']


In [5]:
# Find unique words: collapse the word list into a set.
# Matching is case-sensitive ('Water' and 'water' are distinct entries) and
# a set has no guaranteed display order.
print(set(m))

{'NSDWRs', 'enforceable', 'contaminants', 'choose', 'with', 'Secondary', 'effects', 'guidelines', 'odor', 'or', 'systems', 'states', 'cosmetic', 'tooth', 'in', 'does', 'not', 'taste', 'to', 'require', 'Water', 'them', 'non', 'However', 'that', 'as', 'secondary', 'skin', 'aesthetic', 'such', 'Drinking', 'drinking', 'may', 'adopt', 'standards', 'water', 'comply', 'color', 'National', 'Regulations', 'EPA', 'the', 'recommends', 'are', 'discoloration', 'regulating', 'but', 'standard', 'cause'}


In [6]:
# Find all words in brackets.
# The character class only admits ASCII letters and spaces, so a bracketed
# group containing anything else (e.g. the commas in
# "(such as taste, odor, or color)") is NOT matched by this pattern.
# The pattern is a raw string so "\(" and "\)" reach the regex engine as
# escaped literal parentheses without triggering Python's invalid-escape
# warning.
pattern = r"\([a-zA-Z ]+\)"
m = re.findall(pattern, data)
print(m)

['(NSDWRs)', '(or secondary standards)', '(such as skin or tooth discoloration)']


# Discrete Text Representation

In [7]:
# Integer-encode text over a fixed alphabet: a-z, '(', ')', '-', ',', space,
# newline and '.'
def do_integer_encoding(data, alphabet='abcdefghijklmnopqrstuvwxyz()-, \n.'):
    """Encode a string as integer codes and decode those codes back again.

    Each character of ``data`` is replaced by its position in ``alphabet``.

    Parameters
    ----------
    data : str
        Text to encode. Every character must occur in ``alphabet``; with the
        default alphabet this means upper-case letters and digits are not
        accepted (lower-case the text first).
    alphabet : str, optional
        The universe of allowed characters. Defaults to a-z, '(', ')', '-',
        ',', space, newline and '.'.

    Returns
    -------
    tuple
        ``(integer_encoded, char_decoded)`` where ``integer_encoded`` is the
        list of integer codes and ``char_decoded`` is the list of characters
        obtained by mapping those codes back (a round-trip check).

    Raises
    ------
    KeyError
        If ``data`` contains a character that is not in ``alphabet``.
    """
    # Two lookup tables: character -> index and index -> character.
    char_to_int = {c: i for i, c in enumerate(alphabet)}
    int_to_char = dict(enumerate(alphabet))

    # Encode; re-raise with a message that names the offending character
    # instead of the bare key that a plain dict lookup would report.
    try:
        integer_encoded = [char_to_int[char] for char in data]
    except KeyError as err:
        raise KeyError(
            "character %r is not in the encoding alphabet" % (err.args[0],)
        ) from None

    # Decode the integers again so the caller can verify the round trip.
    char_decoded = [int_to_char[integ] for integ in integer_encoded]

    return integer_encoded, char_decoded

In [8]:
# Try the encoder on the sample text.
# The text is lower-cased first because the encoder's alphabet contains no
# upper-case letters.
low_data = data.lower()
enc, dec = do_integer_encoding(low_data)
# print() converts each argument with str(); sep="" joins them with nothing
# in between.
print("-> data = ", low_data, "\n-> enc = ", enc, sep="")
print("-> decoded data: ", "-> dec = ", "".join(dec), sep="")

-> data = 
national secondary drinking water regulations (nsdwrs)
nsdwrs (or secondary standards) are non-enforceable guidelines regulating contaminants that may cause cosmetic effects (such as skin or tooth discoloration) or aesthetic effects (such as taste, odor, or color) in drinking water. epa recommends secondary standards to water systems but does not require systems to comply with the standard. however, states may choose to adopt them as enforceable standards.

-> enc = [31, 13, 0, 19, 8, 14, 13, 0, 11, 30, 18, 4, 2, 14, 13, 3, 0, 17, 24, 30, 3, 17, 8, 13, 10, 8, 13, 6, 30, 22, 0, 19, 4, 17, 30, 17, 4, 6, 20, 11, 0, 19, 8, 14, 13, 18, 30, 26, 13, 18, 3, 22, 17, 18, 27, 31, 13, 18, 3, 22, 17, 18, 30, 26, 14, 17, 30, 18, 4, 2, 14, 13, 3, 0, 17, 24, 30, 18, 19, 0, 13, 3, 0, 17, 3, 18, 27, 30, 0, 17, 4, 30, 13, 14, 13, 28, 4, 13, 5, 14, 17, 2, 4, 0, 1, 11, 4, 30, 6, 20, 8, 3, 4, 11, 8, 13, 4, 18, 30, 17, 4, 6, 20, 11, 0, 19, 8, 13, 6, 30, 2, 14, 13, 19, 0, 12, 8, 13, 0, 13, 19, 18, 

In [9]:
# Scikit has support for label encoding
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
from sklearn.preprocessing import LabelEncoder

# But it needs an array of words as input

In [11]:
# Build the word array by splitting the text on single spaces (a hack).
# Only the space character separates tokens, so punctuation and newlines stay
# attached to their neighbouring words (e.g. '(NSDWRs)\nNSDWRs', 'water.').
pattern = " "
m = data.split(pattern)
print(m)

['\nNational', 'Secondary', 'Drinking', 'Water', 'Regulations', '(NSDWRs)\nNSDWRs', '(or', 'secondary', 'standards)', 'are', 'non-enforceable', 'guidelines', 'regulating', 'contaminants', 'that', 'may', 'cause', 'cosmetic', 'effects', '(such', 'as', 'skin', 'or', 'tooth', 'discoloration)', 'or', 'aesthetic', 'effects', '(such', 'as', 'taste,', 'odor,', 'or', 'color)', 'in', 'drinking', 'water.', 'EPA', 'recommends', 'secondary', 'standards', 'to', 'water', 'systems', 'but', 'does', 'not', 'require', 'systems', 'to', 'comply', 'with', 'the', 'standard.', 'However,', 'states', 'may', 'choose', 'to', 'adopt', 'them', 'as', 'enforceable', 'standards.\n']


In [12]:
# Now we can ask for encoding
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(m)
print(integer_encoded)

[ 0  8  4  9  7  1  2 36 40 12 29 26 34 19 45 28 15 20 24  3 13 37 32 49
 21 32 11 24  3 13 44 31 32 17 27 23 51  5 33 36 39 48 50 43 14 22 30 35
 43 48 18 52 46 38  6 42 28 16 48 10 47 13 25 41]


In [13]:
# And decode: map the integer labels back to the tokens they stand for,
# using the same fitted label_encoder. The result is an array holding the
# original token sequence.
inverted = label_encoder.inverse_transform(integer_encoded)
print(inverted)

['\nNational' 'Secondary' 'Drinking' 'Water' 'Regulations'
 '(NSDWRs)\nNSDWRs' '(or' 'secondary' 'standards)' 'are' 'non-enforceable'
 'guidelines' 'regulating' 'contaminants' 'that' 'may' 'cause' 'cosmetic'
 'effects' '(such' 'as' 'skin' 'or' 'tooth' 'discoloration)' 'or'
 'aesthetic' 'effects' '(such' 'as' 'taste,' 'odor,' 'or' 'color)' 'in'
 'drinking' 'water.' 'EPA' 'recommends' 'secondary' 'standards' 'to'
 'water' 'systems' 'but' 'does' 'not' 'require' 'systems' 'to' 'comply'
 'with' 'the' 'standard.' 'However,' 'states' 'may' 'choose' 'to' 'adopt'
 'them' 'as' 'enforceable' 'standards.\n']
