# Extract IPs, dates, and email address with regular expressions
* * *
* <font color="red">[0-9]</font> Matches a single digit
* <font color="red">[a-z0-9]</font> Matches a single character that must be a lower case letter or a digit.
* <font color="red">[A-Za-z]</font> Matches a single character that must be an upper- or lower-case letter.
* <font color="red">\d</font> Matches any decimal digit; equivalent to the set [0-9].
* <font color="red">\D</font> Matches characters that are not digits, which is equivalent to [^0-9] or [^\d].
* <font color="red">\w</font> Matches any word character (a letter, a digit or an underscore), which for ASCII text is equivalent to [a-zA-Z0-9_].
* <font color="red">\W</font> Matches any non-word character, which for ASCII text is equivalent to [^a-zA-Z0-9_] or [^\w].
* <font color="red">\s</font> Matches any whitespace character, which is equivalent to [ \t\n\r\f\v], where \t indicates tabs, \n line feeds, \r carriage returns, \f form feeds and \v vertical tabs.
* <font color="red">\S</font> Matches any non-whitespace character, which is equivalent to [^ \t\n\r\f\v].
* <font color="red">^</font> Matches the start of the line.
* <font color="red">$</font> Matches the end of the line.
* <font color="red">.</font> Matches any character (a wildcard).
* <font color="red">*</font> Matches when the preceding character occurs zero or more times
* <font color="red">?</font> Matches when the preceding character occurs zero or one times
* <font color="red">+</font> Matches when the preceding character occurs one or more times

More information can be found here :
https://docs.python.org/2/library/re.html
* * *

In [1]:
import re
import pandas as pd

In [2]:
# Character set: [ae] matches exactly one character, either 'a' or 'e',
# so only the spellings 'gray' and 'grey' are found.
kata = 'gray grey grAy graay graey greay'
re.compile(r'gr[ae]y').findall(kata)

['gray', 'grey']

In [3]:
# Character ranges: find seven consecutive characters where
#   1st is an uppercase letter or a digit,
#   2nd and 3rd are uppercase letters,
#   4th is a whitespace character,
#   5th is a digit,
#   6th and 7th are each a digit or an uppercase letter.
kata = 'XRA 000, 1AA 1AA, ABXQ KKK, A22 XARA'
re.compile(r'[A-Z0-9][A-Z][A-Z]\s[0-9][A-Z0-9][A-Z0-9]').findall(kata)

['XRA 000', '1AA 1AA']

In [8]:
# Negated character class: [^b] matches any single character except 'b',
# so every "<char>og" is found unless that leading character is 'b'.
kata = 'hog dog bog fog rog #og'
re.compile(r'[^b]og').findall(kata)

['hog', 'dog', 'fog', 'rog', '#og']

In [9]:
# Metacharacter \d matches one decimal digit (0-9); four of them in a row
# match a run of four digits.
re.compile(r'\d\d\d\d').findall('1998 abcd 1234 efgh')

['1998', '1234']

In [14]:
# \w matches one word character: a letter, a digit or an underscore (_).
# '#aa' is skipped because '#' is not a word character and only two remain.
re.compile(r'\w\w\w').findall('abc a_b AA1 121 #aa')

['abc', 'a_b', 'AA1', '121']

In [15]:
# \s matches one whitespace character (here, the space between 'ab' and 'c').
re.compile(r'\w\w\s\w').findall('ab c a bc')

['ab c']

In [17]:
# Repetition metacharacters.
# '*' repeats the single element before the star zero or more times, so
# 'ao' followed by any number of extra 'o' and then 'ps' matches.
re.compile(r'aoo*ps').findall('aps aops aoops aooops aoooops aoooooops')

['aops', 'aoops', 'aooops', 'aoooops', 'aoooooops']

In [18]:
# '+' repeats the single element before the plus one or more times, so at
# least two 'o' characters are required between 'a' and 'ps'.
re.compile(r'aoo+ps').findall('aps aops aoops aooops aoooops aoooooops')

['aoops', 'aooops', 'aoooops', 'aoooooops']

In [19]:
# '?' makes the single element before the question mark optional (zero or
# one occurrence), so only one or two 'o' characters are accepted.
re.compile(r'aoo?ps').findall('aps aops aoops aooops aoooops aoooooops')

['aops', 'aoops']

In [33]:
# Quantified repetition is written {m,n}.
# \d{2} matches exactly two digits. \b matches the empty string, but only at
# the beginning or end of a word, so only a standalone two-digit token is found.
re.compile(r'(\b\d{2}\b)').findall('1 12 123 1234 12345 123456 1234567')

['12']

In [23]:
# \d{2,4} matches a run of two to four digits. There are no \b anchors here,
# so longer runs are split into pieces (e.g. '123456' gives '1234' and '56').
re.compile(r'\d{2,4}').findall('1 12 123 1234 12345 123456 1234567')

['12', '123', '1234', '1234', '1234', '56', '1234', '567']

In [24]:
# \d{2,} matches a run of at least two digits, with no upper bound.
re.compile(r'\d{2,}').findall('1 12 123 1234 12345 123456 1234567')

['12', '123', '1234', '12345', '123456', '1234567']

In [39]:
# A whole word made of two to four digits, an underscore, then one or two
# digits. '_' is itself a word character, so \b never falls next to it and
# tokens such as '201809_39' or '8_9000' cannot match in part.
re.compile(r'\b\d{2,4}_\d{1,2}\b').findall('2018_09 18_8 201809_39 8_9000')

['2018_09', '18_8']

In [41]:
# Grouping is written with parentheses ().
# Without a group, '+' applies only to the 'c' right before it, so this
# finds whole words of the form 'ab' followed by one or more 'c'.
re.compile(r'\babc+\b').findall('abcc abccc abcabc abcabcabc')


['abcc', 'abccc']

In [47]:
# (?:abc)+ repeats the whole group 'abc' one or more times; the (?:...) form
# is non-capturing, so findall returns the full match rather than the group.
re.compile(r'\b(?:abc)+\b').findall('abcc abccc abcabc abcabcabc')

['abcabc', 'abcabcabc']

In [49]:
# Alternation ("or") is written with '|': the group matches either 'orange'
# or 'apple', and must be followed by ' juice'.
re.compile(r'(?:orange|apple) juice').findall('orange juice  apple juice  watermelon juice')

['orange juice', 'apple juice']