## 1. REGULAR EXPRESSIONS: the re module
This module is used for pattern searching with regular expressions.
### 1.1 re.match()
re.match(w1, w2) returns a match object if the pattern "w1" matches at the beginning of w2, and None otherwise.

In [1]:
import random
import string
import re
import numpy as np

In [2]:
# this method returns a random string with a given length
def random_string(str_len=10, all_lower=False, all_upper=False, characters=None):
    """Build a random string by drawing ``str_len`` items from ``characters``.

    Args:
        str_len: number of items to draw (with replacement).
        all_lower: when true, the result is lower-cased before being returned.
        all_upper: when true, the result is upper-cased before being returned.
        characters: sequence to draw from; defaults to ``string.ascii_letters``.
            Each drawn item is converted with ``str`` before joining.

    Returns:
        The generated string.

    Raises:
        AssertionError: if both ``all_lower`` and ``all_upper`` are requested.
    """
    assert not (all_lower and all_upper)

    alphabet = string.ascii_letters if characters is None else characters

    picked = []
    for _ in range(str_len):
        picked.append(str(random.choice(alphabet)))
    text = "".join(picked)

    if all_lower:
        return text.lower()
    return text.upper() if all_upper else text


In [3]:
# re.match returns a match object here because "hedgehog" starts with "hedge",
# so the `is None` check below prints False.
hedge_match = re.match('hedge', 'hedgehog')
print(hedge_match is None)

False


### 1.2 Special characters
#### 1.2.1 Dot character
This character matches any single character except the newline character "\n" (unless the re.DOTALL flag is set).
#### 1.2.2 The ? character
When written after a character, it means that character is optional: it may appear once or not at all.

In [4]:
# Each "." stands for any character and each "?" makes the preceding "."
# optional, so the pattern accepts "ayhem " followed by up to two characters.
regexp = "ayhem .?.?"

candidates = ["ayhem " + random_string(2) for _ in range(50)]
evaluation = [re.match(regexp, candidate) is not None for candidate in candidates]
print(np.array(evaluation).all())  # this evaluates to True

True


[reference](https://hyperskill.org/learn/step/9468)

## 1.3 Escaping Characters
### 1.3.1 Backslashes
Sometimes we want a special character in the regular expression to keep its literal meaning. We can escape it with a backslash ("\") for this purpose.

In [5]:
# Raw strings pass the backslash to the regex engine unchanged and avoid
# Python's "invalid escape sequence" warning for "\." and "\?".
espaced_point = r"\."
print(re.match(espaced_point, ".") is not None)  # True: a literal dot matches
print(re.match(espaced_point, "a") is not None)  # False: "a" is not a dot

escaped_question_mark = r"\?"
print(re.match(escaped_question_mark, "") is not None)  # False: a literal "?" is required
print(re.match(escaped_question_mark, "?") is not None)  # True

True
False
False
True


Since numerous backslashes can quickly become confusing, Python provides two additional powerful features:
1. the r prefix
2. re.escape
### 1.3.2 r prefix
r'string' asks Python to treat each "\" literally, so the backslashes reach the regex engine unchanged and only act as regex escapes:
* r'\t' is the two characters "\" and "t", not the tab character
* r'\.' denotes a literal dot and is equivalent to '\\.'
### 1.3.3 re.escape
re.escape(string) returns the string with every special character escaped, so that the whole string is matched literally.

In [6]:
# re.escape turns the "." into a literal dot, so "shit" no longer matches.
a = re.escape("sh.t")
escaped_match = re.match(a, "shit")
print(escaped_match is not None)

False


[reference](https://hyperskill.org/learn/step/9754) 

## 1.4 Regexp Sets and Ranges 
Sets are written between square brackets, \[symbols\], and match any single character that belongs to the set.

In [7]:
# One character from each set, in order: [123], then [456], then [789].
template = r'[123][456][789]'
for candidate in ("147", "159", "169", "139"):
    print(re.match(template, candidate) is not None)

True
True
True
False


### 1.4.1 Escaping in sets
Inside a set, most special characters keep their literal meaning, with the exception of:
* \: backslash
* ]
### 1.4.2 Ranges
We can specify a range of acceptable values inside a set:

In [8]:
# A single digit at the start of the string is enough for r1 to match.
r1 = "[0-9]"
digit_count = random.randint(1, 10)
random_num = "".join(random.sample(string.digits, digit_count))
print(random_num)
print(re.match(r1, random_num) is not None)

# r2 needs a digit immediately followed by a lowercase letter.
r2 = "[0-9][a-z]"
for candidate in ("0t", "000000z"):
    print(re.match(r2, candidate) is not None)

9014672
True
True
False


### 1.4.3 Exclusion of sets
The same syntax can be used to exclude a range of values: just add the symbol "^" right after the opening bracket, e.g. \[^0-9\].

In [9]:
# Sample paragraph used by the search / findall / split calls below.
text = (
    "Ayhem is a great student. Ayhem is hungry for success and unexpectedly pussy. Ayhem is suffering from both poverty and drought "
    "so don't be like Ayhem unless you 're really mentally tought enough!!"
)
# The results of the next four calls are neither stored nor printed.
re.search("[aA]yhem", text)  # first occurrence of "ayhem" or "Ayhem"
re.findall("[a-t]{5}", text)  # every run of five characters in the range a-t
re.split("[a-t]{5}", text)  # the text cut at each of those runs
re.findall("[A-Z]{1}[a-z]+", text)  # an uppercase letter followed by lowercase letters

grades = list("ABCD")
print(grades)
grade_count = random.randint(2, 15)
random_grades = random_string(str_len=grade_count, characters=grades)
# try to detect a decreasing trend in the grades
print(random_grades)
# Each match is a (possibly empty) run of A's, then B's, then C's, then D's.
re.findall("[A]*[B]*[C]*[D]*", random_grades)

['A', 'B', 'C', 'D']
DBDCACD


['D', 'BD', 'C', 'ACD', '']

In [10]:
# Five arrays built in different ways, to compare the shapes NumPy gives them.
a1 = np.random.rand(4)  # 1-D array with 4 random samples
a2 = np.random.rand(4, 1)  # column: 4 rows, 1 column
a3 = np.array([[1, 2, 3, 4]])  # nested list gives a single row
a4 = np.arange(1, 4)  # 1, 2, 3 (the stop value is excluded)
a5 = np.linspace(1, 4, num=4)  # 4 evenly spaced values, endpoint included
a = [a1, a2, a3, a4, a5]
for ax in a:
    print(np.shape(ax))

(4,)
(4, 1)
(1, 4)
(3,)
(4,)


In [11]:
# 6x6 array of random samples drawn by np.random.rand.
grid_shape = (6, 6)
r = np.random.rand(*grid_shape)
print(r)

[[0.48574089 0.10345161 0.02662984 0.72234754 0.63411376 0.62489732]
 [0.98237043 0.08534778 0.05123036 0.70734413 0.83150791 0.05454003]
 [0.0958706  0.26541723 0.2202003  0.99210816 0.46997993 0.95357357]
 [0.62524305 0.0431272  0.50776207 0.3745882  0.98848912 0.44540114]
 [0.29521422 0.13368912 0.51282807 0.36253909 0.95275568 0.30383428]
 [0.91617947 0.27822312 0.77167756 0.43717709 0.11708864 0.50213237]]


In [12]:
# Slicing both axes selects the 2x2 sub-array made of rows 2-3 and columns 2-3.
sub_block = r[2:4, 2:4]
print(sub_block, "\n")
# Indexing with two lists pairs the indices up: it picks r[2, 2] and r[3, 3].
rows, cols = [2, 3], [2, 3]
print(r[rows, cols])

[[0.2202003  0.99210816]
 [0.50776207 0.3745882 ]] 

[0.2202003 0.3745882]


In [13]:
s = 'ACAABAACAAAB'
# A{1,2} is greedy: it takes two consecutive "A"s when available, otherwise one.
one_or_two_a = re.compile('A{1,2}')
result = one_or_two_a.findall(s)
len(result)

5

In [14]:
# Read the whole grades file; the matching below works on the text in memory.
with open("utility_files/grades.txt") as file:
    grades = file.read()

# Group 1 captures two space-separated words; group 2 requires the literal ": B"
# right after them, so only those entries are printed.
template = "([A-Za-z]+ [A-Za-z]+)(: B)"
for item in re.finditer(template, grades):
    print(item.group(1))

Bell Kassulke
Simon Loidl
Elias Jovanovic
Hakim Botros
Emilie Lorentsen
Jake Wood
Fatemeh Akhtar
Kim Weston
Yasmin Dar
Viswamitra Upandhye
Killian Kaufman
Elwood Page
Elodie Booker
Adnan Chen
Hank Spinka
Hannah Bayer


In [15]:
# Parse the access-log file into a list of dicts (host, user_name, time, request).
with open("utility_files/logdata.txt", "r") as file:
    content = file.read()

# Raw strings hand the backslashes to the regex engine unchanged and avoid
# Python's "invalid escape sequence" warnings for \d, \., \w, \[ and \/.
template_host = r"(\d+\.){3}\d+"
print(re.match(template_host, "12.3.5.345") is not None)
template_space = "( - )"
template_user = r"[\w-]+"
template_date = r" \[\d{2}\/[A-Za-z]{3}\/\d{4}:\d{2}:\d{2}:\d{2} -\d{4}\] "
template_request = '"[A-Z]+ .+"'

# Wrap each piece in its own capturing group. The host pattern already holds an
# inner group, so the numbering is: 1 host, 2 last repeated "digits." part,
# 3 separator, 4 user, 5 timestamp (with surrounding " [" / "] "), 6 request.
final_template = (
    "(" + template_host + ")"
    + template_space
    + "(" + template_user + ")"
    + "(" + template_date + ")"
    + "(" + template_request + ")"
)

# Show every group of the first match on one sample line.
line = '231.220.8.214 - - [21/Jun/2019:15:45:52 -0700] "HEAD /systems/sexy HTTP/1.1" 201 2578'
res = list(re.finditer(final_template, line))
if res:
    first_match = res[0]
    # Pattern.groups is the number of capturing groups; group(0) is the whole match.
    for i in range(first_match.re.groups + 1):
        print("group(" + str(i) + ")" + str(first_match.group(i)))

# [2:-2] strips the " [" / "] " around the timestamp, [1:-1] the request quotes.
result = [
    {
        "host": item.group(1),
        "user_name": item.group(4),
        "time": item.group(5)[2:-2],
        "request": item.group(6)[1:-1],
    }
    for item in re.finditer(final_template, content)
]
print(result[0:20])

True
group(0)231.220.8.214 - - [21/Jun/2019:15:45:52 -0700] "HEAD /systems/sexy HTTP/1.1"
group(1)231.220.8.214
group(2)8.
group(3) - 
group(4)-
group(5) [21/Jun/2019:15:45:52 -0700] 
group(6)"HEAD /systems/sexy HTTP/1.1"
[{'host': '146.204.224.152', 'user_name': 'feest6811', 'time': '21/Jun/2019:15:45:24 -0700', 'request': 'POST /incentivize HTTP/1.1'}, {'host': '197.109.77.178', 'user_name': 'kertzmann3129', 'time': '21/Jun/2019:15:45:25 -0700', 'request': 'DELETE /virtual/solutions/target/web+services HTTP/2.0'}, {'host': '156.127.178.177', 'user_name': 'okuneva5222', 'time': '21/Jun/2019:15:45:27 -0700', 'request': 'DELETE /interactive/transparent/niches/revolutionize HTTP/1.1'}, {'host': '100.32.205.59', 'user_name': 'ortiz8891', 'time': '21/Jun/2019:15:45:28 -0700', 'request': 'PATCH /architectures HTTP/1.0'}, {'host': '168.95.156.240', 'user_name': 'stark2413', 'time': '21/Jun/2019:15:45:31 -0700', 'request': 'GET /engage HTTP/2.0'}, {'host': '71.172.239.195', 'user_name': 'dool