In [6]:
import re
import numpy as np

In [7]:
string = 'bat, lat, mat, bet, let, met, bit, lit, mit, bot, lot, mot'
re.findall('b[ao]t', string)

['bat', 'bot']

### ***L2 distance:***

# ***$L_{2} = \sqrt{\sum^n_{i = 0}{(a_{i} - b_{i})^{2}}}$***

In [22]:
def l2_dist(array_one: np.array, array_two: np.array) -> np.float64:
    if len(array_one) != len(array_two):
        raise IndexError("Incompatible array shapes: Both arrays must be of the same dimension!\n\
Shape of the first array is {} while that of the second is {}!".format(array_one.shape, array_two.shape))
        return None
    else:
        return np.sqrt(np.square(array_one - array_two).sum())

In [23]:
l2_dist(np.arange(0, 100), np.arange(1100, 12540))

IndexError: Incompatible array shapes: Both arrays must be of the same dimension!
Shape of the first array is (100,) while that of the second is (11440,)!

In [25]:
array = np.zeros([2, 10])
array

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [27]:
array.reshape([5 , 4])

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [28]:
np.reshape(array, (10 * 2))

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [30]:
array.T

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])

In [31]:
np.random.seed(2023 - 1 - 14)

In [33]:
a = np.random.rand(20, 20)
b = np.random.rand(20, 20)

In [35]:
l2_dist(a, b)

8.413568707395731

In [36]:
l2_dist(np.reshape(a, 400), np.reshape(b, 400))

8.413568707395731

In [37]:
l2_dist(a.T, b.T)

8.413568707395731

In [38]:
l2_dist(np.reshape(a, (20 * 20)), np.reshape(b, (20 * 20, 1)))

159.09332557698798

In [40]:
np.reshape(a, (400, 1));

In [41]:
a1 = np.random.rand(4)
a2 = np.random.rand(4, 1)
a3 = np.array([[1, 2, 3, 4]])
a4 = np.arange(1, 4, 1)
a5 = np.linspace(1 ,4, 4)

In [43]:
a1

array([0.37216463, 0.60940325, 0.94514346, 0.93314268])

In [44]:
a2

array([[0.23628099],
       [0.58587651],
       [0.42531531],
       [0.8541892 ]])

In [45]:
a1.shape 

(4,)

In [46]:
a2.shape

(4, 1)

In [47]:
a5

array([1., 2., 3., 4.])

In [48]:
old = np.array([[1, 1, 1], [1, 1, 1]])
new = old
new[0, :2] = 0

print(old)

[[0 0 1]
 [1 1 1]]


In [51]:
re.findall("^AC", "ACBBAAACB")

['AC']

In [52]:
s = 'ACAABAACAAAB'
re.findall('A{1,2}', s)

['A', 'AA', 'AA', 'AA', 'A']

In [53]:
text = "Office of Research Administration: (734) 647-6333 | 4325 North Quad\
Office of Budget and Financial Administration: (734) 647-8044 | 309 Maynard, Suite 205\
Health Informatics Program: (734) 763-2285 | 333 Maynard, Suite 500\
Office of the Dean: (734) 647-3576 | 4322 North Quad\
UMSI Engagement Center: (734) 763-1251 | 777 North University\
Faculty Adminstrative Support Staff: (734) 764-9376 | 4322 North Quad"

In [54]:
re.findall("[(][0-9]{3}[)]\s[0-9]{3}-[0-9]{4}", text)

['(734) 647-6333',
 '(734) 647-8044',
 '(734) 763-2285',
 '(734) 647-3576',
 '(734) 763-1251',
 '(734) 764-9376']

In [56]:
re.findall("[(]\d{3}[)]\s\d{3}-\d{4}", text)

['(734) 647-6333',
 '(734) 647-8044',
 '(734) 763-2285',
 '(734) 647-3576',
 '(734) 763-1251',
 '(734) 764-9376']

In [67]:
text = "I refer to https://google.com and I never refer https://www.baidu.com if I have to search anything"

In [68]:
re.findall("(?<=https:\/\/)([A-Za-z0-9]*)", text)

['google', 'www']

In [69]:
re.findall("(?<=https:\/\/)([A-Za-z0-9.]*)", text)

['google.com', 'www.baidu.com']

In [70]:
text=r'''Everyone has the following fundamental freedoms:
    (a) freedom of conscience and religion;
    (b) freedom of thought, belief, opinion and expression, including freedom of the press and other media of communication;
    (c) freedom of peaceful assembly; and
    (d) freedom of association.'''

In [72]:
re.findall("[a-d]", text);

In [73]:
re.findall("[(][a-d][)]", text)

['(a)', '(b)', '(c)', '(d)']

In [74]:
re.findall("\(.\)", text)

['(a)', '(b)', '(c)', '(d)']

In [75]:
def names():
    simple_string = """Amy is 5 years old, and her sister Mary is 2 years old. 
    Ruth and Peter, their parents, have 3 kids."""

    # YOUR CODE HERE
    return [name for name in simple_string.replace('"""',"").split(" ") if re.match("$[A-Z]\w+", name)]

names()

[]

In [76]:
simple_string = """Amy is 5 years old, and her sister Mary is 2 years old. Ruth and Peter, their parents, have 3 kids."""

In [78]:
re.findall("[A-Z]\w+[\s{1}\,]", simple_string)

['Amy ', 'Mary ', 'Ruth ', 'Peter,']

In [82]:
[word for word in simple_string.split(" ") if re.match("^[A-Z]\w+", word)]

['Amy', 'Mary', 'Ruth', 'Peter,']

In [83]:
with open(r"D:\Introduction-to-Data-Science-in-Python\assignment1\assets\grades.txt", "r") as file:
    grades = file.read()

In [86]:
grades = grades.split("\n")

In [88]:
grades[:3]

['Ronald Mayr: A', 'Bell Kassulke: B', 'Jacqueline Rupp: A ']

In [100]:
# students who received a B
[student for student in grades if re.match("^\w+\s\w+:\sB", student)]

['Bell Kassulke: B',
 'Simon Loidl: B ',
 'Elias Jovanovic: B ',
 'Hakim Botros: B',
 'Emilie Lorentsen: B',
 'Jake Wood: B',
 'Fatemeh Akhtar: B',
 'Kim Weston: B',
 'Yasmin Dar: B',
 'Viswamitra Upandhye: B',
 'Killian Kaufman: B',
 'Elwood Page: B',
 'Elodie Booker: B',
 'Adnan Chen: B',
 'Hank Spinka: B',
 'Hannah Bayer: B']

In [101]:
len([student for student in grades if re.match("^\w+\s\w+:\sB", student)])

16

In [102]:
# 16 is expected

[student for student in grades if ": B" in student]

['Bell Kassulke: B',
 'Simon Loidl: B ',
 'Elias Jovanovic: B ',
 'Hakim Botros: B',
 'Emilie Lorentsen: B',
 'Jake Wood: B',
 'Fatemeh Akhtar: B',
 'Kim Weston: B',
 'Yasmin Dar: B',
 'Viswamitra Upandhye: B',
 'Killian Kaufman: B',
 'Elwood Page: B',
 'Elodie Booker: B',
 'Adnan Chen: B',
 'Hank Spinka: B',
 'Hannah Bayer: B']

In [103]:
len([student for student in grades if ": B" in student])

16

In [104]:
with open(r"D:/Introduction-to-Data-Science-in-Python/assignment1/assets/logdata.txt", "r") as file:
    logs = file.read()

In [106]:
logs = logs.split("\n")
logs[:5]

['146.204.224.152 - feest6811 [21/Jun/2019:15:45:24 -0700] "POST /incentivize HTTP/1.1" 302 4622',
 '197.109.77.178 - kertzmann3129 [21/Jun/2019:15:45:25 -0700] "DELETE /virtual/solutions/target/web+services HTTP/2.0" 203 26554',
 '156.127.178.177 - okuneva5222 [21/Jun/2019:15:45:27 -0700] "DELETE /interactive/transparent/niches/revolutionize HTTP/1.1" 416 14701',
 '100.32.205.59 - ortiz8891 [21/Jun/2019:15:45:28 -0700] "PATCH /architectures HTTP/1.0" 204 6048',
 '168.95.156.240 - stark2413 [21/Jun/2019:15:45:31 -0700] "GET /engage HTTP/2.0" 201 9645']

In [135]:
len(logs)

980

#### ***Template***
***{'host': '146.204.224.152',      
&emsp;  'user_name': 'feest6811',          
&emsp;&emsp;  'time': '21/Jun/2019:15:45:24 -0700',          
&emsp;&emsp;&emsp;  'request': 'POST /incentivize HTTP/1.1'}***

In [198]:
logs[0]

'146.204.224.152 - feest6811 [21/Jun/2019:15:45:24 -0700] "POST /incentivize HTTP/1.1" 302 4622'

In [205]:
[log.split(" ")[0] for log in logs][:10]

['146.204.224.152',
 '197.109.77.178',
 '156.127.178.177',
 '100.32.205.59',
 '168.95.156.240',
 '71.172.239.195',
 '180.95.121.94',
 '144.23.247.108',
 '2.179.103.97',
 '241.114.184.133']

In [139]:
log = '146.204.224.152 - feest6811 [21/Jun/2019:15:45:24 -0700] "POST /incentivize HTTP/1.1" 302 4622'

In [185]:
ip_reg = "\d+\.\d+\.\d+\.\d+[^\s]"
re.findall(ip_reg, log)[0]

'146.204.224.152'

In [146]:
user_reg = "-\s[a-zA-Z0-9]+\s"
re.findall(user_reg, log)[0].replace(" ", "").replace("-", "")

'feest6811'

In [168]:
#             [21  /Jun         /2019  :15    :45    :24     -0700]
time_reg = "\d{2}\/[a-zA-Z]{3}\/\d{4}\:\d{2}\:\d{2}\:\d{2}\s-\d{4}"
re.findall(time_reg, log)[0]

'21/Jun/2019:15:45:24 -0700'

In [180]:
#                "POST       /incentivize HTTP/1.1"
request_reg = "[A-Z]+\s\/[^\"]+"
re.findall(request_reg, log)[0]

'POST /incentivize HTTP/1.1'

In [207]:
def serializeLogToDict(log: str) -> dict:
    try:
        return {
        "host": re.findall(ip_reg, log)[0],
        "user_name": re.findall(user_reg, log)[0].replace(" ", "").replace("-", ""),
        "time": re.findall(time_reg, log)[0],
        "request": re.findall(request_reg, log)[0]
        }
    except IndexError:
        pass

In [208]:
serialized_logs = list()
skipped_logs = 0

len(list(map(serializeLogToDict, logs)))

980

In [209]:
len(logs)

980

In [210]:
serialized_logs = list(map(serializeLogToDict, logs))

In [211]:
serialized_logs

[{'host': '146.204.224.152',
  'user_name': 'feest6811',
  'time': '21/Jun/2019:15:45:24 -0700',
  'request': 'POST /incentivize HTTP/1.1'},
 {'host': '197.109.77.178',
  'user_name': 'kertzmann3129',
  'time': '21/Jun/2019:15:45:25 -0700',
  'request': 'DELETE /virtual/solutions/target/web+services HTTP/2.0'},
 {'host': '156.127.178.177',
  'user_name': 'okuneva5222',
  'time': '21/Jun/2019:15:45:27 -0700',
  'request': 'DELETE /interactive/transparent/niches/revolutionize HTTP/1.1'},
 {'host': '100.32.205.59',
  'user_name': 'ortiz8891',
  'time': '21/Jun/2019:15:45:28 -0700',
  'request': 'PATCH /architectures HTTP/1.0'},
 {'host': '168.95.156.240',
  'user_name': 'stark2413',
  'time': '21/Jun/2019:15:45:31 -0700',
  'request': 'GET /engage HTTP/2.0'},
 {'host': '71.172.239.195',
  'user_name': 'dooley1853',
  'time': '21/Jun/2019:15:45:32 -0700',
  'request': 'PUT /cutting-edge HTTP/2.0'},
 {'host': '180.95.121.94',
  'user_name': 'mohr6893',
  'time': '21/Jun/2019:15:45:34 -0700'