main.py
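"""Check whether a URL may be crawled, and at what delay, per the site's robots.txt."""
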
import re
import urllib.robotparser

import colorama
from colorama import Fore, Back, Style

colorama.init()  # wrap stdout so ANSI colour codes also render on Windows


class Robots:
    """Wraps urllib.robotparser.RobotFileParser for one URL / user-agent pair."""

    def __init__(self, url, user_agent):
        self.raw_url = url
        # An empty user-agent falls back to '*' (the rules for all crawlers).
        self.user_agent = user_agent if user_agent else '*'
        self.root_url = self._get_root_url()
        self.robots_url = self._get_robots_txt_path()
        self.rp = self._get_robots_txt()

    def _get_root_url(self):
        # Extract scheme + host, e.g. 'https://example.com', whether or not
        # the URL carries a trailing path.
        pattern = r'(?P<root>https?://[^/]+)'
        result = re.match(pattern, self.raw_url)
        if result is not None:
            return result.group('root')

    def _get_robots_txt_path(self):
        return self.root_url + '/robots.txt'

    def _get_robots_txt(self):
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(self.robots_url)
        rp.read()  # download and parse the robots.txt file
        return rp

    def url_can_fetch(self) -> str:
        if self.rp.can_fetch(self.user_agent, self.raw_url):
            return Back.BLUE + 'Allow' + Style.RESET_ALL
        return Back.RED + 'Disallow' + Style.RESET_ALL

    def url_crawl_delay(self) -> str:
        delay = self.rp.crawl_delay(self.user_agent)
        if delay is not None:
            return Back.MAGENTA + str(delay) + 's' + Style.RESET_ALL
        return Back.MAGENTA + 'None' + Style.RESET_ALL + ' (1s wait recommended)'


def main():
    url = input('URL Here >>> ')
    user_agent = input('User-Agent Here >>> ')
    robots = Robots(url, user_agent)
    print(f"""
root URL       -> {robots.root_url}
robots.txt URL -> {robots.robots_url}
-------------------------------------
Can fetch
user-agent: {Fore.GREEN}{robots.user_agent}{Style.RESET_ALL} -> {robots.url_can_fetch()}
-------------------------------------
Crawl delay
user-agent: {Fore.GREEN}{robots.user_agent}{Style.RESET_ALL} -> {robots.url_crawl_delay()}
""")


if __name__ == '__main__':
    main()
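
# A minimal sketch of driving the class directly rather than via input();
# the URL and user-agent below are illustrative, and the result depends on
# the target site's actual robots.txt:
#
#   robots = Robots('https://example.com/some/page', 'MyBot')
#   print(robots.url_can_fetch())    # colourised 'Allow' or 'Disallow'
#   print(robots.url_crawl_delay())  # e.g. '10s', or 'None (1s wait recommended)'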