# iit_scrapper.py
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup
import re

# Email regex, written after analysing the scraped data: matches an address in
# plain "user@host.tld" form or with "[AT]"/"(AT)"/" AT "-style obfuscations.
regex = r"^[\w-]{1,20}(@|\[AT\]|\[AT\*\]|\(AT\)|\sAT\s)[\w]{2,20}(\.|\[DOT\]|\(DOT\)|\sDOT\s)([a-z]{2,8})((\.|\[DOT\]|\(DOT\)|\sDOT\s)[a-z]{2,8})?$"
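# For illustration (hypothetical addresses, not from any scraped page): after
# checker() normalisation the pattern accepts "jdoe@iitd.ac.in" and
# "j_doe@iitb.ac", but rejects "jdoe@cse.iitb.ac.in", since at most two
# dot-separated labels may follow the host part after the "@".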
links = set()
def go_to_links_in_links():
    # Crawl every link harvested so far (one pyCrawler pass per link).
    for link in links:
        pyCrawler(link)
def checker(s):
    # Normalise the common obfuscations seen in the data to plain "@" and ".".
    s = s.replace(" at ", "@")
    s = s.replace("(at)", "@")
    s = s.replace("[at]", "@")
    s = s.replace("{at}", "@")
    s = s.replace("[at*]", "@")
    s = s.replace(" dot ", ".")
    s = s.replace("(dot)", ".")
    s = s.replace("[dot]", ".")
    s = s.replace("{dot}", ".")  # counterpart of the "{at}" rule
    s = s.replace("[dot*]", ".")
    return s
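# e.g. (hypothetical inputs) checker("jdoe(at)iitd(dot)ac(dot)in") and
# checker("jdoe at iitd dot ac dot in") both return "jdoe@iitd.ac.in".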
class pyCrawler(object):
    def __init__(self, starting_url):
        url = starting_url
        html = uReq(url).read()
        soup = BeautifulSoup(html, "html.parser")
        # Optional second pass: harvest absolute links for go_to_links_in_links().
        # for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
        #     checkedLink = link.get('href')
        #     links.add(checkedLink)
        # Refining the text: drop script/style blocks, then blank lines.
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        # Splitting the text into comma-separated candidate tokens.
        temp_text = text.splitlines()
        temp2_text = ", ".join(temp_text)
        l = temp2_text.split(", ")
        data = []
        # Checking if there are emails on the page and correcting them.
        for tex in l:
            tex = checker(tex.lower())
            mail = re.match(regex, tex)
            if mail:
                data.append(mail.group())
        # Writing the emails found on this page to a numbered CSV file,
        # so that pages crawled later do not overwrite earlier results.
        global i
        if data:
            filename = "emails_on_page" + str(i) + ".csv"
            with open(filename, "w") as f:
                f.write("Email id\n")
                for mail in data:
                    f.write(mail + "\n")
            i += 1
        print("Task Completed")
# go_to_links_in_links()

# Accessing the HTML at the given URL and starting the crawl.
url = input("Give the starting url : ")
i = 1
startCrawler = pyCrawler(url)
# print(links)
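# Expected interaction (the URL is a placeholder), assuming the page lists emails:
#   $ python iit_scrapper.py
#   Give the starting url : http://www.example.ac.in/faculty
#   Task Completed
# Matches land in emails_on_page1.csv. Uncommenting the findAll block in
# pyCrawler.__init__ and the go_to_links_in_links() call above would also
# crawl every absolute link found on the starting page.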