In [11]:
# Write a web crawling program in the language of your choice. Your program must meet the
# criteria described below. Also, write a report of up to half a page describing your crawling
# approach. What are your crawler's strengths and vulnerabilities? Would you use this program in a
# production setting? What problems would it face, and how could it be improved? While you don't
# need to go "above and beyond" the criteria below, you are encouraged to think about what that
# might mean.
# Program Specification
# 1. Start at the URL http://www.mit.edu and visit only web pages in the mit.edu domain.
# 2. Stop once you have visited 100 HTML pages.
# 3. Detect whether a page you visit is an HTML document, a PDF, or something else using the
# appropriate Content-Type HTTP header.
# 4. Detect all outgoing links on each HTML page you visit, and follow only links to pages
# you have not yet visited. This will require you to convert some links to a canonical form,
# e.g. http://somepage.com/my_page.html# should be converted to
# http://somepage.com/my_page.html.
# 5. For every web page you visit, write its canonical URL and the canonical URLs of each
# outgoing link to an HTML or PDF file, separated by a single space, to a plain text output
# file, with one line per visited page.
# 6. Put some thought into the pages you decide to visit, and the order in which you visit them.
# Is there any way links on web pages could cause your crawler to misbehave? Remember
# that a crawler on the open web will face many novice and malicious web developers.
# 7. Your crawler must visit at most one page per five seconds. Under no circumstances, even
# during development and testing, should your crawler visit pages more frequently. This is
# necessary in order to be respectful of the limited resources of the web servers you are
# visiting. (Feel free during development to reduce the total number of pages you visit if this
# is slowing you down.)
# 8. Your crawler must respect robots.txt (https://web.mit.edu/robots.txt).

In [12]:
import requests
import re
import time
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser
from collections import deque
from urllib.parse import urldefrag
from urllib.parse import urlsplit

In [13]:
# Shared crawler state, kept at module level so every later cell can use it.
visited, queue = set(), deque()  # URLs already discovered / FIFO frontier still to fetch
count = 0                        # incremented each time a URL is added to the frontier
domain = 'mit.edu'               # domain the crawl is restricted to

# Download and parse the robots.txt rules once; URL validation consults `rp`.
rp = RobotFileParser('http://www.mit.edu/robots.txt')
rp.read()

In [14]:
# function to check if the url is valid
def is_valid(url, robots=None):
    """Return True if ``url`` is an absolute mit.edu URL the crawler may fetch.

    A URL is accepted only when all of the following hold:

    * its scheme is ``http`` or ``https``;
    * its host is ``mit.edu`` or a subdomain of it (compared
      case-insensitively, ignoring any port or user-info in the netloc);
    * it carries no query string and no fragment;
    * the robots.txt rules allow user agent ``*`` to fetch it.

    Parameters
    ----------
    url : str
        The (already canonicalised) URL to check.
    robots : object, optional
        Any object exposing ``can_fetch(useragent, url)``. Defaults to the
        module-level ``rp`` parser.

    Returns
    -------
    bool
        True when the URL passes every check, False otherwise. Malformed
        URLs that ``urlparse`` rejects with ``ValueError`` are reported as
        invalid rather than raising.

    Raises
    ------
    TypeError
        Re-raised (after logging the offending value) if ``url`` has a type
        the checks cannot handle.
    """
    try:
        parsed = urlparse(url)
        if parsed.scheme not in ("http", "https"):
            return False
        # `hostname` is lower-cased and excludes port/user-info, unlike `netloc`.
        host = parsed.hostname or ""
        if host != "mit.edu" and not host.endswith(".mit.edu"):
            return False
        if parsed.query or parsed.fragment:
            return False
        # Resolve the parser late so the cheap checks above never need it.
        checker = rp if robots is None else robots
        return bool(checker.can_fetch("*", url))
    except ValueError:
        # e.g. "http://[bad" -> "Invalid IPv6 URL": simply not crawlable.
        return False
    except TypeError:
        # Log `url`, which is always bound (the parse result may not be).
        print ("TypeError for ", url)
        raise

In [15]:
# function to get the canonical url
def get_canonical_url(url):
    """Return a canonical form of ``url`` so equivalent links compare equal.

    Normalisation steps:

    1. Drop the fragment (``page.html#top`` -> ``page.html``).
    2. Lower-case the host part of the netloc (user-info is left untouched).
    3. Remove a trailing directory-index file name (``index.html``,
       ``index.php``, ...), but only when it is a whole path segment, so
       ``/myindex.html`` is left alone.
    4. Remove one trailing slash from the path.

    The index file is removed *before* the slash so that ``/a/index.html``,
    ``/a/`` and ``/a`` all canonicalise to ``/a``.

    Parameters
    ----------
    url : str
        Absolute or relative URL.

    Returns
    -------
    str
        The canonicalised URL. Query strings are preserved.

    Raises
    ------
    ValueError
        Propagated from ``urlsplit`` for malformed URLs.
    """
    parts = urlsplit(urldefrag(url)[0])

    # Only the host is case-insensitive; keep any "user:password@" prefix as is.
    userinfo, at, hostport = parts.netloc.rpartition('@')
    netloc = userinfo + at + hostport.lower()

    # `(?<![^/])` = preceded by "/" or start of path, i.e. a full segment.
    path = re.sub(
        r'(?<![^/])index\.(?:html|htm|php|asp|jsp|cgi|pl|xhtml|shtml|xhtm)$',
        '',
        parts.path,
    )
    path = re.sub(r'/$', '', path)

    return parts._replace(netloc=netloc, path=path).geturl()

In [16]:
# function to get the links from the url
def get_links(url):
    """Fetch ``url`` and enqueue every new, crawlable link found on the page.

    The page is downloaded with ``requests``. If its Content-Type header
    contains ``text/html`` the body is parsed and each ``<a href>`` is:

    1. resolved against the final response URL (so relative links work),
    2. canonicalised with ``get_canonical_url``,
    3. checked with ``is_valid``,
    4. and, if not already in ``visited``, added to ``visited`` and
       ``queue``, incrementing the global ``count`` and printing the pair.

    Non-HTML responses (PDF or anything else) are only logged.

    Parameters
    ----------
    url : str
        URL of the page to fetch.

    Returns
    -------
    None
        Results are communicated through the module-level ``visited``,
        ``queue`` and ``count``.

    Notes
    -----
    Network failures are caught and logged so one bad page does not stop the
    crawl; other exceptions (including ``KeyboardInterrupt``) propagate.
    """
    global count  # the only global that is rebound; the containers are mutated in place

    try:
        # A timeout keeps a stalled server from hanging the whole crawl.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print (count, url, "request failed:", exc)
        return

    # Header may be absent; header values are compared case-insensitively.
    content_type = response.headers.get('content-type', '').lower()
    if 'text/html' not in content_type:
        # PDF or some other document type: nothing to parse for links.
        print (count, url)
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is None:
            continue
        try:
            # response.url is the URL after redirects, i.e. the true base.
            link = get_canonical_url(urljoin(response.url, href.strip()))
            if not is_valid(link) or link in visited:
                continue
        except (ValueError, TypeError):
            # A single malformed href must not discard the rest of the page.
            continue
        visited.add(link)
        queue.append(link)
        count += 1
        print (count, link)

In [17]:
# Breadth-first crawl starting from the MIT home page.
#
# The stop condition counts pages actually fetched. `count` is incremented by
# get_links for every link it *enqueues*, so it reaches 100 after only a few
# fetches and cannot serve as the "pages visited" limit. Every fetch counts
# towards the limit (HTML or not), so at most MAX_PAGES HTML pages are visited.
MAX_PAGES = 100          # upper bound on pages fetched in one run
CRAWL_DELAY_SECONDS = 5  # politeness delay: at most one request per 5 seconds

url = 'http://www.mit.edu'
visited.add(url)
queue.append(url)
count += 1
print (count, url)

pages_fetched = 0
while queue and pages_fetched < MAX_PAGES:
    url = queue.popleft()
    get_links(url)
    pages_fetched += 1
    # Sleep after every fetch, including the last, so an immediate re-run of
    # this cell still respects the rate limit.
    time.sleep(CRAWL_DELAY_SECONDS)

1 http://www.mit.edu
2 http://news.mit.edu
3 https://president.mit.edu/about-mit-president-sally-kornbluth
4 https://inauguration.mit.edu/street-fair
5 http://inauguration.mit.edu/we-are-the-forest
6 https://inauguration.mit.edu/academic-symposium
7 https://inauguration.mit.edu/inauguration-ceremony
8 https://inauguration.mit.edu/inauguration-concert
9 http://inauguration.mit.edu
10 http://news.mit.edu/news-clip/national-public-radio-npr-43
11 http://news.mit.edu/2023/speedy-robo-gripper-reflexively-organizes-spaces-0427
12 http://news.mit.edu/2023/amelia-dogan-driving-toward-data-justice-0426
13 http://whereis.mit.edu
14 http://calendar.mit.edu
15 http://careers.mit.edu
16 http://socialmediahub.mit.edu
17 http://web.mit.edu
18 http://web.mit.edu/education
19 http://web.mit.edu/research
20 http://web.mit.edu/innovation
21 http://web.mit.edu/admissions-aid
22 http://web.mit.edu/campus-life
23 http://web.mit.edu/alumni
24 http://web.mit.edu/about
25 http://web.mit.edu/search
26 http://we

In [None]:
# Write every URL in `visited` to the output file, one per line.
# The set is sorted so the file is identical from run to run (set iteration
# order is arbitrary), and UTF-8 is explicit so non-ASCII URLs cannot fail
# under a platform-specific default encoding.
with open('q3_visited.txt', 'w', encoding='utf-8') as f:
    for url in sorted(visited):
        f.write("%s\n" % url)