# Focused Web crawling and anchor text extraction

The task is to build a simple focused web crawler that crawls pages from a given website (i.e., without leaving a given domain) and extracts anchor text from each of the pages discovered.

In [11]:
DOMAIN = "wikipedia.org"  # this is the site we want to crawl; we don't follow links outside this domain
START = "https://en.wikipedia.org/wiki/Stavanger"  # we start crawling from this URL

### This utility function crawls a given URL and extracts all links from it.

In [12]:
from bs4 import BeautifulSoup
import urllib.request

In [13]:
def extract_links(url):
    resp = urllib.request.urlopen(url)
    soup = BeautifulSoup(resp, "lxml", from_encoding=resp.info().get_param('charset'))
    links = []
    for link in soup.find_all('a', href=True):
        links.append((link['href'], link.text))
        #print(link['href'], link.text)
    return links

## Crawler main

In [14]:
import time
from urllib.parse import urljoin

In [15]:
def crawl(start_url, domain):
    visited = set()  # set of URLs visited so far
    atext = {}  # anchor text extracted for URLs, i.e., the key is the URL the value is a list of anchor texts
    queue = [start_url]
    
    while len(queue) > 0:
        url = queue.pop(0)
        print("Crawling {} ...".format(url))
        links = extract_links(url)
        # TODO for each link
        # - add anchor text to the page that is pointed by the link
        # - add link to queue if 
        #   - (i) it is within the target domain 
        #   - (ii) it hasn't been visited yet
        visited.add(url)
        for link_url, link_text in links:
            if link_url.startswith("/"):
                print("Relativ link: ", link_url)
                
                # Fikser relativ link
                link_url = urljoin(url, link_url)
                print("=>", link_url)
            elif link_url.startswith("http"):
                print("Absolute link: ", link_url)
            else:
                pass
            
            atext = {link_url:link_text}
            
            if url not in visited and domain==Domain:
                queue.append[atext]
            
            
            
        
        
        time.sleep(1)  # wait 1 sec before moving to next page

### Start crawling

In [16]:
crawl(START, DOMAIN)

Crawling https://en.wikipedia.org/wiki/Stavanger ...
Relativ link:  /wiki/File:Stavangercollage01.jpg
=> https://en.wikipedia.org/wiki/File:Stavangercollage01.jpg
Relativ link:  /wiki/File:Stavanger_komm.svg
=> https://en.wikipedia.org/wiki/File:Stavanger_komm.svg
Relativ link:  /wiki/File:Norway_Rogaland_location_map.svg
=> https://en.wikipedia.org/wiki/File:Norway_Rogaland_location_map.svg
Relativ link:  /wiki/File:Norway_location_map.svg
=> https://en.wikipedia.org/wiki/File:Norway_location_map.svg
Relativ link:  //tools.wmflabs.org/geohack/geohack.php?pagename=Stavanger&params=58_57_48_N_5_43_8_E_region:NO_type:city(130426)
=> https://tools.wmflabs.org/geohack/geohack.php?pagename=Stavanger&params=58_57_48_N_5_43_8_E_region:NO_type:city(130426)
Relativ link:  /wiki/Geographic_coordinate_system
=> https://en.wikipedia.org/wiki/Geographic_coordinate_system
Relativ link:  //tools.wmflabs.org/geohack/geohack.php?pagename=Stavanger&params=58_57_48_N_5_43_8_E_region:NO_type:city(130426)
