In [None]:
# Writing Web Crawlers
import datetime
import random
import re

import requests
from bs4 import BeautifulSoup

url = 'http://en.wikipedia.org/wiki/Kevin_Bacon'

# Download the start article once; `soup` is reused by the cells that follow.
# Every handler re-raises after printing: if the request failed there is no
# usable response, and carrying on would only surface later as a confusing
# NameError on `r` (or as an error page parsed as if it were the article).
try:
    # requests applies no timeout by default, so without this argument a
    # stalled server blocks the cell forever and the Timeout handler is dead code.
    r = requests.get(url, timeout=10)
    r.raise_for_status()
except requests.exceptions.HTTPError as errh:
    print("Http Error:", errh)
    raise
except requests.exceptions.Timeout as errt:
    # Listed before ConnectionError because ConnectTimeout derives from both;
    # this way a timeout is reported as a timeout.
    print("Timeout Error:", errt)
    raise
except requests.exceptions.ConnectionError as errc:
    print("Error Connecting:", errc)
    raise
except requests.exceptions.RequestException as err:
    print("OOps: Something Else", err)
    raise

html = r.text
soup = BeautifulSoup(html, 'html.parser')

In [None]:
# Each Tag keeps its HTML attributes in the dict `.attrs`. For every anchor
# that has an href, show the href followed by the full attribute dict.

list_of_links = soup.find_all('a')
for link in list_of_links:
    if 'href' not in link.attrs:
        continue  # anchors without a target (e.g. named anchors) are skipped
    print(link.attrs['href'])
    print(link.attrs, end='\n\n')


In [None]:
# Article links: anchors inside div#bodyContent whose href starts with /wiki/
# and contains no colon. The filter is passed as the `href=` keyword.
body_content = soup.find('div', {'id':'bodyContent'})
article_href = re.compile('^(/wiki/)((?!:).)*$')
for link in body_content.find_all('a', href=article_href):
    if 'href' not in link.attrs:
        continue
    print(link.attrs['href'])

In [None]:
# Article links again, this time passing the href pattern through an
# attribute dict as the second positional argument of find_all.
href_filter = {'href' : re.compile('^(/wiki/)((?!:).)*$')}
body_content = soup.find('div', {'id':'bodyContent'})
for link in body_content.find_all('a', href_filter):
    if 'href' in link.attrs:
        print(link.attrs['href'])

<h4><span style="font-family:courier">The program defines the getLinks function, which takes in an article URL of
the form /wiki/..., prepends the Wikipedia domain name, http://en.wikipedia.org,
and retrieves the BeautifulSoup object for the HTML at that URL. It
then extracts a list of article link tags, based on the parameters discussed previously,
and returns them.
The main body of the program begins with setting a list of article link tags (the
links variable) to the list of links in the initial page: https://en.wikipedia.org/wiki/Kevin_Bacon.
It then goes into a loop, finding a random article link tag in the page,
extracting the href attribute from it, printing the page, and getting a new list of links
from the extracted URL.
Of course, there’s a bit more to solving a Six Degrees of Wikipedia problem than
building a scraper that goes from page to page. You must also be able to store and
analyze the resulting data.</span></h4>

In [None]:
# Seed the generator so every run takes a different walk. Called without an
# argument, random.seed() seeds from OS entropy or the current time. Passing a
# datetime object, as this cell used to, raises TypeError on Python 3.11+,
# where only None, int, float, str, bytes and bytearray seeds are accepted.
random.seed()

def getLinks(articleUrl):
    """Return the article-link tags found in the body of a Wikipedia page.

    Args:
        articleUrl: site-relative path such as '/wiki/Kevin_Bacon'; it is
            appended to 'http://en.wikipedia.org'.

    Returns:
        A list of <a> tags inside div#bodyContent whose href starts with
        /wiki/ and contains no colon. The list is empty when the page has
        no div#bodyContent.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including a timeout after 10 seconds).
    """
    # requests has no default timeout; bound the wait so a stalled server
    # cannot hang the walk indefinitely.
    r = requests.get('http://en.wikipedia.org{}'.format(articleUrl), timeout=10)
    bs = BeautifulSoup(r.text, 'html.parser')
    body = bs.find('div', {'id':'bodyContent'})
    if body is None:
        # find() returns None when the div is missing; an empty result lets
        # the caller's loop end instead of failing with AttributeError.
        return []
    return body.find_all('a', href=re.compile('^(/wiki/)((?!:).)*$'))

# Random walk through Wikipedia starting at Kevin Bacon: pick a random article
# link on the current page, print it, and continue from that article.
# Nearly every article has outgoing links, so without a cap the loop would in
# practice never end and no later cell could run under "Run All".
MAX_HOPS = 25  # raise this for a longer walk

links = getLinks('/wiki/Kevin_Bacon')

for _ in range(MAX_HOPS):
    if not links:
        break  # dead end: the current page has no article links
    newArticle = random.choice(links).attrs['href']
    print(newArticle)
    links = getLinks(newArticle)

In [None]:
# hrefs of every /wiki/ page discovered so far
pages = set()


def _fetchWikiHrefs(pageUrl):
    """Fetch one Wikipedia page and return the href of each /wiki/ link, in document order."""
    r = requests.get('http://en.wikipedia.org{}'.format(pageUrl), timeout=10)
    bs = BeautifulSoup(r.text, 'html.parser')
    return [link.attrs['href'] for link in bs.find_all('a', href=re.compile('^(/wiki/)'))]


def getLinks(pageUrl, maxPages=None):
    """Crawl Wikipedia depth-first from pageUrl, printing each newly found page.

    Every href not yet in the global `pages` set is printed, added to the set
    and then crawled itself before the remaining links of the current page are
    examined. That is the same visiting order as a recursive crawl, but the
    pending pages are kept on an explicit stack, so a long chain of pages
    cannot exceed Python's recursion limit.

    Args:
        pageUrl: site-relative path ('' for the main page) appended to
            'http://en.wikipedia.org'.
        maxPages: optional cap; the crawl stops as soon as `pages` holds this
            many entries. None (the default) crawls until no new page is found.
    """
    # Each stack entry iterates over the not-yet-examined links of one page.
    pending = [iter(_fetchWikiHrefs(pageUrl))]
    while pending:
        for newPage in pending[-1]:
            if newPage not in pages:
                # We have encountered a new page
                print(newPage)
                pages.add(newPage)
                if maxPages is not None and len(pages) >= maxPages:
                    return
                # Descend into the new page; the current page resumes later.
                pending.append(iter(_fetchWikiHrefs(newPage)))
                break
        else:
            # Every link on this page has been handled; go back up one level.
            pending.pop()


# Capped so the notebook can run top to bottom; drop maxPages for a full crawl.
getLinks('', maxPages=50)

In [None]:
# hrefs of every /wiki/ page discovered so far
pages = set()


def _summarizeAndCollect(pageUrl):
    """Fetch one page, print its title, first paragraph and edit link, and return its /wiki/ hrefs."""
    r = requests.get('http://en.wikipedia.org{}'.format(pageUrl), timeout=10)
    bs = BeautifulSoup(r.text, 'html.parser')
    try:
        print(bs.h1.get_text())
        print(bs.find(id ='mw-content-text').find_all('p')[0])
        print(bs.find(id='ca-edit').find('span').find('a').attrs['href'])
    except (AttributeError, IndexError, KeyError):
        # AttributeError: an expected element is absent (find() returned None).
        # IndexError: the content div holds no <p>. KeyError: the edit anchor
        # has no href. All three mean the page lacks part of the layout.
        print('This page is missing something! Continuing.')
    return [link.attrs['href'] for link in bs.find_all('a', href=re.compile('^(/wiki/)'))]


def getLinks(pageUrl, maxPages=None):
    """Crawl Wikipedia depth-first from pageUrl, printing a summary of each page.

    Each visited page has its <h1> text, first content paragraph and edit link
    printed. Every /wiki/ href not yet in the global `pages` set is then
    announced, recorded and visited before the remaining links of the current
    page are examined. The pending pages are kept on an explicit stack rather
    than the call stack, so a long chain of pages cannot exceed Python's
    recursion limit.

    Args:
        pageUrl: site-relative path ('' for the main page) appended to
            'http://en.wikipedia.org'.
        maxPages: optional cap; the crawl stops as soon as `pages` holds this
            many entries. None (the default) crawls until no new page is found.
    """
    # Each stack entry iterates over the not-yet-examined links of one page.
    pending = [iter(_summarizeAndCollect(pageUrl))]
    while pending:
        for newPage in pending[-1]:
            if newPage not in pages:
                # We have encountered a new page
                print('-'*20)
                print(newPage)
                pages.add(newPage)
                if maxPages is not None and len(pages) >= maxPages:
                    return
                # Descend into the new page; the current page resumes later.
                pending.append(iter(_summarizeAndCollect(newPage)))
                break
        else:
            # Every link on this page has been handled; go back up one level.
            pending.pop()


# Capped so the notebook can run top to bottom; drop maxPages for a full crawl.
getLinks('', maxPages=50)

In [None]:
import requests
from bs4 import BeautifulSoup


url = 'http://en.wikipedia.org/wiki/Kevin_Bacon'

# timeout: requests waits indefinitely by default, which would hang the cell.
r = requests.get(url, allow_redirects=True, timeout=10)
# Name the parser explicitly: without it BeautifulSoup emits a warning and picks
# whichever parser happens to be installed, so results can differ per machine.
soup = BeautifulSoup(r.content, 'html.parser')

# href=True keeps only the anchors that actually carry an href attribute.
for a_href in soup.find_all('a', href=True):
    print(a_href['href'])

In [None]:
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
# Seed from OS entropy / the current time. Passing a datetime object to
# random.seed() raises TypeError on Python 3.11+, where only None, int, float,
# str, bytes and bytearray seeds are accepted.
random.seed()


def getInternalLinks(bs, includeUrl):
    """Return the distinct internal links found on a page, as absolute URLs.

    A link counts as internal when its href starts with "/" or contains the
    site root (scheme://host) of includeUrl.

    Args:
        bs: parsed page; must provide find_all(name, href=<compiled regex>)
            returning tags with an `attrs` dict (a BeautifulSoup object).
        includeUrl: any URL on the site; only its scheme and host are used.

    Returns:
        List of absolute URLs in document order, without duplicates.
    """
    site = urlparse(includeUrl)
    includeUrl = '{}://{}'.format(site.scheme, site.netloc)
    # re.escape: the dots in the host must match literally, not "any character".
    pattern = re.compile('^(/|.*' + re.escape(includeUrl) + ')')
    internalLinks = []
    for link in bs.find_all('a', href=pattern):
        href = link.attrs['href']
        if href.startswith('//'):
            # Protocol-relative URL ("//host/path"): it names a host, so it is
            # internal only when that host is ours; prefixing the site root
            # would produce a bogus address.
            if urlparse(href).netloc != site.netloc:
                continue
            href = '{}:{}'.format(site.scheme, href)
        elif href.startswith('/'):
            href = includeUrl + href
        # Compare the absolute form: the list stores absolute URLs, so testing
        # the raw relative href would never detect a duplicate.
        if href not in internalLinks:
            internalLinks.append(href)
    return internalLinks


def getExternalLinks(bs, excludeUrl):
    """Return the distinct external links found on a page.

    A link counts as external when its href starts with "http" or "www" and
    does not contain the excluded host anywhere.

    Args:
        bs: parsed page; must provide find_all(name, href=<compiled regex>)
            returning tags with an `attrs` dict (a BeautifulSoup object).
        excludeUrl: the current site, given either as a bare host
            ('example.com') or as a full URL ('http://example.com').

    Returns:
        List of hrefs in document order, without duplicates.
    """
    # Reduce a full URL to its host. With the scheme left in, the pattern's
    # leading (http|www) has already consumed "http" by the time the lookahead
    # runs, so the site's own links would never be excluded.
    excludeHost = urlparse(excludeUrl).netloc or excludeUrl
    # re.escape: the dots in the host must match literally, not "any character".
    pattern = re.compile('^(http|www)((?!' + re.escape(excludeHost) + ').)*$')
    externalLinks = []
    for link in bs.find_all('a', href=pattern):
        href = link.attrs['href']
        if href not in externalLinks:
            externalLinks.append(href)
    return externalLinks


def getRandomExternalLink(startingPage):
    """Return one randomly chosen external link reachable from startingPage.

    If the page itself has no external links, a random internal link is
    followed and the search repeats from there.

    Args:
        startingPage: absolute URL of the page to inspect.

    Returns:
        The href of an external link.

    Raises:
        ValueError: if the page has neither external nor internal links, so
            there is nowhere left to look.
    """
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(startingPage) as html:
        bs = BeautifulSoup(html, 'html.parser')
    site = urlparse(startingPage)
    externalLinks = getExternalLinks(bs, site.netloc)
    if externalLinks:
        return random.choice(externalLinks)

    print('No external links, looking around the site for one')
    domain = '{}://{}'.format(site.scheme, site.netloc)
    internalLinks = getInternalLinks(bs, domain)
    if not internalLinks:
        # Previously this surfaced as an opaque "empty range" error from randint.
        raise ValueError('No internal or external links found on {}'.format(startingPage))
    return getRandomExternalLink(random.choice(internalLinks))


def followExternalOnly(startingSite, maxHops=None):
    """Hop from site to site by repeatedly following a random external link.

    Each hop prints the chosen link and continues from it. Written as a loop
    rather than self-recursion so a long walk cannot hit Python's recursion
    limit.

    Args:
        startingSite: absolute URL where the walk begins.
        maxHops: optional number of hops after which to stop. None (the
            default) keeps hopping until an error ends the walk.
    """
    hops = 0
    while maxHops is None or hops < maxHops:
        externalLink = getRandomExternalLink(startingSite)
        print('Random external link is: {}'.format(externalLink))
        startingSite = externalLink
        hops += 1


# Capped so the notebook can run top to bottom; drop maxHops for an open-ended walk.
followExternalOnly('http://oreilly.com', maxHops=10)

In [None]:
# Collects a list of all external URLs found on the site
allExtLinks = set()   # every external link printed so far
allIntLinks = set()   # every internal page already scheduled for a visit


def getAllExternalLinks(siteUrl):
    """Print every not-yet-seen external link on siteUrl, then crawl its internal links.

    New external links are added to the global `allExtLinks` and printed. Each
    internal link not yet in the global `allIntLinks` is recorded there and
    crawled recursively.

    Note: the function calls itself once per newly discovered internal page,
    so a site with a long enough chain of pages can exceed Python's recursion
    limit.

    Args:
        siteUrl: absolute URL of the page to crawl.
    """
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(siteUrl) as html:
        bs = BeautifulSoup(html, 'html.parser')
    site = urlparse(siteUrl)
    domain = '{}://{}'.format(site.scheme, site.netloc)
    internalLinks = getInternalLinks(bs, domain)
    # Pass the bare host, as getRandomExternalLink does. With the scheme
    # included, the exclusion pattern cannot match and the site's own absolute
    # links would be reported as external.
    externalLinks = getExternalLinks(bs, site.netloc)
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)


# Mark the start page as visited so the crawl does not come back to it.
allIntLinks.add('http://oreilly.com')
getAllExternalLinks('http://oreilly.com')