# Objective #

Our objective here is to build a crawler that can take, for example, Eric Idle's Wikipedia page and find the fewest number of links needed to reach the Kevin Bacon page on Wikipedia.

I will be exploring a few examples in the process. 

Python has a default recursion limit of 1000 nested calls. I will be putting in hard limits to contain the recursive process.

### 1: Find a list of all the article URLs that the Wikipedia article Kevin Bacon links to. Then select a random URL from that list and search again, repeating until we reach a page with no further links. ###

In [3]:
from urllib2 import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

# Seed the random number generator from the current wall-clock time so that
# each run follows a different random chain of articles.
# NOTE(review): Python 3.11+ only accepts None, int, float, str, bytes or
# bytearray as a seed; this file imports urllib2, i.e. it targets Python 2.
random.seed(datetime.datetime.now())


def getLinks(articleUrl):
    """Return the article links found in the body of a Wikipedia page.

    Links that point to other article pages share three traits:
    - They reside in the 'div' with 'id' set to 'bodyContent'
    - URLs do not have colons
    - URLs begin with '/wiki/'

    Args:
        articleUrl: Site-relative path of the article,
            e.g. "/wiki/Kevin_Bacon".

    Returns:
        The matching ``<a>`` tags, or an empty list when the page has no
        'bodyContent' div.
    """
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    try:
        # BeautifulSoup reads the whole response while parsing, so the
        # connection can be released as soon as the constructor returns.
        bsObj = BeautifulSoup(html, "html.parser")
    finally:
        html.close()

    bodyContent = bsObj.find("div", {"id": "bodyContent"})
    if bodyContent is None:
        # find() returns None when nothing matches; report "no links" so a
        # caller looping on the result stops instead of hitting an
        # AttributeError.
        return []
    return bodyContent.findAll("a", href=re.compile(r"^(/wiki/)((?!:).)*$"))

# Kevin Bacon page
links = getLinks("/wiki/Kevin_Bacon")

# Hop to a randomly chosen linked article and repeat from there. The walk
# ends after five hops (to save time) or on a page with no article links.
count = 0
while links and count != 5:
    chosen = links[random.randint(0, len(links) - 1)]
    newArticle = chosen.attrs["href"]
    print(newArticle)
    # Fetch the links of the article we just hopped to
    links = getLinks(newArticle)
    count += 1

/wiki/Annette_Bening
/wiki/Ingrid_Bergman
/wiki/Michelle_Pfeiffer
/wiki/Susan_Sarandon
/wiki/Diane_Keaton


### 2. Crawl through a page and find the internal links. Crawl through those links and continue the process. Ensure that internal links that appear on several pages are not crawled repeatedly. ###

In [12]:
from urllib2 import urlopen
from bs4 import BeautifulSoup
import re

# Every link discovered so far, shared across the recursive calls
pages = set()


def getLinks(pageUrl, maxLinks=1):
    """Crawl a Wikipedia page and recurse into links not seen before.

    Every newly discovered link is printed, recorded in the module-level
    ``pages`` set and crawled in turn, so a link is never followed twice
    even when it appears on several pages.

    Args:
        pageUrl: Site-relative path of the page, e.g. "/wiki/Kevin_Bacon".
        maxLinks: Number of links to examine on each page before returning.
            The default of 1 keeps the demonstration short.
    """
    html = urlopen("http://en.wikipedia.org" + pageUrl)
    try:
        # The response is fully read during parsing; close it right after.
        bsObj = BeautifulSoup(html, "html.parser")
    finally:
        html.close()

    count = 0
    # Find the links on the page
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        newPage = link.attrs.get('href')
        if newPage is None:
            continue
        if newPage not in pages:
            # We have encountered a new page
            print("----------------\n" + newPage)
            # Add page to our set.
            pages.add(newPage)
            # Recursively find the new links in that internal page.
            getLinks(newPage, maxLinks)
        count += 1
        # Per-page limit that keeps the recursion contained
        if count >= maxLinks:
            return
    

# Start the crawl from the Kevin Bacon article.
getLinks("/wiki/Kevin_Bacon") 

----------------
/wiki/Wikipedia:Protection_policy#semi


### 3. Building a scraper that collects the title, the first paragraph of content, and the link to edit the page. ###

In [15]:
'''
- Titles are under the h1 -> span 
- First paragraph is under div#mw-content-text -> p
- Edit links are only on the article pages, they are under li#ca-edit -> span -> a
'''

from urllib2 import urlopen
from bs4 import BeautifulSoup
import re

# Every link discovered so far, shared across the recursive calls
pages = set()


def getLinks(pageUrl, maxLinks=1):
    """Print a page's title, first paragraph and edit link, then crawl on.

    After reporting on the page itself, every link not seen before is
    printed, recorded in the module-level ``pages`` set and crawled in turn.

    Args:
        pageUrl: Site-relative path of the page, e.g. "/wiki/Kevin_Bacon".
        maxLinks: Number of links to examine on each page before returning.
            The default of 1 keeps the demonstration short.
    """
    html = urlopen("http://en.wikipedia.org"+pageUrl)
    try:
        # The response is fully read during parsing; close it right after.
        bsObj = BeautifulSoup(html, "html.parser")
    finally:
        html.close()

    try:
        # Get title
        print(bsObj.h1.get_text())
        # Get first paragraph
        print(bsObj.find(id="mw-content-text").findAll("p")[0])
        # Get edit link
        print(bsObj.find(id="ca-edit").find("span").find("a").attrs['href'])
    except (AttributeError, IndexError):
        # AttributeError: one of the lookups found nothing and gave None.
        # IndexError: the content area holds no <p> element.
        print("This page is missing something! No worries though!")

    # `count` was previously read without ever being assigned (NameError).
    count = 0
    # Get the links in the internal page
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        newPage = link.attrs.get('href')
        if newPage is None:
            continue
        if newPage not in pages:
            # We have encountered a new page
            print("----------------\n"+newPage)
            # Add it to our set
            pages.add(newPage)
            # Recursively call in the internal links
            getLinks(newPage, maxLinks)
        count += 1
        # Per-page limit that keeps the recursion contained
        if count >= maxLinks:
            return
            
# Start the scrape-and-crawl from the Kevin Bacon article.
getLinks("/wiki/Kevin_Bacon") 

Kevin Bacon
<p><b>Kevin Norwood Bacon</b><sup class="reference" id="cite_ref-1"><a href="#cite_note-1">[1]</a></sup> (born July 8, 1958)<sup class="reference" id="cite_ref-actor_2-0"><a href="#cite_note-actor-2">[2]</a></sup> is an American actor and musician whose films include musical-drama film <i><a href="/wiki/Footloose_(1984_film)" title="Footloose (1984 film)">Footloose</a></i> (1984), the controversial historical conspiracy legal thriller <i><a href="/wiki/JFK_(film)" title="JFK (film)">JFK</a></i> (1991), the legal drama <i><a href="/wiki/A_Few_Good_Men" title="A Few Good Men">A Few Good Men</a></i> (1992), the historical docudrama <i><a href="/wiki/Apollo_13_(film)" title="Apollo 13 (film)">Apollo 13</a></i> (1995), and the mystery drama <i><a href="/wiki/Mystic_River_(film)" title="Mystic River (film)">Mystic River</a></i> (2003). Also on television, he starred in the <a href="/wiki/Fox_Broadcasting_Company" title="Fox Broadcasting Company">Fox</a> series <i><a href="/wiki/T

### 4. Crawling across the internet ###

Steps followed are as follows:

* Retrieve a list of all internal links found on a page.
* Retrieve a list of all external links found on a page.
* Pick a random external link.
* Follow that link and continue.

This program will start at a website and hop randomly from external link to external link.

In [25]:
from urllib2 import urlopen
from urlparse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

# NOTE(review): `pages` is not referenced by any function in this cell.
pages = set()
# Seed the random number generator from the current wall-clock time so each
# run picks a different sequence of random links.
random.seed(datetime.datetime.now())

# Retrieves a list of all Internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    """Return the absolute URLs of all internal links found on a page.

    A link counts as internal when its href starts with "/" or contains the
    site's scheme and host. Site-relative hrefs are made absolute before
    they are recorded, so each target is listed only once.

    Args:
        bsObj: Parsed page (BeautifulSoup object) to search for <a> tags.
        includeUrl: Any URL on the site; only its scheme and host are used.

    Returns:
        List of absolute internal URLs in page order, without duplicates.
    """
    parsed = urlparse(includeUrl)
    includeUrl = parsed.scheme + "://" + parsed.netloc
    internalLinks = []
    seen = set()

    # re.escape keeps "." and other metacharacters of the URL literal.
    pattern = re.compile("^(/|.*" + re.escape(includeUrl) + ")")
    for link in bsObj.findAll("a", href=pattern):
        href = link.attrs['href']
        if href is None:
            continue
        if href.startswith("/"):
            href = includeUrl + href
        # Compare the absolute form: the result only ever holds absolute
        # URLs, so checking the raw relative href would never find a match.
        if href not in seen:
            seen.add(href)
            internalLinks.append(href)
    return internalLinks
            
# Retrieves a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    """Return all external links found on a page.

    A link counts as external when its href starts with "http" or "www" and
    does not contain ``excludeUrl`` anywhere.

    Args:
        bsObj: Parsed page (BeautifulSoup object) to search for <a> tags.
        excludeUrl: Text identifying the current site; hrefs containing it
            are skipped.

    Returns:
        List of external hrefs in page order, without duplicates.
    """
    externalLinks = []
    # re.escape makes excludeUrl match literally; unescaped, the "." in a
    # host name would match any character and exclude unrelated sites.
    pattern = re.compile("^(http|www)((?!" + re.escape(excludeUrl) + ").)*$")
    for link in bsObj.findAll("a", href=pattern):
        href = link.attrs['href']
        if href is not None and href not in externalLinks:
            externalLinks.append(href)
    return externalLinks

# Pick random external link
def getRandomExternalLink(startingPage):
    """Return a randomly chosen external link reachable from a page.

    When the page has no external links, a random internal link is followed
    and the search repeats there. There is no bound on how many pages are
    tried.

    Args:
        startingPage: Absolute URL of the page to start from.

    Returns:
        The href of one external link.

    Raises:
        ValueError: If a page has neither external nor internal links.
    """
    html = urlopen(startingPage)
    try:
        # The response is fully read during parsing; close it right after.
        bsObj = BeautifulSoup(html, "html.parser")
    finally:
        html.close()

    parsed = urlparse(startingPage)
    # Get the external links
    externalLinks = getExternalLinks(bsObj, parsed.netloc)
    if externalLinks:
        # Pick a random external link
        return random.choice(externalLinks)

    # If there are no external links, go into the internal links, then
    # select an external link from one of those.
    print("No external links, looking around the site for one")
    domain = parsed.scheme + "://" + parsed.netloc
    internalLinks = getInternalLinks(bsObj, domain)
    if not internalLinks:
        # Same exception type that randint(0, -1) raised here before, but
        # with a message that says what went wrong.
        raise ValueError("No links to follow on " + startingPage)
    return getRandomExternalLink(random.choice(internalLinks))
    
def followExternalOnly(startingSite, maxHops=None):
    """Hop from site to site by following random external links.

    Args:
        startingSite: Absolute URL of the page to start from.
        maxHops: Optional number of hops after which to stop. The default,
            None, keeps hopping until interrupted or an error is raised.
    """
    hops = 0
    # A loop instead of self-recursion: every hop used to add a stack frame,
    # so a long walk would end at the interpreter's recursion limit.
    while maxHops is None or hops < maxHops:
        # Get the external link to follow
        startingSite = getRandomExternalLink(startingSite)
        print("Random external link is: "+ startingSite)
        hops += 1
    

# Start the random walk from this site. followExternalOnly has no stop
# condition of its own, so this runs until interrupted or an error is raised.
followExternalOnly("http://adarsh-nair.appspot.com")

Random external link is: https://giphy.com/gifs/thehills-the-hills-1x05-105-xT5LMWGwEPNVep0iBO
Random external link is: http://www.mtv.com/shows/the-hills/episode-guide
Random external link is: https://vine.co/MTV
No external links, looking around the site for one
No external links, looking around the site for one
No external links, looking around the site for one
No external links, looking around the site for one
No external links, looking around the site for one
No external links, looking around the site for one
No external links, looking around the site for one
No external links, looking around the site for one
No external links, looking around the site for one
No external links, looking around the site for one
No external links, looking around the site for one
No external links, looking around the site for one
No external links, looking around the site for one
No external links, looking around the site for one
No external links, looking around the site for one
No external links, lo

KeyboardInterrupt: 

### 5. Get a list of all external and internal links on a webpage. ###

In [29]:
from urllib2 import urlopen
from urlparse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

# NOTE(review): `pages` is not referenced by any function in this cell.
pages = set()
# Seed the random number generator from the current wall-clock time so each
# run picks a different sequence of random links.
random.seed(datetime.datetime.now())

'''
Same as part 4.
'''
#Retrieves a list of all Internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    """Return the absolute URLs of all internal links found on a page.

    A link counts as internal when its href starts with "/" or contains the
    site's scheme and host. Site-relative hrefs are made absolute before
    they are recorded, so each target is listed only once.

    Args:
        bsObj: Parsed page (BeautifulSoup object) to search for <a> tags.
        includeUrl: Any URL on the site; only its scheme and host are used.

    Returns:
        List of absolute internal URLs in page order, without duplicates.
    """
    parsed = urlparse(includeUrl)
    includeUrl = parsed.scheme+"://"+parsed.netloc
    internalLinks = []
    seen = set()

    # re.escape keeps "." and other metacharacters of the URL literal.
    pattern = re.compile("^(/|.*"+re.escape(includeUrl)+")")
    for link in bsObj.findAll("a", href=pattern):
        href = link.attrs['href']
        if href is None:
            continue
        if href.startswith("/"):
            href = includeUrl+href
        # Compare the absolute form: the result only ever holds absolute
        # URLs, so checking the raw relative href would never find a match.
        if href not in seen:
            seen.add(href)
            internalLinks.append(href)
    return internalLinks
            
#Retrieves a list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    """Return all external links found on a page.

    A link counts as external when its href starts with "http" or "www" and
    does not contain ``excludeUrl`` anywhere.

    Args:
        bsObj: Parsed page (BeautifulSoup object) to search for <a> tags.
        excludeUrl: Text identifying the current site; hrefs containing it
            are skipped.

    Returns:
        List of external hrefs in page order, without duplicates.
    """
    externalLinks = []
    # re.escape makes excludeUrl match literally; unescaped, the "." in a
    # host name would match any character and exclude unrelated sites.
    pattern = re.compile("^(http|www)((?!"+re.escape(excludeUrl)+").)*$")
    for link in bsObj.findAll("a", href=pattern):
        href = link.attrs['href']
        if href is not None and href not in externalLinks:
            externalLinks.append(href)
    return externalLinks


def getRandomExternalLink(startingPage):
    """Return a randomly chosen external link reachable from a page.

    When the page has no external links, a random internal link is followed
    and the search repeats there. There is no bound on how many pages are
    tried.

    Args:
        startingPage: Absolute URL of the page to start from.

    Returns:
        The href of one external link.

    Raises:
        ValueError: If a page has neither external nor internal links.
    """
    html = urlopen(startingPage)
    try:
        # The response is fully read during parsing; close it right after.
        bsObj = BeautifulSoup(html, "html.parser")
    finally:
        html.close()

    parsed = urlparse(startingPage)
    externalLinks = getExternalLinks(bsObj, parsed.netloc)
    if externalLinks:
        return random.choice(externalLinks)

    # Nothing external here: move to a random page of the same site and
    # look again.
    print("No external links, looking around the site for one")
    domain = parsed.scheme+"://"+parsed.netloc
    internalLinks = getInternalLinks(bsObj, domain)
    if not internalLinks:
        # Same exception type that randint(0, -1) raised here before, but
        # with a message that says what went wrong.
        raise ValueError("No links to follow on " + startingPage)
    return getRandomExternalLink(random.choice(internalLinks))
    
def followExternalOnly(startingSite, maxHops=None):
    """Hop from site to site by following random external links.

    Args:
        startingSite: Absolute URL of the page to start from.
        maxHops: Optional number of hops after which to stop. The default,
            None, keeps hopping until interrupted or an error is raised.
    """
    hops = 0
    # A loop instead of self-recursion: every hop used to add a stack frame,
    # so a long walk would end at the interpreter's recursion limit.
    while maxHops is None or hops < maxHops:
        startingSite = getRandomExternalLink(startingSite)
        print("Random external link is: "+startingSite)
        hops += 1
            
'''
End of code that is same as part 4.
'''        
        
      
# Collects a list of all external URLs found on the site and makes a note of it
# External and internal links seen so far, shared across recursive calls
allExtLinks = set()
allIntLinks = set()


def getAllExternalLinks(siteUrl):
    """Print every external link found on a site, crawling it page by page.

    Each new external link is printed and stored in ``allExtLinks``. Each
    new internal link is stored in ``allIntLinks`` and crawled recursively.

    Args:
        siteUrl: Absolute URL of the page to crawl.
    """
    html = urlopen(siteUrl)
    try:
        # The response is fully read during parsing; close it right after.
        bsObj = BeautifulSoup(html, "html.parser")
    finally:
        html.close()

    parsed = urlparse(siteUrl)
    domain = parsed.scheme + "://" + parsed.netloc

    # Get list of internal and external links
    internalLinks = getInternalLinks(bsObj, domain)
    # Exclude by host name alone, as getRandomExternalLink does; excluding
    # scheme + host would report the site's own https:// or www. links as
    # external.
    externalLinks = getExternalLinks(bsObj, parsed.netloc)

    # Print external links not reported before
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)

    # Crawl internal pages not visited before
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)


# Pick random external link
# NOTE(review): followExternalOnly (defined above) keeps following links with
# no stop condition, so control is not expected to reach the statements below.
followExternalOnly("http://google.com")

# Mark the start page as already seen, so a link back to it is not crawled again
allIntLinks.add("http://google.com")

# Print all external links
getAllExternalLinks("http://google.com")

Random external link is: http://www.youtube.com/?tab=w1
Random external link is: http://bit.ly/RL_Store
Random external link is: https://www.facebook.com/DFTBArecords


KeyboardInterrupt: 