# Page Rank - Spidering 

In [1]:
# first we import all libraries to be used in our code
import sqlite3
import urllib.error
import ssl
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [2]:
# Ignore SSL certificate errors
# Start from the default client context, then switch off hostname checking and
# certificate verification so that pages served with invalid or self-signed
# certificates can still be downloaded by urlopen(..., context=ctx).
# Order matters: check_hostname has to be disabled before verify_mode is set to
# CERT_NONE, otherwise the ssl module rejects the assignment with ValueError.
# NOTE: this disables protection against man-in-the-middle attacks; acceptable
# for a read-only crawler, not for anything that sends credentials.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

In [3]:
# Open the crawler database. sqlite3.connect() creates spider.sqlite in the
# current working directory if the file does not exist yet.
# conn is used for commits, cur for every SQL statement in the cells below.
conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

In [4]:
# Create the three tables used by the crawler: Pages, Links and Webs.
# IF NOT EXISTS makes this cell safe to re-run against an existing database
# (tables that already exist are left exactly as they are).

# Pages: one row per known URL. html stays NULL until the page has been
# fetched; error receives the HTTP status (or -1) when retrieval fails;
# new_rank is initialised to 1.0 when a page is inserted.
cur.execute('''CREATE TABLE IF NOT EXISTS Pages
    (id INTEGER PRIMARY KEY,
    url TEXT UNIQUE,
    html TEXT,
    error INTEGER,
    old_rank REAL,
    new_rank REAL)''')

# Links: one row per directed edge between two Pages ids.
# The UNIQUE constraint is what lets "INSERT OR IGNORE INTO Links" drop a
# repeated (from_id, to_id) pair; without it every duplicate anchor on a page
# would be stored as another row.
cur.execute('''CREATE TABLE IF NOT EXISTS Links
    (from_id INTEGER,
    to_id INTEGER,
    UNIQUE(from_id, to_id))''')

# Webs: the URL prefixes the crawl is restricted to.
cur.execute('''CREATE TABLE IF NOT EXISTS Webs
    (url TEXT UNIQUE)''')

<sqlite3.Cursor at 0x8af4fd4a40>

In [5]:
# Check whether a crawl is already in progress: look for one page that has been
# discovered but not yet fetched (html and error are both NULL).
# row is a (id, url) tuple when such a page exists, or None when nothing is queued.
cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
row = cur.fetchone()

In [7]:
# Resume an existing crawl or seed a new one.
# If the previous cell found an unretrieved page (row is not None) the existing
# database is reused as is. To crawl a different website, delete spider.sqlite
# and run the notebook again.
# Otherwise ask for a start URL, register its site prefix in Webs and queue the
# start page in Pages.
if row is not None:
    print("Restarting existing crawl.  Remove spider.sqlite to start a fresh crawl.")
else:
    # strip() so that stray whitespace from a pasted URL is not stored verbatim
    # (it would make the fetch and the later prefix matching fail).
    starturl = input('Enter web url or enter: ').strip()
    if len(starturl) < 1:
        starturl = 'http://www.dr-chuck.com/'
    # URLs are stored without a trailing slash.
    if starturl.endswith('/'):
        starturl = starturl[:-1]

    # The crawl is limited to URLs that start with `web`. When the start URL
    # names a document, use its containing directory as the prefix.
    web = starturl
    if starturl.endswith(('.htm', '.html')):
        web = starturl[:starturl.rfind('/')]

    if len(web) > 1:
        cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? )', (web,))
        cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', (starturl,))
        conn.commit()

Enter web url or enter: http://python-data.dr-chuck.net


In [8]:
# Load every URL prefix stored in the Webs table into a plain list of strings;
# the crawl loop only follows links that start with one of these prefixes.
cur.execute('''SELECT url FROM Webs''')
webs = [str(record[0]) for record in cur]

print(webs)

['http://python-data.dr-chuck.net']


In [9]:
# Crawl loop.
# Repeatedly asks how many unretrieved pages to fetch (empty input stops).
# For each page it picks a random unretrieved row from Pages, downloads it,
# stores the HTML, queues every in-scope link found on it in Pages and records
# the edge in Links. One line is printed per page:
#   <page id> <url> (<size in bytes>) <number of in-scope links found>
many = 0
while True:
    if many < 1:
        sval = input('How many pages:')
        if len(sval) < 1:
            break
        try:
            many = int(sval)
        except ValueError:
            print('Please enter a whole number, or just press enter to stop')
            continue
        # Zero or a negative number means "nothing to do": ask again instead of
        # silently crawling one page.
        if many < 1:
            continue
    many = many - 1

    cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
    row = cur.fetchone()
    if row is None:
        print('No unretrieved HTML pages found')
        many = 0
        break
    fromid, url = row

    print(fromid, url, end=' ')

    # If we are retrieving this page, there should be no links from it
    cur.execute('DELETE from Links WHERE from_id=?', (fromid,))
    try:
        # The with-block closes the HTTP response as soon as it has been read.
        with urlopen(url, context=ctx) as document:
            html = document.read()
            status = document.getcode()
            content_type = document.info().get_content_type()

        if status != 200:
            print("Error on page: ", status)
            cur.execute('UPDATE Pages SET error=? WHERE url=?', (status, url))

        if content_type != 'text/html':
            # Non-HTML resources are dropped from the queue entirely.
            print("Ignore non text/html page")
            cur.execute('DELETE FROM Pages WHERE url=?', (url,))
            conn.commit()
            continue

        print('(' + str(len(html)) + ')', end=' ')

        soup = BeautifulSoup(html, "html.parser")
    except KeyboardInterrupt:
        print('')
        print('Program interrupted by user...')
        break
    except Exception:
        # Any download/parse failure marks the page with error=-1 so that it is
        # not selected again by the "html is NULL and error is NULL" query.
        print("Unable to retrieve or parse page")
        cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url,))
        conn.commit()
        continue

    cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', (url,))
    cur.execute('UPDATE Pages SET html=? WHERE url=?', (memoryview(html), url))
    conn.commit()

    # Retrieve all of the anchor tags
    count = 0
    for tag in soup('a'):
        href = tag.get('href', None)
        if href is None:
            continue

        # Resolve relative references like href="/contact".
        # urlparse/urljoin raise ValueError on malformed hrefs (for example a
        # broken IPv6 literal); skip such a link instead of aborting the crawl.
        try:
            if len(urlparse(href).scheme) < 1:
                href = urljoin(url, href)
        except ValueError:
            continue

        # Drop the fragment, skip images, and normalise away a trailing slash.
        ipos = href.find('#')
        if ipos > 1:
            href = href[:ipos]
        if href.endswith(('.png', '.jpg', '.gif')):
            continue
        if href.endswith('/'):
            href = href[:-1]
        if len(href) < 1:
            continue

        # Only follow links that stay inside one of the registered webs.
        if not any(href.startswith(web) for web in webs):
            continue

        cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', (href,))
        count = count + 1

        cur.execute('SELECT id FROM Pages WHERE url=? LIMIT 1', (href,))
        row = cur.fetchone()
        if row is None:
            print('Could not retrieve id')
            continue
        toid = row[0]
        cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? )', (fromid, toid))

    # Commit once per page, after the Links rows have been written, so that no
    # link is left behind in an open transaction.
    conn.commit()

    print(count)

# Flush anything still pending (for example the Links cleanup of a page whose
# download was interrupted) so the database is not left with an open
# transaction, then release the cursor.
conn.commit()
cur.close()


How many pages:1
1 http://python-data.dr-chuck.net (1394) 6
How many pages:2
4 http://python-data.dr-chuck.net/comments_42.html (3521) 0
6 http://python-data.dr-chuck.net/comments_42.json Ignore non text/html page
How many pages:5
2 http://python-data.dr-chuck.net/geojson Ignore non text/html page
5 http://python-data.dr-chuck.net/comments_42.xml Ignore non text/html page
7 http://python-data.dr-chuck.net/known_by_42.html (12021) 100
105 http://python-data.dr-chuck.net/known_by_Tammara.html (12019) 100
62 http://python-data.dr-chuck.net/known_by_Daksh.html (12108) 100
How many pages:30
164 http://python-data.dr-chuck.net/known_by_Mercy.html (12047) 100
290 http://python-data.dr-chuck.net/known_by_Enis.html (12025) 100
101 http://python-data.dr-chuck.net/known_by_Khalan.html (12032) 100
510 http://python-data.dr-chuck.net/known_by_Murray.html (12053) 100
31 http://python-data.dr-chuck.net/known_by_Todd.html (12016) 100
396 http://python-data.dr-chuck.net/known_by_Tokunbo.html (12071) 10

In [None]:
# END